Coverage report: /home/ellis/comp/ext/cl-ppcre/lexer.lisp

Kind	Covered	All	%
expression	263	689	38.2
branch	34	92	37.0

Key

Not instrumented

Conditionalized out

Executed

Not executed

Both branches taken

One branch taken

Neither branch taken

1

;;; $Header: /usr/local/cvsrep/cl-ppcre/lexer.lisp,v 1.35 2009/09/17 19:17:31 edi Exp $

2

3

;;; The lexer's responsibility is to convert the regex string into a

4

;;; sequence of tokens which are in turn consumed by the parser.

5

;;;

6

;;; The lexer is aware of Perl's 'extended mode' and it also 'knows'

7

;;; (with a little help from the parser) how many register groups it

8

;;; has opened so far. (The latter is necessary for interpreting

9

;;; strings like "\\10" correctly.)

10

11

12

13

;;; Redistribution and use in source and binary forms, with or without

14

;;; modification, are permitted provided that the following conditions

15

;;; are met:

16

17

;;; * Redistributions of source code must retain the above copyright

18

;;; notice, this list of conditions and the following disclaimer.

19

20

;;; * Redistributions in binary form must reproduce the above

21

;;; copyright notice, this list of conditions and the following

22

;;; disclaimer in the documentation and/or other materials

23

;;; provided with the distribution.

24

25

;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED

26

;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED

27

;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

28

;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY

29

;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL

30

;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE

31

;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

32

;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,

33

;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING

34

;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

35

;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

36

37

(in-package :cl-ppcre)

38

39

(declaim (inline map-char-to-special-char-class))

40

(defun map-char-to-special-char-class (chr)

41

(declare #.*standard-optimize-settings*)

42

"Maps escaped characters like \"\\d\" to the tokens which represent

43

their associated character classes."

44

(case chr

45

((#\d)

46

:digit-class)

47

((#\D)

48

:non-digit-class)

49

((#\w)

50

:word-char-class)

51

((#\W)

52

:non-word-char-class)

53

((#\s)

54

:whitespace-char-class)

55

((#\S)

56

:non-whitespace-char-class)))

57

58

(declaim (inline make-lexer-internal))

59

(defstruct (lexer (:constructor make-lexer-internal))

60

"LEXER structures are used to hold the regex string which is

61

currently lexed and to keep track of the lexer's state."

62

(str "" :type string :read-only t)

63

(len 0 :type fixnum :read-only t)

64

(reg 0 :type fixnum)

65

(pos 0 :type fixnum)

66

(last-pos nil :type list))

67

68

(defun make-lexer (string)

69

(declare #-:genera (string string))

70

(make-lexer-internal :str (maybe-coerce-to-simple-string string)

71

:len (length string)))

72

73

(declaim (inline end-of-string-p))

74

(defun end-of-string-p (lexer)

75

(declare #.*standard-optimize-settings*)

76

"Tests whether we're at the end of the regex string."

77

(<= (lexer-len lexer)

78

(lexer-pos lexer)))

79

80

(declaim (inline looking-at-p))

81

(defun looking-at-p (lexer chr)

82

(declare #.*standard-optimize-settings*)

83

"Tests whether the next character the lexer would see is CHR.

84

Does not respect extended mode."

85

(and (not (end-of-string-p lexer))

86

(char= (schar (lexer-str lexer) (lexer-pos lexer))

87

chr)))

88

89

(declaim (inline next-char-non-extended))

90

(defun next-char-non-extended (lexer)

91

(declare #.*standard-optimize-settings*)

92

"Returns the next character which is to be examined and updates the

93

POS slot. Does not respect extended mode."

94

(cond ((end-of-string-p lexer) nil)

95

(t (prog1

96

(schar (lexer-str lexer) (lexer-pos lexer))

97

(incf (lexer-pos lexer))))))

98

99

(defun next-char (lexer)

100

(declare #.*standard-optimize-settings*)

101

"Returns the next character which is to be examined and updates the

102

POS slot. Respects extended mode, i.e. whitespace, comments, and also

103

nested comments are skipped if applicable."

104

(let ((next-char (next-char-non-extended lexer))

105

last-loop-pos)

106

(loop

107

;; remember where we started

108

(setq last-loop-pos (lexer-pos lexer))

109

;; first we look for nested comments like (?#foo)

110

(when (and next-char

111

(char= next-char #\()

112

(looking-at-p lexer #\?))

113

(incf (lexer-pos lexer))

114

(cond ((looking-at-p lexer #\#)

115

;; must be a nested comment - so we have to search for

116

;; the closing parenthesis

117

(let ((error-pos (- (lexer-pos lexer) 2)))

118

(unless

119

;; loop 'til ')' or end of regex string and

120

;; return NIL if ')' wasn't encountered

121

(loop for skip-char = next-char

122

then (next-char-non-extended lexer)

123

while (and skip-char

124

(char/= skip-char #\)))

125

finally (return skip-char))

126

(signal-syntax-error* error-pos "Comment group not closed.")))

127

(setq next-char (next-char-non-extended lexer)))

128

(t

129

;; undo effect of previous INCF if we didn't see a #

130

(decf (lexer-pos lexer)))))

131

(when *extended-mode-p*

132

;; now - if we're in extended mode - we skip whitespace and

133

;; comments; repeat the following loop while we look at

134

;; whitespace or #\#

135

(loop while (and next-char

136

(or (char= next-char #\#)

137

(whitespacep next-char)))

138

do (setq next-char

139

(if (char= next-char #\#)

140

;; if we saw a comment marker skip until

141

;; we're behind #\Newline...

142

(loop for skip-char = next-char

143

then (next-char-non-extended lexer)

144

while (and skip-char

145

                                             (char/= skip-char #\Newline))

146

finally (return (next-char-non-extended lexer)))

147

;; ...otherwise (whitespace) skip until we

148

;; see the next non-whitespace character

149

(loop for skip-char = next-char

150

then (next-char-non-extended lexer)

151

while (and skip-char

152

(whitespacep skip-char))

153

finally (return skip-char))))))

154

;; if the position has moved we have to repeat our tests

155

;; because of cases like /^a (?#xxx) (?#yyy) {3}c/x which

156

;; would be equivalent to /^a{3}c/ in Perl

157

(unless (> (lexer-pos lexer) last-loop-pos)

158

(return next-char)))))

159

160

(declaim (inline fail))

161

(defun fail (lexer)

162

(declare #.*standard-optimize-settings*)

163

"Moves (LEXER-POS LEXER) back to the last position stored in

164

\(LEXER-LAST-POS LEXER) and pops the LAST-POS stack."

165

(unless (lexer-last-pos lexer)

166

(signal-syntax-error "LAST-POS stack of LEXER ~A is empty." lexer))

167

(setf (lexer-pos lexer) (pop (lexer-last-pos lexer)))

168

nil)

169

170

(defun get-number (lexer &key (radix 10) max-length no-whitespace-p)

171

(declare #.*standard-optimize-settings*)

172

"Read and consume the number the lexer is currently looking at and

173

return it. Returns NIL if no number could be identified.

174

RADIX is used as in PARSE-INTEGER. If MAX-LENGTH is not NIL we'll read

175

at most the next MAX-LENGTH characters. If NO-WHITESPACE-P is not NIL

176

we don't tolerate whitespace in front of the number."

177

(when (or (end-of-string-p lexer)

178

(and no-whitespace-p

179

(whitespacep (schar (lexer-str lexer) (lexer-pos lexer)))))

180

(return-from get-number nil))

181

(multiple-value-bind (integer new-pos)

182

(parse-integer (lexer-str lexer)

183

:start (lexer-pos lexer)

184

:end (if max-length

185

(let ((end-pos (+ (lexer-pos lexer)

186

                                               (the fixnum max-length)))

187

(lexer-len (lexer-len lexer)))

188

(if (< end-pos lexer-len)

189

end-pos

190

lexer-len))

191

(lexer-len lexer))

192

:radix radix

193

:junk-allowed t)

194

(cond ((and integer (>= (the fixnum integer) 0))

195

(setf (lexer-pos lexer) new-pos)

196

integer)

197

(t nil))))

198

199

(declaim (inline try-number))

200

(defun try-number (lexer &key (radix 10) max-length no-whitespace-p)

201

(declare #.*standard-optimize-settings*)

202

"Like GET-NUMBER but won't consume anything if no number is seen."

203

;; remember current position

204

(push (lexer-pos lexer) (lexer-last-pos lexer))

205

(let ((number (get-number lexer

206

:radix radix

207

:max-length max-length

208

:no-whitespace-p no-whitespace-p)))

209

(or number (fail lexer))))

210

211

(declaim (inline make-char-from-code))

212

(defun make-char-from-code (number error-pos)

213

(declare #.*standard-optimize-settings*)

214

"Create character from char-code NUMBER. NUMBER can be NIL

215

which is interpreted as 0. ERROR-POS is the position where

216

the corresponding number started within the regex string."

217

;; only look at rightmost eight bits in compliance with Perl

218

(let ((code (logand #o377 (the fixnum (or number 0)))))

219

(or (and (< code char-code-limit)

220

(code-char code))

221

(signal-syntax-error* error-pos "No character for hex-code ~X." number))))

222

223

(defun unescape-char (lexer)

224

(declare #.*standard-optimize-settings*)

225

"Convert the characters\(s) following a backslash into a token

226

which is returned. This function is to be called when the backslash

227

has already been consumed. Special character classes like \\W are

228

handled elsewhere."

229

(when (end-of-string-p lexer)

230

(signal-syntax-error "String ends with backslash."))

231

(let ((chr (next-char-non-extended lexer)))

232

(case chr

233

((#\E)

234

;; if \Q quoting is on this is ignored, otherwise it's just an

235

;; #\E

236

(if *allow-quoting*

237

:void

238

#\E))

239

((#\c)

240

;; \cx means control-x in Perl

241

(let ((next-char (next-char-non-extended lexer)))

242

(unless next-char

243

(signal-syntax-error* (lexer-pos lexer) "Character missing after '\\c'"))

244

(code-char (logxor #x40 (char-code (char-upcase next-char))))))

245

((#\x)

246

;; \x should be followed by a hexadecimal char code,

247

;; two digits or less

248

(let* ((error-pos (lexer-pos lexer))

249

(number (get-number lexer :radix 16 :max-length 2 :no-whitespace-p t)))

250

;; note that it is OK if \x is followed by zero digits

251

(make-char-from-code number error-pos)))

252

((#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9)

253

;; \x should be followed by an octal char code,

254

;; three digits or less

255

(let* ((error-pos (decf (lexer-pos lexer)))

256

(number (get-number lexer :radix 8 :max-length 3)))

257

(make-char-from-code number error-pos)))

258

;; the following five character names are 'semi-standard'

259

;; according to the CLHS but I'm not aware of any implementation

260

;; that doesn't implement them

261

((#\t)

262

#\Tab)

263

((#\n)

264

#\Newline)

265

((#\r)

266

#\Return)

267

((#\f)

268

#\Page)

269

((#\b)

270

#\Backspace)

271

((#\a)

272

(code-char 7)) ; ASCII bell

273

((#\e)

274

(code-char 27)) ; ASCII escape

275

(otherwise

276

;; all other characters aren't affected by a backslash

277

chr))))

278

279

(defun read-char-property (lexer first-char)

280

(declare #.*standard-optimize-settings*)

281

(unless (eql (next-char-non-extended lexer) #\{)

282

(signal-syntax-error* (lexer-pos lexer) "Expected left brace after \\~A." first-char))

283

(let ((name (with-output-to-string (out nil :element-type 'character)

284

(loop

285

(let ((char (or (next-char-non-extended lexer)

286

                                    (signal-syntax-error "Unexpected EOF after \\~A{." first-char))))

287

(when (char= char #\})

288

(return))

289

(write-char char out))))))

290

(list (if (char= first-char #\p) :property :inverted-property)

291

name)))

292

293

(defun collect-char-class (lexer)

294

"Reads and consumes characters from regex string until a right

295

bracket is seen. Assembles them into a list \(which is returned) of

296

characters, character ranges, like \(:RANGE #\\A #\\E) for a-e, and

297

tokens representing special character classes."

298

(declare #.*standard-optimize-settings*)

299

(let ((start-pos (lexer-pos lexer)) ; remember start for error message

300

hyphen-seen

301

last-char

302

list)

303

(flet ((handle-char (c)

304

"Do the right thing with character C depending on whether

305

we're inside a range or not."

306

(cond ((and hyphen-seen last-char)

307

(setf (car list) (list :range last-char c)

308

last-char nil))

309

(t

310

(push c list)

311

(setq last-char c)))

312

(setq hyphen-seen nil)))

313

(loop for first = t then nil

314

for c = (next-char-non-extended lexer)

315

;; leave loop if at end of string

316

while c

317

do (cond

318

((char= c #\\)

319

;; we've seen a backslash

320

(let ((next-char (next-char-non-extended lexer)))

321

(case next-char

322

((#\d #\D #\w #\W #\s #\S)

323

;; a special character class

324

(push (map-char-to-special-char-class next-char) list)

325

;; if the last character was a hyphen

326

;; just collect it literally

327

(when hyphen-seen

328

(push #\- list))

329

;; if the next character is a hyphen do the same

330

(when (looking-at-p lexer #\-)

331

(push #\- list)

332

(incf (lexer-pos lexer)))

333

(setq hyphen-seen nil))

334

((#\P #\p)

335

;; maybe a character property

336

(cond ((null *property-resolver*)

337

(handle-char next-char))

338

(t

339

(push (read-char-property lexer next-char) list)

340

;; if the last character was a hyphen

341

;; just collect it literally

342

(when hyphen-seen

343

(push #\- list))

344

;; if the next character is a hyphen do the same

345

(when (looking-at-p lexer #\-)

346

(push #\- list)

347

(incf (lexer-pos lexer)))

348

(setq hyphen-seen nil))))

349

((#\E)

350

;; if \Q quoting is on we ignore \E,

351

;; otherwise it's just a plain #\E

352

(unless *allow-quoting*

353

(handle-char #\E)))

354

(otherwise

355

;; otherwise unescape the following character(s)

356

(decf (lexer-pos lexer))

357

(handle-char (unescape-char lexer))))))

358

(first

359

;; the first character must not be a right bracket

360

;; and isn't treated specially if it's a hyphen

361

(handle-char c))

362

((char= c #\])

363

;; end of character class

364

;; make sure we collect a pending hyphen

365

(when hyphen-seen

366

(setq hyphen-seen nil)

367

(handle-char #\-))

368

;; reverse the list to preserve the order intended

369

;; by the author of the regex string

370

(return-from collect-char-class (nreverse list)))

371

((and (char= c #\-)

372

last-char

373

(not hyphen-seen))

374

;; if the last character was 'just a character'

375

;; we expect to be in the middle of a range

376

(setq hyphen-seen t))

377

((char= c #\-)

378

;; otherwise this is just an ordinary hyphen

379

(handle-char #\-))

380

(t

381

;; default case - just collect the character

382

(handle-char c))))

383

;; we can only exit the loop normally if we've reached the end

384

;; of the regex string without seeing a right bracket

385

(signal-syntax-error* start-pos "Missing right bracket to close character class."))))

386

387

(defun maybe-parse-flags (lexer)

388

(declare #.*standard-optimize-settings*)

389

"Reads a sequence of modifiers \(including #\\- to reverse their

390

meaning) and returns a corresponding list of \"flag\" tokens. The

391

\"x\" modifier is treated specially in that it dynamically modifies

392

the behaviour of the lexer itself via the special variable

393

*EXTENDED-MODE-P*."

394

(prog1

395

(loop with set = t

396

for chr = (next-char-non-extended lexer)

397

unless chr

398

do (signal-syntax-error "Unexpected end of string.")

399

while (find chr "-imsx" :test #'char=)

400

;; the first #\- will invert the meaning of all modifiers

401

;; following it

402

if (char= chr #\-)

403

do (setq set nil)

404

else if (char= chr #\x)

405

do (setq *extended-mode-p* set)

406

else collect (if set

407

(case chr

408

((#\i)

409

:case-insensitive-p)

410

((#\m)

411

:multi-line-mode-p)

412

((#\s)

413

:single-line-mode-p))

414

(case chr

415

((#\i)

416

:case-sensitive-p)

417

((#\m)

418

:not-multi-line-mode-p)

419

((#\s)

420

:not-single-line-mode-p))))

421

(decf (lexer-pos lexer))))

422

423

(defun get-quantifier (lexer)

424

(declare #.*standard-optimize-settings*)

425

"Returns a list of two values (min max) if what the lexer is looking

426

at can be interpreted as a quantifier. Otherwise returns NIL and

427

resets the lexer to its old position."

428

;; remember starting position for FAIL and UNGET-TOKEN functions

429

(push (lexer-pos lexer) (lexer-last-pos lexer))

430

(let ((next-char (next-char lexer)))

431

(case next-char

432

((#\*)

433

;; * (Kleene star): match 0 or more times

434

'(0 nil))

435

((#\+)

436

;; +: match 1 or more times

437

'(1 nil))

438

((#\?)

439

;; ?: match 0 or 1 times

440

'(0 1))

441

((#\{)

442

;; one of

443

;; {n}: match exactly n times

444

;; {n,}: match at least n times

445

;; {n,m}: match at least n but not more than m times

446

;; note that anything not matching one of these patterns will

447

;; be interpreted literally - even whitespace isn't allowed

448

(let ((num1 (get-number lexer :no-whitespace-p t)))

449

(if num1

450

(let ((next-char (next-char-non-extended lexer)))

451

(case next-char

452

((#\,)

453

(let* ((num2 (get-number lexer :no-whitespace-p t))

454

(next-char (next-char-non-extended lexer)))

455

(case next-char

456

((#\})

457

;; this is the case {n,} (NUM2 is NIL) or {n,m}

458

(list num1 num2))

459

(otherwise

460

(fail lexer)))))

461

((#\})

462

;; this is the case {n}

463

(list num1 num1))

464

(otherwise

465

(fail lexer))))

466

;; no number following left curly brace, so we treat it

467

;; like a normal character

468

(fail lexer))))

469

;; cannot be a quantifier

470

(otherwise

471

(fail lexer)))))

472

473

(defun parse-register-name-aux (lexer)

474

"Reads and returns the name in a named register group. It is

475

assumed that the starting #\< character has already been read. The

476

closing #\> will also be consumed."

477

;; we have to look for an ending > character now

478

(let ((end-name (position #\>

479

(lexer-str lexer)

480

:start (lexer-pos lexer)

481

:test #'char=)))

482

(unless end-name

483

;; there has to be > somewhere, syntax error otherwise

484

       (signal-syntax-error* (1- (lexer-pos lexer)) "Opening #\< in named group has no closing #\>."))

485

(let ((name (subseq (lexer-str lexer)

486

(lexer-pos lexer)

487

end-name)))

488

(unless (every #'(lambda (char)

489

(or (alphanumericp char)

490

(char= #\- char)))

491

name)

492

;; register name can contain only alphanumeric characters or #\-

493

(signal-syntax-error* (lexer-pos lexer) "Invalid character in named register group."))

494

;; advance lexer beyond "<name>" part

495

(setf (lexer-pos lexer) (1+ end-name))

496

name)))

497

498

(declaim (inline unget-token))

499

(defun unget-token (lexer)

500

(declare #.*standard-optimize-settings*)

501

"Moves the lexer back to the last position stored in the LAST-POS stack."

502

(if (lexer-last-pos lexer)

503

(setf (lexer-pos lexer)

504

(pop (lexer-last-pos lexer)))

505

(error "No token to unget \(this should not happen)")))

506

507

(defun get-token (lexer)

508

(declare #.*standard-optimize-settings*)

509

"Returns and consumes the next token from the regex string \(or NIL)."

510

;; remember starting position for UNGET-TOKEN function

511

(push (lexer-pos lexer)

512

(lexer-last-pos lexer))

513

(let ((next-char (next-char lexer)))

514

(cond (next-char

515

(case next-char

516

;; the easy cases first - the following six characters

517

;; always have a special meaning and get translated

518

;; into tokens immediately

519

((#\))

520

:close-paren)

521

((#\|)

522

:vertical-bar)

523

((#\?)

524

:question-mark)

525

((#\.)

526

:everything)

527

((#\^)

528

:start-anchor)

529

((#\$)

530

:end-anchor)

531

((#\+ #\*)

532

;; quantifiers will always be consumend by

533

;; GET-QUANTIFIER, they must not appear here

534

               (signal-syntax-error* (1- (lexer-pos lexer)) "Quantifier '~A' not allowed." next-char))

535

((#\{)

536

;; left brace isn't a special character in it's own

537

;; right but we must check if what follows might

538

;; look like a quantifier

539

(let ((this-pos (lexer-pos lexer))

540

(this-last-pos (lexer-last-pos lexer)))

541

(unget-token lexer)

542

(when (get-quantifier lexer)

543

(signal-syntax-error* (car this-last-pos)

544

"Quantifier '~A' not allowed."

545

(subseq (lexer-str lexer)

546

(car this-last-pos)

547

(lexer-pos lexer))))

548

(setf (lexer-pos lexer) this-pos

549

(lexer-last-pos lexer) this-last-pos)

550

next-char))

551

((#\[)

552

;; left bracket always starts a character class

553

(cons (cond ((looking-at-p lexer #\^)

554

(incf (lexer-pos lexer))

555

:inverted-char-class)

556

(t

557

:char-class))

558

(collect-char-class lexer)))

559

((#\\)

560

;; backslash might mean different things so we have

561

;; to peek one char ahead:

562

(let ((next-char (next-char-non-extended lexer)))

563

(case next-char

564

((#\A)

565

:modeless-start-anchor)

566

((#\Z)

567

:modeless-end-anchor)

568

((#\z)

569

:modeless-end-anchor-no-newline)

570

((#\b)

571

:word-boundary)

572

((#\B)

573

:non-word-boundary)

574

((#\k)

575

(cond ((and *allow-named-registers*

576

(looking-at-p lexer #\<))

577

;; back-referencing a named register

578

(incf (lexer-pos lexer))

579

(list :back-reference

580

(parse-register-name-aux lexer)))

581

(t

582

;; false alarm, just unescape \k

583

#\k)))

584

((#\d #\D #\w #\W #\s #\S)

585

;; these will be treated like character classes

586

(map-char-to-special-char-class next-char))

587

((#\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9)

588

;; uh, a digit...

589

(let* ((old-pos (decf (lexer-pos lexer)))

590

;; ...so let's get the whole number first

591

(backref-number (get-number lexer)))

592

(declare (fixnum backref-number))

593

(cond ((and (> backref-number (lexer-reg lexer))

594

(<= 10 backref-number))

595

;; \10 and higher are treated as octal

596

;; character codes if we haven't

597

;; opened that much register groups

598

;; yet

599

(setf (lexer-pos lexer) old-pos)

600

;; re-read the number from the old

601

;; position and convert it to its

602

;; corresponding character

603

                             (make-char-from-code (get-number lexer :radix 8 :max-length 3)

604

old-pos))

605

(t

606

;; otherwise this must refer to a

607

;; backreference

608

(list :back-reference backref-number)))))

609

((#\0)

610

;; this always means an octal character code

611

;; (at most three digits)

612

(let ((old-pos (decf (lexer-pos lexer))))

613

(make-char-from-code (get-number lexer :radix 8 :max-length 3)

614

old-pos)))

615

((#\P #\p)

616

;; might be a named property

617

(cond (*property-resolver* (read-char-property lexer next-char))

618

(t next-char)))

619

(otherwise

620

;; in all other cases just unescape the

621

;; character

622

(decf (lexer-pos lexer))

623

(unescape-char lexer)))))

624

((#\()

625

;; an open parenthesis might mean different things

626

;; depending on what follows...

627

(cond ((looking-at-p lexer #\?)

628

;; this is the case '(?' (and probably more behind)

629

(incf (lexer-pos lexer))

630

;; we have to check for modifiers first

631

;; because a colon might follow

632

(let* ((flags (maybe-parse-flags lexer))

633

(next-char (next-char-non-extended lexer)))

634

;; modifiers are only allowed if a colon

635

;; or a closing parenthesis are following

636

(when (and flags

637

(not (find next-char ":)" :test #'char=)))

638

(signal-syntax-error* (car (lexer-last-pos lexer))

639

                                                "Sequence '~A' not recognized."

640

                                                (subseq (lexer-str lexer)

641

                                                        (car (lexer-last-pos lexer))

642

                                                        (lexer-pos lexer))))

643

(case next-char

644

((nil)

645

;; syntax error

646

(signal-syntax-error "End of string following '(?'."))

647

((#\))

648

;; an empty group except for the flags

649

;; (if there are any)

650

(or (and flags

651

(cons :flags flags))

652

:void))

653

((#\()

654

;; branch

655

:open-paren-paren)

656

((#\>)

657

;; standalone

658

:open-paren-greater)

659

((#\=)

660

;; positive look-ahead

661

:open-paren-equal)

662

((#\!)

663

;; negative look-ahead

664

:open-paren-exclamation)

665

((#\:)

666

;; non-capturing group - return flags as

667

;; second value

668

(values :open-paren-colon flags))

669

((#\<)

670

;; might be a look-behind assertion or a named group, so

671

;; check next character

672

(let ((next-char (next-char-non-extended lexer)))

673

(cond ((and next-char

674

(alpha-char-p next-char))

675

;; we have encountered a named group

676

;; are we supporting register naming?

677

(unless *allow-named-registers*

678

                                      (signal-syntax-error* (1- (lexer-pos lexer))

679

                                                            "Character '~A' may not follow '(?<' (because ~a = NIL)"

680

                                                            next-char

681

                                                            '*allow-named-registers*))

682

;; put the letter back

683

(decf (lexer-pos lexer))

684

;; named group

685

:open-paren-less-letter)

686

(t

687

(case next-char

688

((#\=)

689

;; positive look-behind

690

:open-paren-less-equal)

691

((#\!)

692

;; negative look-behind

693

:open-paren-less-exclamation)

694

((#\))

695

;; Perl allows "(?<)" and treats

696

;; it like a null string

697

:void)

698

((nil)

699

;; syntax error

700

                                       (signal-syntax-error "End of string following '(?<'."))

701

(t

702

;; also syntax error

703

                                       (signal-syntax-error* (1- (lexer-pos lexer))

704

                                                             "Character '~A' may not follow '(?<'."

705

                                                             next-char )))))))

706

(otherwise

707

(signal-syntax-error* (1- (lexer-pos lexer))

708

                                                 "Character '~A' may not follow '(?'."

709

next-char)))))

710

(t

711

;; if next-char was not #\? (this is within

712

;; the first COND), we've just seen an opening

713

;; parenthesis and leave it like that

714

:open-paren)))

715

(otherwise

716

;; all other characters are their own tokens

717

next-char)))

718

;; we didn't get a character (this if the "else" branch from

719

;; the first IF), so we don't return a token but NIL

720

(t

721

(pop (lexer-last-pos lexer))

722

nil))))

723

724

(declaim (inline start-of-subexpr-p))

725

(defun start-of-subexpr-p (lexer)

726

(declare #.*standard-optimize-settings*)

727

"Tests whether the next token can start a valid sub-expression, i.e.

728

a stand-alone regex."

729

(let* ((pos (lexer-pos lexer))

730

(next-char (next-char lexer)))

731

(not (or (null next-char)

732

(prog1

733

(member (the character next-char)

734

'(#\) #\|)

735

:test #'char=)

736

(setf (lexer-pos lexer) pos))))))