Coverage report: /home/ellis/comp/ext/cl-ppcre/lexer.lisp
Kind | Covered | All | % |
expression | 263 | 689 | 38.2 |
branch | 34 | 92 | 37.0 |
Key
Not instrumented
Conditionalized out
Executed
Not executed
Both branches taken
One branch taken
Neither branch taken
1
;;; $Header: /usr/local/cvsrep/cl-ppcre/lexer.lisp,v 1.35 2009/09/17 19:17:31 edi Exp $
3
;;; The lexer's responsibility is to convert the regex string into a
4
;;; sequence of tokens which are in turn consumed by the parser.
6
;;; The lexer is aware of Perl's 'extended mode' and it also 'knows'
7
;;; (with a little help from the parser) how many register groups it
8
;;; has opened so far. (The latter is necessary for interpreting
9
;;; strings like "\\10" correctly.)
11
;;; Copyright (c) 2002-2009, Dr. Edmund Weitz. All rights reserved.
13
;;; Redistribution and use in source and binary forms, with or without
14
;;; modification, are permitted provided that the following conditions
17
;;; * Redistributions of source code must retain the above copyright
18
;;; notice, this list of conditions and the following disclaimer.
20
;;; * Redistributions in binary form must reproduce the above
21
;;; copyright notice, this list of conditions and the following
22
;;; disclaimer in the documentation and/or other materials
23
;;; provided with the distribution.
25
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
26
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
27
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
29
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
31
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
33
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37
(in-package :cl-ppcre)
39
(declaim (inline map-char-to-special-char-class))
40
(defun map-char-to-special-char-class (chr)
41
(declare #.*standard-optimize-settings*)
42
"Maps escaped characters like \"\\d\" to the tokens which represent
43
their associated character classes."
54
:whitespace-char-class)
56
:non-whitespace-char-class)))
58
(declaim (inline make-lexer-internal))
59
(defstruct (lexer (:constructor make-lexer-internal))
60
"LEXER structures are used to hold the regex string which is
61
currently lexed and to keep track of the lexer's state."
62
(str "" :type string :read-only t)
63
(len 0 :type fixnum :read-only t)
66
(last-pos nil :type list))
68
(defun make-lexer (string)
69
(declare #-:genera (string string))
70
(make-lexer-internal :str (maybe-coerce-to-simple-string string)
71
:len (length string)))
73
(declaim (inline end-of-string-p))
74
(defun end-of-string-p (lexer)
75
(declare #.*standard-optimize-settings*)
76
"Tests whether we're at the end of the regex string."
80
(declaim (inline looking-at-p))
81
(defun looking-at-p (lexer chr)
82
(declare #.*standard-optimize-settings*)
83
"Tests whether the next character the lexer would see is CHR.
84
Does not respect extended mode."
85
(and (not (end-of-string-p lexer))
86
(char= (schar (lexer-str lexer) (lexer-pos lexer))
89
(declaim (inline next-char-non-extended))
90
(defun next-char-non-extended (lexer)
91
(declare #.*standard-optimize-settings*)
92
"Returns the next character which is to be examined and updates the
93
POS slot. Does not respect extended mode."
94
(cond ((end-of-string-p lexer) nil)
96
(schar (lexer-str lexer) (lexer-pos lexer))
97
(incf (lexer-pos lexer))))))
99
(defun next-char (lexer)
100
(declare #.*standard-optimize-settings*)
101
"Returns the next character which is to be examined and updates the
102
POS slot. Respects extended mode, i.e. whitespace, comments, and also
103
nested comments are skipped if applicable."
104
(let ((next-char (next-char-non-extended lexer))
107
;; remember where we started
108
(setq last-loop-pos (lexer-pos lexer))
109
;; first we look for nested comments like (?#foo)
111
(char= next-char #\()
112
(looking-at-p lexer #\?))
113
(incf (lexer-pos lexer))
114
(cond ((looking-at-p lexer #\#)
115
;; must be a nested comment - so we have to search for
116
;; the closing parenthesis
117
(let ((error-pos (- (lexer-pos lexer) 2)))
119
;; loop 'til ')' or end of regex string and
120
;; return NIL if ')' wasn't encountered
121
(loop for skip-char = next-char
122
then (next-char-non-extended lexer)
124
(char/= skip-char #\)))
125
finally (return skip-char))
126
(signal-syntax-error* error-pos "Comment group not closed.")))
127
(setq next-char (next-char-non-extended lexer)))
129
;; undo effect of previous INCF if we didn't see a #
130
(decf (lexer-pos lexer)))))
131
(when *extended-mode-p*
132
;; now - if we're in extended mode - we skip whitespace and
133
;; comments; repeat the following loop while we look at
135
(loop while (and next-char
136
(or (char= next-char #\#)
137
(whitespacep next-char)))
139
(if (char= next-char #\#)
140
;; if we saw a comment marker skip until
141
;; we're behind #\Newline...
142
(loop for skip-char = next-char
143
then (next-char-non-extended lexer)
145
(char/= skip-char #\Newline))
146
finally (return (next-char-non-extended lexer)))
147
;; ...otherwise (whitespace) skip until we
148
;; see the next non-whitespace character
149
(loop for skip-char = next-char
150
then (next-char-non-extended lexer)
152
(whitespacep skip-char))
153
finally (return skip-char))))))
154
;; if the position has moved we have to repeat our tests
155
;; because of cases like /^a (?#xxx) (?#yyy) {3}c/x which
156
;; would be equivalent to /^a{3}c/ in Perl
157
(unless (> (lexer-pos lexer) last-loop-pos)
158
(return next-char)))))
160
(declaim (inline fail))
162
(declare #.*standard-optimize-settings*)
163
"Moves (LEXER-POS LEXER) back to the last position stored in
164
\(LEXER-LAST-POS LEXER) and pops the LAST-POS stack."
165
(unless (lexer-last-pos lexer)
166
(signal-syntax-error "LAST-POS stack of LEXER ~A is empty." lexer))
167
(setf (lexer-pos lexer) (pop (lexer-last-pos lexer)))
170
(defun get-number (lexer &key (radix 10) max-length no-whitespace-p)
171
(declare #.*standard-optimize-settings*)
172
"Read and consume the number the lexer is currently looking at and
173
return it. Returns NIL if no number could be identified.
174
RADIX is used as in PARSE-INTEGER. If MAX-LENGTH is not NIL we'll read
175
at most the next MAX-LENGTH characters. If NO-WHITESPACE-P is not NIL
176
we don't tolerate whitespace in front of the number."
177
(when (or (end-of-string-p lexer)
179
(whitespacep (schar (lexer-str lexer) (lexer-pos lexer)))))
180
(return-from get-number nil))
181
(multiple-value-bind (integer new-pos)
182
(parse-integer (lexer-str lexer)
183
:start (lexer-pos lexer)
185
(let ((end-pos (+ (lexer-pos lexer)
186
(the fixnum max-length)))
187
(lexer-len (lexer-len lexer)))
188
(if (< end-pos lexer-len)
194
(cond ((and integer (>= (the fixnum integer) 0))
195
(setf (lexer-pos lexer) new-pos)
199
(declaim (inline try-number))
200
(defun try-number (lexer &key (radix 10) max-length no-whitespace-p)
201
(declare #.*standard-optimize-settings*)
202
"Like GET-NUMBER but won't consume anything if no number is seen."
203
;; remember current position
204
(push (lexer-pos lexer) (lexer-last-pos lexer))
205
(let ((number (get-number lexer
207
:max-length max-length
208
:no-whitespace-p no-whitespace-p)))
209
(or number (fail lexer))))
211
(declaim (inline make-char-from-code))
212
(defun make-char-from-code (number error-pos)
213
(declare #.*standard-optimize-settings*)
214
"Create character from char-code NUMBER. NUMBER can be NIL
215
which is interpreted as 0. ERROR-POS is the position where
216
the corresponding number started within the regex string."
217
;; only look at rightmost eight bits in compliance with Perl
218
(let ((code (logand #o377 (the fixnum (or number 0)))))
219
(or (and (< code char-code-limit)
221
(signal-syntax-error* error-pos "No character for hex-code ~X." number))))
223
(defun unescape-char (lexer)
224
(declare #.*standard-optimize-settings*)
225
"Convert the characters\(s) following a backslash into a token
226
which is returned. This function is to be called when the backslash
227
has already been consumed. Special character classes like \\W are
229
(when (end-of-string-p lexer)
230
(signal-syntax-error "String ends with backslash."))
231
(let ((chr (next-char-non-extended lexer)))
234
;; if \Q quoting is on this is ignored, otherwise it's just an
240
;; \cx means control-x in Perl
241
(let ((next-char (next-char-non-extended lexer)))
243
(signal-syntax-error* (lexer-pos lexer) "Character missing after '\\c'"))
244
(code-char (logxor #x40 (char-code (char-upcase next-char))))))
246
;; \x should be followed by a hexadecimal char code,
247
;; two digits or less
248
(let* ((error-pos (lexer-pos lexer))
249
(number (get-number lexer :radix 16 :max-length 2 :no-whitespace-p t)))
250
;; note that it is OK if \x is followed by zero digits
251
(make-char-from-code number error-pos)))
252
((#\0 #\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9)
253
;; \x should be followed by an octal char code,
254
;; three digits or less
255
(let* ((error-pos (decf (lexer-pos lexer)))
256
(number (get-number lexer :radix 8 :max-length 3)))
257
(make-char-from-code number error-pos)))
258
;; the following five character names are 'semi-standard'
259
;; according to the CLHS but I'm not aware of any implementation
260
;; that doesn't implement them
272
(code-char 7)) ; ASCII bell
274
(code-char 27)) ; ASCII escape
276
;; all other characters aren't affected by a backslash
279
(defun read-char-property (lexer first-char)
280
(declare #.*standard-optimize-settings*)
281
(unless (eql (next-char-non-extended lexer) #\{)
282
(signal-syntax-error* (lexer-pos lexer) "Expected left brace after \\~A." first-char))
283
(let ((name (with-output-to-string (out nil :element-type 'character)
285
(let ((char (or (next-char-non-extended lexer)
286
(signal-syntax-error "Unexpected EOF after \\~A{." first-char))))
287
(when (char= char #\})
289
(write-char char out))))))
290
(list (if (char= first-char #\p) :property :inverted-property)
293
(defun collect-char-class (lexer)
294
"Reads and consumes characters from regex string until a right
295
bracket is seen. Assembles them into a list \(which is returned) of
296
characters, character ranges, like \(:RANGE #\\A #\\E) for a-e, and
297
tokens representing special character classes."
298
(declare #.*standard-optimize-settings*)
299
(let ((start-pos (lexer-pos lexer)) ; remember start for error message
303
(flet ((handle-char (c)
304
"Do the right thing with character C depending on whether
305
we're inside a range or not."
306
(cond ((and hyphen-seen last-char)
307
(setf (car list) (list :range last-char c)
312
(setq hyphen-seen nil)))
313
(loop for first = t then nil
314
for c = (next-char-non-extended lexer)
315
;; leave loop if at end of string
319
;; we've seen a backslash
320
(let ((next-char (next-char-non-extended lexer)))
322
((#\d #\D #\w #\W #\s #\S)
323
;; a special character class
324
(push (map-char-to-special-char-class next-char) list)
325
;; if the last character was a hyphen
326
;; just collect it literally
329
;; if the next character is a hyphen do the same
330
(when (looking-at-p lexer #\-)
332
(incf (lexer-pos lexer)))
333
(setq hyphen-seen nil))
335
;; maybe a character property
336
(cond ((null *property-resolver*)
337
(handle-char next-char))
339
(push (read-char-property lexer next-char) list)
340
;; if the last character was a hyphen
341
;; just collect it literally
344
;; if the next character is a hyphen do the same
345
(when (looking-at-p lexer #\-)
347
(incf (lexer-pos lexer)))
348
(setq hyphen-seen nil))))
350
;; if \Q quoting is on we ignore \E,
351
;; otherwise it's just a plain #\E
352
(unless *allow-quoting*
355
;; otherwise unescape the following character(s)
356
(decf (lexer-pos lexer))
357
(handle-char (unescape-char lexer))))))
359
;; the first character must not be a right bracket
360
;; and isn't treated specially if it's a hyphen
363
;; end of character class
364
;; make sure we collect a pending hyphen
366
(setq hyphen-seen nil)
368
;; reverse the list to preserve the order intended
369
;; by the author of the regex string
370
(return-from collect-char-class (nreverse list)))
374
;; if the last character was 'just a character'
375
;; we expect to be in the middle of a range
376
(setq hyphen-seen t))
378
;; otherwise this is just an ordinary hyphen
381
;; default case - just collect the character
383
;; we can only exit the loop normally if we've reached the end
384
;; of the regex string without seeing a right bracket
385
(signal-syntax-error* start-pos "Missing right bracket to close character class."))))
387
(defun maybe-parse-flags (lexer)
388
(declare #.*standard-optimize-settings*)
389
"Reads a sequence of modifiers \(including #\\- to reverse their
390
meaning) and returns a corresponding list of \"flag\" tokens. The
391
\"x\" modifier is treated specially in that it dynamically modifies
392
the behaviour of the lexer itself via the special variable
396
for chr = (next-char-non-extended lexer)
398
do (signal-syntax-error "Unexpected end of string.")
399
while (find chr "-imsx" :test #'char=)
400
;; the first #\- will invert the meaning of all modifiers
404
else if (char= chr #\x)
405
do (setq *extended-mode-p* set)
413
:single-line-mode-p))
418
:not-multi-line-mode-p)
420
:not-single-line-mode-p))))
421
(decf (lexer-pos lexer))))
423
(defun get-quantifier (lexer)
424
(declare #.*standard-optimize-settings*)
425
"Returns a list of two values (min max) if what the lexer is looking
426
at can be interpreted as a quantifier. Otherwise returns NIL and
427
resets the lexer to its old position."
428
;; remember starting position for FAIL and UNGET-TOKEN functions
429
(push (lexer-pos lexer) (lexer-last-pos lexer))
430
(let ((next-char (next-char lexer)))
433
;; * (Kleene star): match 0 or more times
436
;; +: match 1 or more times
439
;; ?: match 0 or 1 times
443
;; {n}: match exactly n times
444
;; {n,}: match at least n times
445
;; {n,m}: match at least n but not more than m times
446
;; note that anything not matching one of these patterns will
447
;; be interpreted literally - even whitespace isn't allowed
448
(let ((num1 (get-number lexer :no-whitespace-p t)))
450
(let ((next-char (next-char-non-extended lexer)))
453
(let* ((num2 (get-number lexer :no-whitespace-p t))
454
(next-char (next-char-non-extended lexer)))
457
;; this is the case {n,} (NUM2 is NIL) or {n,m}
462
;; this is the case {n}
466
;; no number following left curly brace, so we treat it
467
;; like a normal character
469
;; cannot be a quantifier
473
(defun parse-register-name-aux (lexer)
474
"Reads and returns the name in a named register group. It is
475
assumed that the starting #\< character has already been read. The
476
closing #\> will also be consumed."
477
;; we have to look for an ending > character now
478
(let ((end-name (position #\>
480
:start (lexer-pos lexer)
483
;; there has to be > somewhere, syntax error otherwise
484
(signal-syntax-error* (1- (lexer-pos lexer)) "Opening #\< in named group has no closing #\>."))
485
(let ((name (subseq (lexer-str lexer)
488
(unless (every #'(lambda (char)
489
(or (alphanumericp char)
492
;; register name can contain only alphanumeric characters or #\-
493
(signal-syntax-error* (lexer-pos lexer) "Invalid character in named register group."))
494
;; advance lexer beyond "<name>" part
495
(setf (lexer-pos lexer) (1+ end-name))
498
(declaim (inline unget-token))
499
(defun unget-token (lexer)
500
(declare #.*standard-optimize-settings*)
501
"Moves the lexer back to the last position stored in the LAST-POS stack."
502
(if (lexer-last-pos lexer)
503
(setf (lexer-pos lexer)
504
(pop (lexer-last-pos lexer)))
505
(error "No token to unget \(this should not happen)")))
507
(defun get-token (lexer)
508
(declare #.*standard-optimize-settings*)
509
"Returns and consumes the next token from the regex string \(or NIL)."
510
;; remember starting position for UNGET-TOKEN function
511
(push (lexer-pos lexer)
512
(lexer-last-pos lexer))
513
(let ((next-char (next-char lexer)))
516
;; the easy cases first - the following six characters
517
;; always have a special meaning and get translated
518
;; into tokens immediately
532
;; quantifiers will always be consumend by
533
;; GET-QUANTIFIER, they must not appear here
534
(signal-syntax-error* (1- (lexer-pos lexer)) "Quantifier '~A' not allowed." next-char))
536
;; left brace isn't a special character in it's own
537
;; right but we must check if what follows might
538
;; look like a quantifier
539
(let ((this-pos (lexer-pos lexer))
540
(this-last-pos (lexer-last-pos lexer)))
542
(when (get-quantifier lexer)
543
(signal-syntax-error* (car this-last-pos)
544
"Quantifier '~A' not allowed."
545
(subseq (lexer-str lexer)
548
(setf (lexer-pos lexer) this-pos
549
(lexer-last-pos lexer) this-last-pos)
552
;; left bracket always starts a character class
553
(cons (cond ((looking-at-p lexer #\^)
554
(incf (lexer-pos lexer))
555
:inverted-char-class)
558
(collect-char-class lexer)))
560
;; backslash might mean different things so we have
561
;; to peek one char ahead:
562
(let ((next-char (next-char-non-extended lexer)))
565
:modeless-start-anchor)
567
:modeless-end-anchor)
569
:modeless-end-anchor-no-newline)
575
(cond ((and *allow-named-registers*
576
(looking-at-p lexer #\<))
577
;; back-referencing a named register
578
(incf (lexer-pos lexer))
579
(list :back-reference
580
(parse-register-name-aux lexer)))
582
;; false alarm, just unescape \k
584
((#\d #\D #\w #\W #\s #\S)
585
;; these will be treated like character classes
586
(map-char-to-special-char-class next-char))
587
((#\1 #\2 #\3 #\4 #\5 #\6 #\7 #\8 #\9)
589
(let* ((old-pos (decf (lexer-pos lexer)))
590
;; ...so let's get the whole number first
591
(backref-number (get-number lexer)))
592
(declare (fixnum backref-number))
593
(cond ((and (> backref-number (lexer-reg lexer))
594
(<= 10 backref-number))
595
;; \10 and higher are treated as octal
596
;; character codes if we haven't
597
;; opened that much register groups
599
(setf (lexer-pos lexer) old-pos)
600
;; re-read the number from the old
601
;; position and convert it to its
602
;; corresponding character
603
(make-char-from-code (get-number lexer :radix 8 :max-length 3)
606
;; otherwise this must refer to a
608
(list :back-reference backref-number)))))
610
;; this always means an octal character code
611
;; (at most three digits)
612
(let ((old-pos (decf (lexer-pos lexer))))
613
(make-char-from-code (get-number lexer :radix 8 :max-length 3)
616
;; might be a named property
617
(cond (*property-resolver* (read-char-property lexer next-char))
620
;; in all other cases just unescape the
622
(decf (lexer-pos lexer))
623
(unescape-char lexer)))))
625
;; an open parenthesis might mean different things
626
;; depending on what follows...
627
(cond ((looking-at-p lexer #\?)
628
;; this is the case '(?' (and probably more behind)
629
(incf (lexer-pos lexer))
630
;; we have to check for modifiers first
631
;; because a colon might follow
632
(let* ((flags (maybe-parse-flags lexer))
633
(next-char (next-char-non-extended lexer)))
634
;; modifiers are only allowed if a colon
635
;; or a closing parenthesis are following
637
(not (find next-char ":)" :test #'char=)))
638
(signal-syntax-error* (car (lexer-last-pos lexer))
639
"Sequence '~A' not recognized."
640
(subseq (lexer-str lexer)
641
(car (lexer-last-pos lexer))
646
(signal-syntax-error "End of string following '(?'."))
648
;; an empty group except for the flags
649
;; (if there are any)
660
;; positive look-ahead
663
;; negative look-ahead
664
:open-paren-exclamation)
666
;; non-capturing group - return flags as
668
(values :open-paren-colon flags))
670
;; might be a look-behind assertion or a named group, so
671
;; check next character
672
(let ((next-char (next-char-non-extended lexer)))
673
(cond ((and next-char
674
(alpha-char-p next-char))
675
;; we have encountered a named group
676
;; are we supporting register naming?
677
(unless *allow-named-registers*
678
(signal-syntax-error* (1- (lexer-pos lexer))
679
"Character '~A' may not follow '(?<' (because ~a = NIL)"
681
'*allow-named-registers*))
682
;; put the letter back
683
(decf (lexer-pos lexer))
685
:open-paren-less-letter)
689
;; positive look-behind
690
:open-paren-less-equal)
692
;; negative look-behind
693
:open-paren-less-exclamation)
695
;; Perl allows "(?<)" and treats
696
;; it like a null string
700
(signal-syntax-error "End of string following '(?<'."))
703
(signal-syntax-error* (1- (lexer-pos lexer))
704
"Character '~A' may not follow '(?<'."
707
(signal-syntax-error* (1- (lexer-pos lexer))
708
"Character '~A' may not follow '(?'."
711
;; if next-char was not #\? (this is within
712
;; the first COND), we've just seen an opening
713
;; parenthesis and leave it like that
716
;; all other characters are their own tokens
718
;; we didn't get a character (this if the "else" branch from
719
;; the first IF), so we don't return a token but NIL
721
(pop (lexer-last-pos lexer))
724
(declaim (inline start-of-subexpr-p))
725
(defun start-of-subexpr-p (lexer)
726
(declare #.*standard-optimize-settings*)
727
"Tests whether the next token can start a valid sub-expression, i.e.
728
a stand-alone regex."
729
(let* ((pos (lexer-pos lexer))
730
(next-char (next-char lexer)))
731
(not (or (null next-char)
733
(member (the character next-char)
736
(setf (lexer-pos lexer) pos))))))