Coverage report: /home/ellis/comp/ext/cl-ppcre/scanner.lisp
Kind | Covered | All | % |
expression | 50 | 245 | 20.4 |
branch | 11 | 58 | 19.0 |
Key
Not instrumented
Conditionalized out
Executed
Not executed
Both branches taken
One branch taken
Neither branch taken
1
;;; $Header: /usr/local/cvsrep/cl-ppcre/scanner.lisp,v 1.36 2009/09/17 19:17:31 edi Exp $
3
;;; Here the scanner for the actual regex as well as utility scanners
4
;;; for the constant start and end strings are created.
6
;;; Copyright (c) 2002-2009, Dr. Edmund Weitz. All rights reserved.
8
;;; Redistribution and use in source and binary forms, with or without
9
;;; modification, are permitted provided that the following conditions
12
;;; * Redistributions of source code must retain the above copyright
13
;;; notice, this list of conditions and the following disclaimer.
15
;;; * Redistributions in binary form must reproduce the above
16
;;; copyright notice, this list of conditions and the following
17
;;; disclaimer in the documentation and/or other materials
18
;;; provided with the distribution.
20
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
21
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
22
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
24
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
26
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
28
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
(in-package :cl-ppcre)
34
(defmacro bmh-matcher-aux (&key case-insensitive-p)
35
"Auxiliary macro used by CREATE-BMH-MATCHER."
36
(let ((char-compare (if case-insensitive-p 'char-equal 'char=)))
38
(declare (fixnum start-pos))
39
(if (or (minusp start-pos)
40
(> (the fixnum (+ start-pos m)) *end-pos*))
42
(loop named bmh-matcher
43
for k of-type fixnum = (+ start-pos m -1)
44
then (+ k (max 1 (aref skip (char-code (schar *string* k)))))
46
do (loop for j of-type fixnum downfrom (1- m)
47
for i of-type fixnum downfrom k
49
(,char-compare (schar *string* i)
51
finally (if (minusp j)
52
(return-from bmh-matcher (1+ i)))))))))
54
(defun create-bmh-matcher (pattern case-insensitive-p)
55
"Returns a Boyer-Moore-Horspool matcher which searches the (special)
56
simple-string *STRING* for the first occurence of the substring
57
PATTERN. The search starts at the position START-POS within *STRING*
58
and stops before *END-POS* is reached. Depending on the second
59
argument the search is case-insensitive or not. If the special
60
variable *USE-BMH-MATCHERS* is NIL, use the standard SEARCH function
61
instead. \(BMH matchers are faster but need much more space.)"
62
(declare #.*standard-optimize-settings*)
63
;; see <http://www-igm.univ-mlv.fr/~lecroq/string/node18.html> for
65
(unless *use-bmh-matchers*
66
(let ((test (if case-insensitive-p #'char-equal #'char=)))
67
(return-from create-bmh-matcher
69
(declare (fixnum start-pos))
70
(and (not (minusp start-pos))
76
(let* ((m (length pattern))
77
(skip (make-array *regex-char-code-limit*
81
(loop for k of-type fixnum below m
83
do (setf (aref skip (char-code (char-upcase (schar pattern k)))) (- m k 1)
84
(aref skip (char-code (char-downcase (schar pattern k)))) (- m k 1))
86
do (setf (aref skip (char-code (schar pattern k))) (- m k 1)))
87
(if case-insensitive-p
88
(bmh-matcher-aux :case-insensitive-p t)
91
(defmacro char-searcher-aux (&key case-insensitive-p)
92
"Auxiliary macro used by CREATE-CHAR-SEARCHER."
93
(let ((char-compare (if case-insensitive-p 'char-equal 'char=)))
95
(declare (fixnum start-pos))
96
(and (not (minusp start-pos))
97
(loop for i of-type fixnum from start-pos below *end-pos*
98
thereis (and (,char-compare (schar *string* i) chr) i))))))
100
(defun create-char-searcher (chr case-insensitive-p)
101
"Returns a function which searches the (special) simple-string
102
*STRING* for the first occurence of the character CHR. The search
103
starts at the position START-POS within *STRING* and stops before
104
*END-POS* is reached. Depending on the second argument the search is
105
case-insensitive or not."
106
(declare #.*standard-optimize-settings*)
107
(if case-insensitive-p
108
(char-searcher-aux :case-insensitive-p t)
109
(char-searcher-aux)))
111
(declaim (inline newline-skipper))
112
(defun newline-skipper (start-pos)
113
"Finds the next occurence of a character in *STRING* which is behind
115
(declare #.*standard-optimize-settings*)
116
(declare (fixnum start-pos))
117
;; we can start with (1- START-POS) without testing for (PLUSP
118
;; START-POS) because we know we'll never call NEWLINE-SKIPPER on
119
;; the first iteration
120
(loop for i of-type fixnum from (1- start-pos) below *end-pos*
121
thereis (and (char= (schar *string* i)
125
(defmacro insert-advance-fn (advance-fn)
126
"Creates the actual closure returned by CREATE-SCANNER-AUX by
127
replacing '(ADVANCE-FN-DEFINITION) with a suitable definition for
128
ADVANCE-FN. This is a utility macro used by CREATE-SCANNER-AUX."
130
advance-fn '(advance-fn-definition)
131
'(lambda (string start end)
133
;; initialize a couple of special variables used by the
134
;; matchers (see file specials.lisp)
135
(let* ((*string* string)
138
;; we will search forward for END-STRING if this value
139
;; isn't at least as big as POS (see ADVANCE-FN), so it
140
;; is safe to start to the left of *START-POS*; note
141
;; that this value will _never_ be decremented - this
142
;; is crucial to the scanning process
143
(*end-string-pos* (1- *start-pos*))
144
;; the next five will shadow the variables defined by
145
;; DEFPARAMETER; at this point, we don't know if we'll
146
;; actually use them, though
147
(*repeat-counters* *repeat-counters*)
148
(*last-pos-stores* *last-pos-stores*)
149
(*reg-starts* *reg-starts*)
150
(*regs-maybe-start* *regs-maybe-start*)
151
(*reg-ends* *reg-ends*)
152
;; we might be able to optimize the scanning process by
153
;; (virtually) shifting *START-POS* to the right
154
(scan-start-pos *start-pos*)
155
(starts-with-str (if start-string-test
158
;; we don't need to try further than MAX-END-POS
159
(max-end-pos (- *end-pos* min-len)))
160
(declare (fixnum scan-start-pos)
162
;; definition of ADVANCE-FN will be inserted here by macrology
163
(labels ((advance-fn-definition))
164
(declare (inline advance-fn))
165
(when (plusp rep-num)
166
;; we have at least one REPETITION which needs to count
167
;; the number of repetitions
168
(setq *repeat-counters* (make-array rep-num
170
:element-type 'fixnum)))
171
(when (plusp zero-length-num)
172
;; we have at least one REPETITION which needs to watch
173
;; out for zero-length repetitions
174
(setq *last-pos-stores* (make-array zero-length-num
175
:initial-element nil)))
176
(when (plusp reg-num)
177
;; we have registers in our regular expression
178
(setq *reg-starts* (make-array reg-num :initial-element nil)
179
*regs-maybe-start* (make-array reg-num :initial-element nil)
180
*reg-ends* (make-array reg-num :initial-element nil)))
182
;; the regular expression has a constant end string which
183
;; is anchored at the very end of the target string
184
;; (perhaps modulo a #\Newline)
185
(let ((end-test-pos (- *end-pos* (the fixnum end-string-len))))
186
(declare (fixnum end-test-pos)
187
(function end-string-test))
188
(unless (setq *end-string-pos* (funcall end-string-test
190
(when (and (= 1 (the fixnum end-anchored-p))
191
(> *end-pos* scan-start-pos)
192
(char= #\Newline (schar *string* (1- *end-pos*))))
193
;; if we didn't find an end string candidate from
194
;; END-TEST-POS and if a #\Newline at the end is
195
;; allowed we try it again from one position to the
197
(setq *end-string-pos* (funcall end-string-test
198
(1- end-test-pos))))))
199
(unless (and *end-string-pos*
200
(<= *start-pos* *end-string-pos*))
201
;; no end string candidate found, so give up
202
(return-from scan nil))
203
(when end-string-offset
204
;; if the offset of the constant end string from the
205
;; left of the regular expression is known we can start
206
;; scanning further to the right; this is similar to
207
;; what we might do in ADVANCE-FN
208
(setq scan-start-pos (max scan-start-pos
209
(- (the fixnum *end-string-pos*)
210
(the fixnum end-string-offset))))))
213
;; we're anchored at the start of the target string,
214
;; so no need to try again after first failure
215
(when (or (/= *start-pos* scan-start-pos)
216
(< max-end-pos *start-pos*))
217
;; if END-STRING-OFFSET has proven that we don't
218
;; need to bother to scan from *START-POS* or if the
219
;; minimal length of the regular expression is
220
;; longer than the target string we give up
221
(return-from scan nil))
222
(when starts-with-str
224
(declare (fixnum starts-with-len))
225
(cond ((and (case-insensitive-p starts-with)
226
(not (*string*-equal starts-with-str
231
;; the regular expression has a
232
;; case-insensitive constant start string
233
;; and we didn't find it
234
(return-from scan nil))
235
((and (not (case-insensitive-p starts-with))
236
(not (*string*= starts-with-str
238
(+ *start-pos* starts-with-len)
240
;; the regular expression has a
241
;; case-sensitive constant start string
242
;; and we didn't find it
243
(return-from scan nil))
245
(when (and end-string-test
246
(not end-anchored-p))
247
;; the regular expression has a constant end string
248
;; which isn't anchored so we didn't check for it
250
(block end-string-loop
251
;; we temporarily use *END-STRING-POS* as our
252
;; starting position to look for end string
254
(setq *end-string-pos* *start-pos*)
256
(unless (setq *end-string-pos*
257
(funcall (the function end-string-test)
259
;; no end string candidate found, so give up
260
(return-from scan nil))
261
(unless end-string-offset
262
;; end string doesn't have an offset so we
263
;; can start scanning now
264
(return-from end-string-loop))
265
(let ((maybe-start-pos (- (the fixnum *end-string-pos*)
266
(the fixnum end-string-offset))))
267
(cond ((= maybe-start-pos *start-pos*)
268
;; offset of end string into regular
269
;; expression matches start anchor -
271
(return-from end-string-loop))
272
((and (< maybe-start-pos *start-pos*)
273
(< (+ *end-string-pos* end-string-len) *end-pos*))
274
;; no match but maybe we find another
275
;; one to the right - try again
276
(incf *end-string-pos*))
279
(return-from scan nil)))))))
280
;; if we got here we scan exactly once
281
(let ((next-pos (funcall match-fn *start-pos*)))
283
(values (if next-pos *start-pos* nil)
288
(loop for pos = (if starts-with-everything
289
;; don't jump to the next
290
;; #\Newline on the first
293
(advance-fn scan-start-pos))
294
then (advance-fn pos)
295
;; give up if the regular expression can't fit
296
;; into the rest of the target string
298
(<= (the fixnum pos) max-end-pos))
299
do (let ((next-pos (funcall match-fn pos)))
301
(return-from scan (values pos
305
;; not yet found, increment POS
306
(incf (the fixnum pos))))))))))
309
(defun create-scanner-aux (match-fn
321
"Auxiliary function to create and return a scanner \(which is
322
actually a closure). Used by CREATE-SCANNER."
323
(declare #.*standard-optimize-settings*)
324
(declare (fixnum min-len zero-length-num rep-num reg-num))
325
(let ((starts-with-len (if (typep starts-with 'str)
327
(starts-with-everything (typep starts-with 'everything)))
329
;; this COND statement dispatches on the different versions we
330
;; have for ADVANCE-FN and creates different closures for each;
331
;; note that you see only the bodies of ADVANCE-FN below - the
332
;; actual scanner is defined in INSERT-ADVANCE-FN above; (we
333
;; could have done this with closures instead of macrology but
334
;; would have consed a lot more)
335
((and start-string-test end-string-test end-string-offset)
336
;; we know that the regular expression has constant start and
337
;; end strings and we know the end string's offset (from the
341
(declare (fixnum end-string-offset starts-with-len)
342
(function start-string-test end-string-test))
344
(unless (setq pos (funcall start-string-test pos))
345
;; give up completely if we can't find a start string
347
(return-from scan nil))
349
;; from here we know that POS is a FIXNUM
350
(declare (fixnum pos))
351
(when (= pos (- (the fixnum *end-string-pos*) end-string-offset))
352
;; if we already found an end string candidate the
353
;; position of which matches the start string
354
;; candidate we're done
355
(return-from advance-fn pos))
356
(let ((try-pos (+ pos starts-with-len)))
357
;; otherwise try (again) to find an end string
358
;; candidate which starts behind the start string
361
(unless (setq *end-string-pos*
362
(funcall end-string-test try-pos))
363
;; no end string candidate found, so give up
364
(return-from scan nil))
365
;; NEW-POS is where we should start scanning
366
;; according to the end string candidate
367
(let ((new-pos (- (the fixnum *end-string-pos*)
369
(declare (fixnum new-pos *end-string-pos*))
370
(cond ((= new-pos pos)
371
;; if POS and NEW-POS are equal then the
372
;; two candidates agree so we're fine
373
(return-from advance-fn pos))
375
;; if NEW-POS is further to the right we
376
;; advance POS and try again, i.e. we go
377
;; back to the start of the outer LOOP
379
;; this means "return from inner LOOP"
382
;; otherwise NEW-POS is smaller than POS,
383
;; so we have to redo the inner LOOP to
384
;; find another end string candidate
385
;; further to the right
386
(setq try-pos (1+ *end-string-pos*))))))))))))
387
((and starts-with-everything end-string-test end-string-offset)
388
;; we know that the regular expression starts with ".*" (which
389
;; is not in single-line-mode, see CREATE-SCANNER-AUX) and ends
390
;; with a constant end string and we know the end string's
391
;; offset (from the left)
394
(declare (fixnum end-string-offset)
395
(function end-string-test))
397
(unless (setq pos (newline-skipper pos))
398
;; if we can't find a #\Newline we give up immediately
399
(return-from scan nil))
401
;; from here we know that POS is a FIXNUM
402
(declare (fixnum pos))
403
(when (= pos (- (the fixnum *end-string-pos*) end-string-offset))
404
;; if we already found an end string candidate the
405
;; position of which matches the place behind the
406
;; #\Newline we're done
407
(return-from advance-fn pos))
409
;; otherwise try (again) to find an end string
410
;; candidate which starts behind the #\Newline
412
(unless (setq *end-string-pos*
413
(funcall end-string-test try-pos))
414
;; no end string candidate found, so we give up
415
(return-from scan nil))
416
;; NEW-POS is where we should start scanning
417
;; according to the end string candidate
418
(let ((new-pos (- (the fixnum *end-string-pos*)
420
(declare (fixnum new-pos *end-string-pos*))
421
(cond ((= new-pos pos)
422
;; if POS and NEW-POS are equal then the
423
;; the end string candidate agrees with
424
;; the #\Newline so we're fine
425
(return-from advance-fn pos))
427
;; if NEW-POS is further to the right we
428
;; advance POS and try again, i.e. we go
429
;; back to the start of the outer LOOP
431
;; this means "return from inner LOOP"
434
;; otherwise NEW-POS is smaller than POS,
435
;; so we have to redo the inner LOOP to
436
;; find another end string candidate
437
;; further to the right
438
(setq try-pos (1+ *end-string-pos*))))))))))))
439
((and start-string-test end-string-test)
440
;; we know that the regular expression has constant start and
441
;; end strings; similar to the first case but we only need to
442
;; check for the end string, it doesn't provide enough
443
;; information to advance POS
446
(declare (function start-string-test end-string-test))
447
(unless (setq pos (funcall start-string-test pos))
448
(return-from scan nil))
449
(if (<= (the fixnum pos)
450
(the fixnum *end-string-pos*))
451
(return-from advance-fn pos))
452
(unless (setq *end-string-pos* (funcall end-string-test pos))
453
(return-from scan nil))
455
((and starts-with-everything end-string-test)
456
;; we know that the regular expression starts with ".*" (which
457
;; is not in single-line-mode, see CREATE-SCANNER-AUX) and ends
458
;; with a constant end string; similar to the second case but we
459
;; only need to check for the end string, it doesn't provide
460
;; enough information to advance POS
463
(declare (function end-string-test))
464
(unless (setq pos (newline-skipper pos))
465
(return-from scan nil))
466
(if (<= (the fixnum pos)
467
(the fixnum *end-string-pos*))
468
(return-from advance-fn pos))
469
(unless (setq *end-string-pos* (funcall end-string-test pos))
470
(return-from scan nil))
473
;; just check for constant start string candidate
476
(declare (function start-string-test))
477
(unless (setq pos (funcall start-string-test pos))
478
(return-from scan nil))
480
(starts-with-everything
481
;; just advance POS with NEWLINE-SKIPPER
484
(unless (setq pos (newline-skipper pos))
485
(return-from scan nil))
488
;; just check for the next end string candidate if POS has
489
;; advanced beyond the last one
492
(declare (function end-string-test))
493
(if (<= (the fixnum pos)
494
(the fixnum *end-string-pos*))
495
(return-from advance-fn pos))
496
(unless (setq *end-string-pos* (funcall end-string-test pos))
497
(return-from scan nil))
500
;; not enough optimization information about the regular
501
;; expression to optimize so we just return POS