Coverage report: /home/ellis/comp/ext/cl-ppcre/optimize.lisp
Kind | Covered | All | % |
expression | 369 | 504 | 73.2 |
branch | 35 | 58 | 60.3 |
Key
Not instrumented
Conditionalized out
Executed
Not executed
Both branches taken
One branch taken
Neither branch taken
1
;;; $Header: /usr/local/cvsrep/cl-ppcre/optimize.lisp,v 1.36 2009/09/17 19:17:31 edi Exp $
3
;;; This file contains optimizations which can be applied to converted
6
;;; Copyright (c) 2002-2009, Dr. Edmund Weitz. All rights reserved.
8
;;; Redistribution and use in source and binary forms, with or without
9
;;; modification, are permitted provided that the following conditions
12
;;; * Redistributions of source code must retain the above copyright
13
;;; notice, this list of conditions and the following disclaimer.
15
;;; * Redistributions in binary form must reproduce the above
16
;;; copyright notice, this list of conditions and the following
17
;;; disclaimer in the documentation and/or other materials
18
;;; provided with the distribution.
20
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
21
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
22
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
24
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
26
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
28
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
(in-package :cl-ppcre)
34
(defgeneric flatten (regex)
35
(declare #.*standard-optimize-settings*)
36
(:documentation "Merges adjacent sequences and alternations, i.e. it
37
transforms #<SEQ #<STR \"a\"> #<SEQ #<STR \"b\"> #<STR \"c\">>> to
38
#<SEQ #<STR \"a\"> #<STR \"b\"> #<STR \"c\">>. This is a destructive
39
operation on REGEX."))
41
(defmethod flatten ((seq seq))
42
(declare #.*standard-optimize-settings*)
43
;; this looks more complicated than it is because we modify SEQ in
44
;; place to avoid unnecessary consing
45
(let ((elements-rest (elements seq)))
49
(let ((flattened-element (flatten (car elements-rest)))
50
(next-elements-rest (cdr elements-rest)))
51
(cond ((typep flattened-element 'seq)
52
;; FLATTENED-ELEMENT is a SEQ object, so we "splice"
53
;; it into out list of elements
54
(let ((flattened-element-elements
55
(elements flattened-element)))
56
(setf (car elements-rest)
57
(car flattened-element-elements)
59
(nconc (cdr flattened-element-elements)
60
(cdr elements-rest)))))
62
;; otherwise we just replace the current element with
63
;; its flattened counterpart
64
(setf (car elements-rest) flattened-element)))
65
(setq elements-rest next-elements-rest))))
66
(let ((elements (elements seq)))
67
(cond ((cadr elements)
71
(t (make-instance 'void)))))
73
(defmethod flatten ((alternation alternation))
74
(declare #.*standard-optimize-settings*)
75
;; same algorithm as above
76
(let ((choices-rest (choices alternation)))
80
(let ((flattened-choice (flatten (car choices-rest)))
81
(next-choices-rest (cdr choices-rest)))
82
(cond ((typep flattened-choice 'alternation)
83
(let ((flattened-choice-choices
84
(choices flattened-choice)))
85
(setf (car choices-rest)
86
(car flattened-choice-choices)
88
(nconc (cdr flattened-choice-choices)
89
(cdr choices-rest)))))
91
(setf (car choices-rest) flattened-choice)))
92
(setq choices-rest next-choices-rest))))
93
(let ((choices (choices alternation)))
97
(signal-syntax-error "Encountered alternation without choices.")))))
99
(defmethod flatten ((branch branch))
100
(declare #.*standard-optimize-settings*)
101
(with-slots (test then-regex else-regex)
107
then-regex (flatten then-regex)
108
else-regex (flatten else-regex))
111
(defmethod flatten ((regex regex))
112
(declare #.*standard-optimize-settings*)
114
((or repetition register lookahead lookbehind standalone)
115
;; if REGEX contains exactly one inner REGEX object flatten it
117
(flatten (regex regex)))
120
;; otherwise (ANCHOR, BACK-REFERENCE, CHAR-CLASS, EVERYTHING,
121
;; LOOKAHEAD, LOOKBEHIND, STR, VOID, FILTER, and WORD-BOUNDARY)
125
(defgeneric gather-strings (regex)
126
(declare #.*standard-optimize-settings*)
127
(:documentation "Collects adjacent strings or characters into one
128
string provided they have the same case mode. This is a destructive
129
operation on REGEX."))
131
(defmethod gather-strings ((seq seq))
132
(declare #.*standard-optimize-settings*)
133
;; note that GATHER-STRINGS is to be applied after FLATTEN, i.e. it
134
;; expects SEQ to be flattened already; in particular, SEQ cannot be
135
;; empty and cannot contain embedded SEQ objects
136
(let* ((start-point (cons nil (elements seq)))
137
(curr-point start-point)
143
(declare (fixnum collector-length))
145
(let ((elements-rest (cdr curr-point)))
146
(unless elements-rest
148
(let* ((element (car elements-rest))
149
(case-mode (case-mode element old-case-mode)))
150
(cond ((and case-mode
151
(eq case-mode old-case-mode))
152
;; if ELEMENT is a STR and we have collected a STR of
153
;; the same case mode in the last iteration we
154
;; concatenate ELEMENT onto COLLECTOR and remember the
155
;; value of its SKIP slot
156
(let ((old-collector-length collector-length))
157
(unless (and (adjustable-array-p collector)
158
(array-has-fill-pointer-p collector))
160
(make-array collector-length
161
:initial-contents collector
162
:element-type 'character
165
collector-start nil))
166
(adjust-array collector
167
(incf collector-length (len element))
169
(setf (subseq collector
170
old-collector-length)
172
;; it suffices to remember the last SKIP slot
173
;; because due to the way MAYBE-ACCUMULATE
174
;; works adjacent STR objects have the same
176
skip (skip element)))
177
(setf (cdr curr-point) (cdr elements-rest)))
179
(let ((collected-string
180
(cond (collector-start
183
;; if we have collected something already
184
;; we convert it into a STR
193
;; if ELEMENT is a string with a different case
194
;; mode than the last one we have either just
195
;; converted COLLECTOR into a STR or COLLECTOR
196
;; is still empty; in both cases we can now
197
;; begin to fill it anew
198
(setq collector (str element)
199
collector-start element
200
;; and we remember the SKIP value as above
202
collector-length (len element))
203
(cond (collected-string
204
(setf (car elements-rest)
209
(setf (cdr curr-point)
210
(cdr elements-rest)))))
212
;; otherwise this is not a STR so we apply
213
;; GATHER-STRINGS to it and collect it directly
215
(cond (collected-string
216
(setf (car elements-rest)
221
(cons (gather-strings element)
226
(setf (car elements-rest)
227
(gather-strings element)
230
;; we also have to empty COLLECTOR here in case
231
;; it was still filled from the last iteration
233
collector-start nil))))))
234
(setq old-case-mode case-mode))))
236
(setf (cdr curr-point)
245
(setf (elements seq) (cdr start-point))
248
(defmethod gather-strings ((alternation alternation))
249
(declare #.*standard-optimize-settings*)
250
;; loop ON the choices of ALTERNATION so we can modify them directly
251
(loop for choices-rest on (choices alternation)
253
do (setf (car choices-rest)
254
(gather-strings (car choices-rest))))
257
(defmethod gather-strings ((branch branch))
258
(declare #.*standard-optimize-settings*)
259
(with-slots (test then-regex else-regex)
264
(gather-strings test))
265
then-regex (gather-strings then-regex)
266
else-regex (gather-strings else-regex))
269
(defmethod gather-strings ((regex regex))
270
(declare #.*standard-optimize-settings*)
272
((or repetition register lookahead lookbehind standalone)
273
;; if REGEX contains exactly one inner REGEX object apply
274
;; GATHER-STRINGS to it
276
(gather-strings (regex regex)))
279
;; otherwise (ANCHOR, BACK-REFERENCE, CHAR-CLASS, EVERYTHING,
280
;; LOOKAHEAD, LOOKBEHIND, STR, VOID, FILTER, and WORD-BOUNDARY)
284
;; Note that START-ANCHORED-P will be called after FLATTEN and GATHER-STRINGS.
286
(defgeneric start-anchored-p (regex &optional in-seq-p)
287
(declare #.*standard-optimize-settings*)
288
(:documentation "Returns T if REGEX starts with a \"real\" start
289
anchor, i.e. one that's not in multi-line mode, NIL otherwise. If
290
IN-SEQ-P is true the function will return :ZERO-LENGTH if REGEX is a
291
zero-length assertion."))
293
(defmethod start-anchored-p ((seq seq) &optional in-seq-p)
294
(declare (ignore in-seq-p))
295
;; note that START-ANCHORED-P is to be applied after FLATTEN and
296
;; GATHER-STRINGS, i.e. SEQ cannot be empty and cannot contain
297
;; embedded SEQ objects
298
(loop for element in (elements seq)
299
for anchored-p = (start-anchored-p element t)
300
;; skip zero-length elements because they won't affect the
301
;; "anchoredness" of the sequence
302
while (eq anchored-p :zero-length)
303
finally (return (and anchored-p (not (eq anchored-p :zero-length))))))
305
(defmethod start-anchored-p ((alternation alternation) &optional in-seq-p)
306
(declare #.*standard-optimize-settings*)
307
(declare (ignore in-seq-p))
308
;; clearly an alternation can only be start-anchored if all of its
309
;; choices are start-anchored
310
(loop for choice in (choices alternation)
311
always (start-anchored-p choice)))
313
(defmethod start-anchored-p ((branch branch) &optional in-seq-p)
314
(declare #.*standard-optimize-settings*)
315
(declare (ignore in-seq-p))
316
(and (start-anchored-p (then-regex branch))
317
(start-anchored-p (else-regex branch))))
319
(defmethod start-anchored-p ((repetition repetition) &optional in-seq-p)
320
(declare #.*standard-optimize-settings*)
321
(declare (ignore in-seq-p))
322
;; well, this wouldn't make much sense, but anyway...
323
(and (plusp (minimum repetition))
324
(start-anchored-p (regex repetition))))
326
(defmethod start-anchored-p ((register register) &optional in-seq-p)
327
(declare #.*standard-optimize-settings*)
328
(declare (ignore in-seq-p))
329
(start-anchored-p (regex register)))
331
(defmethod start-anchored-p ((standalone standalone) &optional in-seq-p)
332
(declare #.*standard-optimize-settings*)
333
(declare (ignore in-seq-p))
334
(start-anchored-p (regex standalone)))
336
(defmethod start-anchored-p ((anchor anchor) &optional in-seq-p)
337
(declare #.*standard-optimize-settings*)
338
(declare (ignore in-seq-p))
340
(not (multi-line-p anchor))))
342
(defmethod start-anchored-p ((regex regex) &optional in-seq-p)
343
(declare #.*standard-optimize-settings*)
345
((or lookahead lookbehind word-boundary void)
346
;; zero-length assertions
357
;; BACK-REFERENCE, CHAR-CLASS, EVERYTHING, and STR
360
;; Note that END-STRING-AUX will be called after FLATTEN and GATHER-STRINGS.
362
(defgeneric end-string-aux (regex &optional old-case-insensitive-p)
363
(declare #.*standard-optimize-settings*)
364
(:documentation "Returns the constant string (if it exists) REGEX
365
ends with wrapped into a STR object, otherwise NIL.
366
OLD-CASE-INSENSITIVE-P is the CASE-INSENSITIVE-P slot of the last STR
367
collected or :VOID if no STR has been collected yet. (This is a helper
368
function called by END-STRING.)"))
370
(defmethod end-string-aux ((str str)
371
&optional (old-case-insensitive-p :void))
372
(declare #.*standard-optimize-settings*)
373
(declare (special last-str))
374
(cond ((and (not (skip str)) ; avoid constituents of STARTS-WITH
375
;; only use STR if nothing has been collected yet or if
376
;; the collected string has the same value for
377
;; CASE-INSENSITIVE-P
378
(or (eq old-case-insensitive-p :void)
379
(eq (case-insensitive-p str) old-case-insensitive-p)))
381
;; set the SKIP property of this STR
386
(defmethod end-string-aux ((seq seq)
387
&optional (old-case-insensitive-p :void))
388
(declare #.*standard-optimize-settings*)
389
(declare (special continuep))
390
(let (case-insensitive-p
393
(concatenated-length 0))
394
(declare (fixnum concatenated-length))
395
(loop for element in (reverse (elements seq))
396
;; remember the case-(in)sensitivity of the last relevant
398
for loop-old-case-insensitive-p = old-case-insensitive-p
400
loop-old-case-insensitive-p
401
(case-insensitive-p element-end))
402
;; the end-string of the current element
403
for element-end = (end-string-aux element
404
loop-old-case-insensitive-p)
405
;; whether we encountered a zero-length element
406
for skip = (if element-end
407
(zerop (len element-end))
409
;; set CONTINUEP to NIL if we have to stop collecting to
410
;; alert END-STRING-AUX methods on enclosing SEQ objects
412
do (setq continuep nil)
413
;; end loop if we neither got a STR nor a zero-length
416
;; only collect if not zero-length
418
do (cond (concatenated-string
419
(when concatenated-start
420
(setf concatenated-string
421
(make-array concatenated-length
422
:initial-contents (reverse (str concatenated-start))
423
:element-type 'character
426
concatenated-start nil))
427
(let ((len (len element-end))
428
(str (str element-end)))
429
(declare (fixnum len))
430
(incf concatenated-length len)
431
(loop for i of-type fixnum downfrom (1- len) to 0
432
do (vector-push-extend (char str i)
433
concatenated-string))))
435
(setf concatenated-string
442
(case-insensitive-p element-end))))
443
;; stop collecting if END-STRING-AUX on inner SEQ has said so
445
(cond ((zerop concatenated-length)
446
;; don't bother to return zero-length strings
452
:str (nreverse concatenated-string)
453
:case-insensitive-p case-insensitive-p)))))
455
(defmethod end-string-aux ((register register)
456
&optional (old-case-insensitive-p :void))
457
(declare #.*standard-optimize-settings*)
458
(end-string-aux (regex register) old-case-insensitive-p))
460
(defmethod end-string-aux ((standalone standalone)
461
&optional (old-case-insensitive-p :void))
462
(declare #.*standard-optimize-settings*)
463
(end-string-aux (regex standalone) old-case-insensitive-p))
465
(defmethod end-string-aux ((regex regex)
466
&optional (old-case-insensitive-p :void))
467
(declare #.*standard-optimize-settings*)
468
(declare (special last-str end-anchored-p continuep))
470
((or anchor lookahead lookbehind word-boundary void)
471
;; a zero-length REGEX object - for the sake of END-STRING-AUX
472
;; this is a zero-length string
473
(when (and (typep regex 'anchor)
475
(or (no-newline-p regex)
476
(not (multi-line-p regex)))
477
(eq old-case-insensitive-p :void))
478
;; if this is a "real" end-anchor and we haven't collected
479
;; anything so far we can set END-ANCHORED-P (where 1 or 0
480
;; indicate whether we accept a #\Newline at the end or not)
481
(setq end-anchored-p (if (no-newline-p regex) 0 1)))
484
:case-insensitive-p :void))
486
;; (ALTERNATION, BACK-REFERENCE, BRANCH, CHAR-CLASS, EVERYTHING,
487
;; REPETITION, FILTER)
490
(defun end-string (regex)
491
(declare (special end-string-offset))
492
(declare #.*standard-optimize-settings*)
493
"Returns the constant string (if it exists) REGEX ends with wrapped
494
into a STR object, otherwise NIL."
495
;; LAST-STR points to the last STR object (seen from the end) that's
496
;; part of END-STRING; CONTINUEP is set to T if we stop collecting
497
;; in the middle of a SEQ
500
(declare (special continuep last-str))
502
(end-string-aux regex)
504
;; if we've found something set the START-OF-END-STRING-P of
505
;; the leftmost STR collected accordingly and remember the
506
;; OFFSET of this STR (in a special variable provided by the
507
;; caller of this function)
508
(setf (start-of-end-string-p last-str) t
509
end-string-offset (offset last-str))))))
511
(defgeneric compute-min-rest (regex current-min-rest)
512
(declare #.*standard-optimize-settings*)
513
(:documentation "Returns the minimal length of REGEX plus
514
CURRENT-MIN-REST. This is similar to REGEX-MIN-LENGTH except that it
515
recurses down into REGEX and sets the MIN-REST slots of REPETITION
518
(defmethod compute-min-rest ((seq seq) current-min-rest)
519
(declare #.*standard-optimize-settings*)
520
(loop for element in (reverse (elements seq))
521
for last-min-rest = current-min-rest then this-min-rest
522
for this-min-rest = (compute-min-rest element last-min-rest)
523
finally (return this-min-rest)))
525
(defmethod compute-min-rest ((alternation alternation) current-min-rest)
526
(declare #.*standard-optimize-settings*)
527
(loop for choice in (choices alternation)
528
minimize (compute-min-rest choice current-min-rest)))
530
(defmethod compute-min-rest ((branch branch) current-min-rest)
531
(declare #.*standard-optimize-settings*)
532
(min (compute-min-rest (then-regex branch) current-min-rest)
533
(compute-min-rest (else-regex branch) current-min-rest)))
535
(defmethod compute-min-rest ((str str) current-min-rest)
536
(declare #.*standard-optimize-settings*)
537
(+ current-min-rest (len str)))
539
(defmethod compute-min-rest ((filter filter) current-min-rest)
540
(declare #.*standard-optimize-settings*)
541
(+ current-min-rest (or (len filter) 0)))
543
(defmethod compute-min-rest ((repetition repetition) current-min-rest)
544
(declare #.*standard-optimize-settings*)
545
(setf (min-rest repetition) current-min-rest)
546
(compute-min-rest (regex repetition) current-min-rest)
547
(+ current-min-rest (* (minimum repetition) (min-len repetition))))
549
(defmethod compute-min-rest ((register register) current-min-rest)
550
(declare #.*standard-optimize-settings*)
551
(compute-min-rest (regex register) current-min-rest))
553
(defmethod compute-min-rest ((standalone standalone) current-min-rest)
554
(declare #.*standard-optimize-settings*)
555
(declare (ignore current-min-rest))
556
(compute-min-rest (regex standalone) 0))
558
(defmethod compute-min-rest ((lookahead lookahead) current-min-rest)
559
(declare #.*standard-optimize-settings*)
560
(compute-min-rest (regex lookahead) 0)
563
(defmethod compute-min-rest ((lookbehind lookbehind) current-min-rest)
564
(declare #.*standard-optimize-settings*)
565
(compute-min-rest (regex lookbehind) (+ current-min-rest (len lookbehind)))
568
(defmethod compute-min-rest ((regex regex) current-min-rest)
569
(declare #.*standard-optimize-settings*)
571
((or char-class everything)
572
(1+ current-min-rest))
574
;; zero min-len and no embedded regexes (ANCHOR,
575
;; BACK-REFERENCE, VOID, and WORD-BOUNDARY)