Coverage report: /home/ellis/comp/ext/cl-ppcre/regex-class-util.lisp
Kind | Covered | All | % |
expression | 154 | 446 | 34.5 |
branch | 11 | 28 | 39.3 |
Key
Not instrumented
Conditionalized out
Executed
Not executed
Both branches taken
One branch taken
Neither branch taken
1
;;; $Header: /usr/local/cvsrep/cl-ppcre/regex-class-util.lisp,v 1.9 2009/09/17 19:17:31 edi Exp $
3
;;; This file contains some utility methods for REGEX objects.
5
;;; Copyright (c) 2002-2009, Dr. Edmund Weitz. All rights reserved.
7
;;; Redistribution and use in source and binary forms, with or without
8
;;; modification, are permitted provided that the following conditions
11
;;; * Redistributions of source code must retain the above copyright
12
;;; notice, this list of conditions and the following disclaimer.
14
;;; * Redistributions in binary form must reproduce the above
15
;;; copyright notice, this list of conditions and the following
16
;;; disclaimer in the documentation and/or other materials
17
;;; provided with the distribution.
19
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
20
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
23
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
25
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
28
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
29
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
(in-package :cl-ppcre)
33
;;; The following four methods allow a VOID object to behave like a
34
;;; zero-length STR object (only readers needed)
36
(defmethod len ((void void))
37
(declare #.*standard-optimize-settings*)
40
(defmethod str ((void void))
41
(declare #.*standard-optimize-settings*)
44
(defmethod skip ((void void))
45
(declare #.*standard-optimize-settings*)
48
(defmethod start-of-end-string-p ((void void))
49
(declare #.*standard-optimize-settings*)
52
(defgeneric case-mode (regex old-case-mode)
53
(declare #.*standard-optimize-settings*)
54
(:documentation "Utility function used by the optimizer (see GATHER-STRINGS).
55
Returns a keyword denoting the case-(in)sensitivity of a STR or its
56
second argument if the STR has length 0. Returns NIL for REGEX objects
57
which are not of type STR."))
59
(defmethod case-mode ((str str) old-case-mode)
60
(declare #.*standard-optimize-settings*)
61
(cond ((zerop (len str))
63
((case-insensitive-p str)
68
(defmethod case-mode ((regex regex) old-case-mode)
69
(declare #.*standard-optimize-settings*)
70
(declare (ignore old-case-mode))
73
(defgeneric copy-regex (regex)
74
(declare #.*standard-optimize-settings*)
75
(:documentation "Implements a deep copy of a REGEX object."))
77
(defmethod copy-regex ((anchor anchor))
78
(declare #.*standard-optimize-settings*)
79
(make-instance 'anchor
80
:startp (startp anchor)
81
:multi-line-p (multi-line-p anchor)
82
:no-newline-p (no-newline-p anchor)))
84
(defmethod copy-regex ((everything everything))
85
(declare #.*standard-optimize-settings*)
86
(make-instance 'everything
87
:single-line-p (single-line-p everything)))
89
(defmethod copy-regex ((word-boundary word-boundary))
90
(declare #.*standard-optimize-settings*)
91
(make-instance 'word-boundary
92
:negatedp (negatedp word-boundary)))
94
(defmethod copy-regex ((void void))
95
(declare #.*standard-optimize-settings*)
96
(make-instance 'void))
98
(defmethod copy-regex ((lookahead lookahead))
99
(declare #.*standard-optimize-settings*)
100
(make-instance 'lookahead
101
:regex (copy-regex (regex lookahead))
102
:positivep (positivep lookahead)))
104
(defmethod copy-regex ((seq seq))
105
(declare #.*standard-optimize-settings*)
107
:elements (mapcar #'copy-regex (elements seq))))
109
(defmethod copy-regex ((alternation alternation))
110
(declare #.*standard-optimize-settings*)
111
(make-instance 'alternation
112
:choices (mapcar #'copy-regex (choices alternation))))
114
(defmethod copy-regex ((branch branch))
115
(declare #.*standard-optimize-settings*)
118
(make-instance 'branch
119
:test (if (typep test 'regex)
122
:then-regex (copy-regex (then-regex branch))
123
:else-regex (copy-regex (else-regex branch)))))
125
(defmethod copy-regex ((lookbehind lookbehind))
126
(declare #.*standard-optimize-settings*)
127
(make-instance 'lookbehind
128
:regex (copy-regex (regex lookbehind))
129
:positivep (positivep lookbehind)
130
:len (len lookbehind)))
132
(defmethod copy-regex ((repetition repetition))
133
(declare #.*standard-optimize-settings*)
134
(make-instance 'repetition
135
:regex (copy-regex (regex repetition))
136
:greedyp (greedyp repetition)
137
:minimum (minimum repetition)
138
:maximum (maximum repetition)
139
:min-len (min-len repetition)
140
:len (len repetition)
141
:contains-register-p (contains-register-p repetition)))
143
(defmethod copy-regex ((register register))
144
(declare #.*standard-optimize-settings*)
145
(make-instance 'register
146
:regex (copy-regex (regex register))
148
:name (name register)))
150
(defmethod copy-regex ((standalone standalone))
151
(declare #.*standard-optimize-settings*)
152
(make-instance 'standalone
153
:regex (copy-regex (regex standalone))))
155
(defmethod copy-regex ((back-reference back-reference))
156
(declare #.*standard-optimize-settings*)
157
(make-instance 'back-reference
158
:num (num back-reference)
159
:case-insensitive-p (case-insensitive-p back-reference)))
161
(defmethod copy-regex ((char-class char-class))
162
(declare #.*standard-optimize-settings*)
163
(make-instance 'char-class
164
:test-function (test-function char-class)))
166
(defmethod copy-regex ((str str))
167
(declare #.*standard-optimize-settings*)
170
:case-insensitive-p (case-insensitive-p str)))
172
(defmethod copy-regex ((filter filter))
173
(declare #.*standard-optimize-settings*)
174
(make-instance 'filter
178
;;; Note that COPY-REGEX and REMOVE-REGISTERS could have easily been
179
;;; wrapped into one function. Maybe in the next release...
181
;;; Further note that this function is used by CONVERT to factor out
182
;;; complicated repetitions, i.e. cases like
183
;;; (a)* -> (?:a*(a))?
184
;;; This won't work for, say,
185
;;; ((a)|(b))* -> (?:(?:a|b)*((a)|(b)))?
186
;;; and therefore we stop REGISTER removal once we see an ALTERNATION.
188
(defgeneric remove-registers (regex)
189
(declare #.*standard-optimize-settings*)
190
(:documentation "Returns a deep copy of a REGEX (see COPY-REGEX) and
191
optionally removes embedded REGISTER objects if possible and if the
192
special variable REMOVE-REGISTERS-P is true."))
194
(defmethod remove-registers ((register register))
195
(declare #.*standard-optimize-settings*)
196
(declare (special remove-registers-p reg-seen))
197
(cond (remove-registers-p
198
(remove-registers (regex register)))
200
;; mark REG-SEEN as true so enclosing REPETITION objects
201
;; (see method below) know if they contain a register or not
203
(copy-regex register))))
205
(defmethod remove-registers ((repetition repetition))
206
(declare #.*standard-optimize-settings*)
208
(inner-regex (remove-registers (regex repetition))))
209
;; REMOVE-REGISTERS will set REG-SEEN (see method above) if
210
;; (REGEX REPETITION) contains a REGISTER
211
(declare (special reg-seen))
212
(make-instance 'repetition
214
:greedyp (greedyp repetition)
215
:minimum (minimum repetition)
216
:maximum (maximum repetition)
217
:min-len (min-len repetition)
218
:len (len repetition)
219
:contains-register-p reg-seen)))
221
(defmethod remove-registers ((standalone standalone))
222
(declare #.*standard-optimize-settings*)
223
(make-instance 'standalone
224
:regex (remove-registers (regex standalone))))
226
(defmethod remove-registers ((lookahead lookahead))
227
(declare #.*standard-optimize-settings*)
228
(make-instance 'lookahead
229
:regex (remove-registers (regex lookahead))
230
:positivep (positivep lookahead)))
232
(defmethod remove-registers ((lookbehind lookbehind))
233
(declare #.*standard-optimize-settings*)
234
(make-instance 'lookbehind
235
:regex (remove-registers (regex lookbehind))
236
:positivep (positivep lookbehind)
237
:len (len lookbehind)))
239
(defmethod remove-registers ((branch branch))
240
(declare #.*standard-optimize-settings*)
243
(make-instance 'branch
244
:test (if (typep test 'regex)
245
(remove-registers test)
247
:then-regex (remove-registers (then-regex branch))
248
:else-regex (remove-registers (else-regex branch)))))
250
(defmethod remove-registers ((alternation alternation))
251
(declare #.*standard-optimize-settings*)
252
(declare (special remove-registers-p))
253
;; an ALTERNATION, so we can't remove REGISTER objects further down
254
(setq remove-registers-p nil)
255
(copy-regex alternation))
257
(defmethod remove-registers ((regex regex))
258
(declare #.*standard-optimize-settings*)
261
(defmethod remove-registers ((seq seq))
262
(declare #.*standard-optimize-settings*)
264
:elements (mapcar #'remove-registers (elements seq))))
266
(defgeneric everythingp (regex)
267
(declare #.*standard-optimize-settings*)
268
(:documentation "Returns an EVERYTHING object if REGEX is equivalent
269
to this object, otherwise NIL. So, \"(.){1}\" would return true
270
\(i.e. the object corresponding to \".\", for example."))
272
(defmethod everythingp ((seq seq))
273
(declare #.*standard-optimize-settings*)
274
;; we might have degenerate cases like (:SEQUENCE :VOID ...)
275
;; due to the parsing process
276
(let ((cleaned-elements (remove-if #'(lambda (element)
277
(typep element 'void))
279
(and (= 1 (length cleaned-elements))
280
(everythingp (first cleaned-elements)))))
282
(defmethod everythingp ((alternation alternation))
283
(declare #.*standard-optimize-settings*)
284
(with-slots (choices)
286
(and (= 1 (length choices))
287
;; this is unlikely to happen for human-generated regexes,
288
;; but machine-generated ones might look like this
289
(everythingp (first choices)))))
291
(defmethod everythingp ((repetition repetition))
292
(declare #.*standard-optimize-settings*)
293
(with-slots (maximum minimum regex)
296
(= 1 minimum maximum)
297
;; treat "<regex>{1,1}" like "<regex>"
298
(everythingp regex))))
300
(defmethod everythingp ((register register))
301
(declare #.*standard-optimize-settings*)
302
(everythingp (regex register)))
304
(defmethod everythingp ((standalone standalone))
305
(declare #.*standard-optimize-settings*)
306
(everythingp (regex standalone)))
308
(defmethod everythingp ((everything everything))
309
(declare #.*standard-optimize-settings*)
312
(defmethod everythingp ((regex regex))
313
(declare #.*standard-optimize-settings*)
314
;; the general case for ANCHOR, BACK-REFERENCE, BRANCH, CHAR-CLASS,
315
;; LOOKAHEAD, LOOKBEHIND, STR, VOID, FILTER, and WORD-BOUNDARY
318
(defgeneric regex-length (regex)
319
(declare #.*standard-optimize-settings*)
320
(:documentation "Return the length of REGEX if it is fixed, NIL otherwise."))
322
(defmethod regex-length ((seq seq))
323
(declare #.*standard-optimize-settings*)
324
;; simply add all inner lengths unless one of them is NIL
325
(loop for sub-regex in (elements seq)
326
for len = (regex-length sub-regex)
327
if (not len) do (return nil)
330
(defmethod regex-length ((alternation alternation))
331
(declare #.*standard-optimize-settings*)
332
;; only return a true value if all inner lengths are non-NIL and
334
(loop for sub-regex in (choices alternation)
335
for old-len = nil then len
336
for len = (regex-length sub-regex)
338
(and old-len (/= len old-len))) do (return nil)
339
finally (return len)))
341
(defmethod regex-length ((branch branch))
342
(declare #.*standard-optimize-settings*)
343
;; only return a true value if both alternations have a length and
345
(let ((then-length (regex-length (then-regex branch))))
347
(eql then-length (regex-length (else-regex branch)))
350
(defmethod regex-length ((repetition repetition))
351
(declare #.*standard-optimize-settings*)
352
;; we can only compute the length of a REPETITION object if the
353
;; number of repetitions is fixed; note that we don't call
354
;; REGEX-LENGTH for the inner regex, we assume that the LEN slot is
355
;; always set correctly
356
(with-slots (len minimum maximum)
359
(eql minimum maximum))
363
(defmethod regex-length ((register register))
364
(declare #.*standard-optimize-settings*)
365
(regex-length (regex register)))
367
(defmethod regex-length ((standalone standalone))
368
(declare #.*standard-optimize-settings*)
369
(regex-length (regex standalone)))
371
(defmethod regex-length ((back-reference back-reference))
372
(declare #.*standard-optimize-settings*)
373
;; with enough effort we could possibly do better here, but
374
;; currently we just give up and return NIL
377
(defmethod regex-length ((char-class char-class))
378
(declare #.*standard-optimize-settings*)
381
(defmethod regex-length ((everything everything))
382
(declare #.*standard-optimize-settings*)
385
(defmethod regex-length ((str str))
386
(declare #.*standard-optimize-settings*)
389
(defmethod regex-length ((filter filter))
390
(declare #.*standard-optimize-settings*)
393
(defmethod regex-length ((regex regex))
394
(declare #.*standard-optimize-settings*)
395
;; the general case for ANCHOR, LOOKAHEAD, LOOKBEHIND, VOID, and
396
;; WORD-BOUNDARY (which all have zero-length)
399
(defgeneric regex-min-length (regex)
400
(declare #.*standard-optimize-settings*)
401
(:documentation "Returns the minimal length of REGEX."))
403
(defmethod regex-min-length ((seq seq))
404
(declare #.*standard-optimize-settings*)
405
;; simply add all inner minimal lengths
406
(loop for sub-regex in (elements seq)
407
for len = (regex-min-length sub-regex)
410
(defmethod regex-min-length ((alternation alternation))
411
(declare #.*standard-optimize-settings*)
412
;; minimal length of an alternation is the minimal length of the
413
;; "shortest" element
414
(loop for sub-regex in (choices alternation)
415
for len = (regex-min-length sub-regex)
418
(defmethod regex-min-length ((branch branch))
419
(declare #.*standard-optimize-settings*)
420
;; minimal length of both alternations
421
(min (regex-min-length (then-regex branch))
422
(regex-min-length (else-regex branch))))
424
(defmethod regex-min-length ((repetition repetition))
425
(declare #.*standard-optimize-settings*)
426
;; obviously the product of the inner minimal length and the minimal
427
;; number of repetitions
428
(* (minimum repetition) (min-len repetition)))
430
(defmethod regex-min-length ((register register))
431
(declare #.*standard-optimize-settings*)
432
(regex-min-length (regex register)))
434
(defmethod regex-min-length ((standalone standalone))
435
(declare #.*standard-optimize-settings*)
436
(regex-min-length (regex standalone)))
438
(defmethod regex-min-length ((char-class char-class))
439
(declare #.*standard-optimize-settings*)
442
(defmethod regex-min-length ((everything everything))
443
(declare #.*standard-optimize-settings*)
446
(defmethod regex-min-length ((str str))
447
(declare #.*standard-optimize-settings*)
450
(defmethod regex-min-length ((filter filter))
451
(declare #.*standard-optimize-settings*)
455
(defmethod regex-min-length ((regex regex))
456
(declare #.*standard-optimize-settings*)
457
;; the general case for ANCHOR, BACK-REFERENCE, LOOKAHEAD,
458
;; LOOKBEHIND, VOID, and WORD-BOUNDARY
461
(defgeneric compute-offsets (regex start-pos)
462
(declare #.*standard-optimize-settings*)
463
(:documentation "Returns the offset the following regex would have
464
relative to START-POS or NIL if we can't compute it. Sets the OFFSET
465
slot of REGEX to START-POS if REGEX is a STR. May also affect OFFSET
466
slots of STR objects further down the tree."))
468
;; note that we're actually only interested in the offset of
469
;; "top-level" STR objects (see ADVANCE-FN in the SCAN function) so we
470
;; can stop at variable-length alternations and don't need to descend
473
(defmethod compute-offsets ((seq seq) start-pos)
474
(declare #.*standard-optimize-settings*)
475
(loop for element in (elements seq)
476
;; advance offset argument for next call while looping through
478
for pos = start-pos then curr-offset
479
for curr-offset = (compute-offsets element pos)
481
finally (return curr-offset)))
483
(defmethod compute-offsets ((alternation alternation) start-pos)
484
(declare #.*standard-optimize-settings*)
485
(loop for choice in (choices alternation)
486
for old-offset = nil then curr-offset
487
for curr-offset = (compute-offsets choice start-pos)
488
;; we stop immediately if two alternations don't result in the
490
if (or (not curr-offset)
491
(and old-offset (/= curr-offset old-offset)))
493
finally (return curr-offset)))
495
(defmethod compute-offsets ((branch branch) start-pos)
496
(declare #.*standard-optimize-settings*)
497
;; only return offset if both alternations have equal value
498
(let ((then-offset (compute-offsets (then-regex branch) start-pos)))
500
(eql then-offset (compute-offsets (else-regex branch) start-pos))
503
(defmethod compute-offsets ((repetition repetition) start-pos)
504
(declare #.*standard-optimize-settings*)
505
;; no need to descend into the inner regex
506
(with-slots (len minimum maximum)
509
(eq minimum maximum))
510
;; fixed number of repetitions, so we know how to proceed
511
(+ start-pos (* minimum len))
512
;; otherwise return NIL
515
(defmethod compute-offsets ((register register) start-pos)
516
(declare #.*standard-optimize-settings*)
517
(compute-offsets (regex register) start-pos))
519
(defmethod compute-offsets ((standalone standalone) start-pos)
520
(declare #.*standard-optimize-settings*)
521
(compute-offsets (regex standalone) start-pos))
523
(defmethod compute-offsets ((char-class char-class) start-pos)
524
(declare #.*standard-optimize-settings*)
527
(defmethod compute-offsets ((everything everything) start-pos)
528
(declare #.*standard-optimize-settings*)
531
(defmethod compute-offsets ((str str) start-pos)
532
(declare #.*standard-optimize-settings*)
533
(setf (offset str) start-pos)
534
(+ start-pos (len str)))
536
(defmethod compute-offsets ((back-reference back-reference) start-pos)
537
(declare #.*standard-optimize-settings*)
538
;; with enough effort we could possibly do better here, but
539
;; currently we just give up and return NIL
540
(declare (ignore start-pos))
543
(defmethod compute-offsets ((filter filter) start-pos)
544
(declare #.*standard-optimize-settings*)
545
(let ((len (len filter)))
550
(defmethod compute-offsets ((regex regex) start-pos)
551
(declare #.*standard-optimize-settings*)
552
;; the general case for ANCHOR, LOOKAHEAD, LOOKBEHIND, VOID, and
553
;; WORD-BOUNDARY (which all have zero-length)