Coverage report: /home/ellis/comp/core/lib/obj/uri/mask.lisp
Kind | Covered | All | % |
expression | 217 | 580 | 37.4 |
branch | 31 | 46 | 67.4 |
Key
Not instrumented
Conditionalized out
Executed
Not executed
Both branches taken
One branch taken
Neither branch taken
1
;;; obj/uri/mask.lisp --- string character masks for parsing
7
;; To match sets of characters, the parser uses bit vectors constructed
8
;; from lists of characters.
9
;; The size of bit vectors are defined to check for characters in the
10
;; range 0 to 126 (~). We use location 0 and 1, which are never set by
11
;; any generated character list, as boolean.
13
(defparameter +uri-bit-vector-size+ 127)
14
;; The is the index at which we store the boolean: does this bitvector
15
;; allow `ucschar' (from the grammar)?
16
(defparameter +bitvector-index-ucschar+ 0)
17
;; The is the index at which we store the boolean: does this bitvector
18
;; allow `iprivate' (from the grammar)?
19
(defparameter +bitvector-index-iprivate+ 1)
21
(defun generate-character-list (char-start char-end)
22
;; Generate a list of characters between char-start and char-end,
23
;; inclusive of the start and end characters.
24
(when (>= (char-code char-start) (char-code char-end))
25
(error "char-start (~s) must come before char-end (~s)."
27
;; Make sure it doesn't index off the end of the array:
28
(when (>= (char-code char-end) +uri-bit-vector-size+)
29
(error "Illegal char-code (>= ~d)." +uri-bit-vector-size+))
30
(do* ((stop-code (1- (char-code char-start)))
31
(c (char-code char-end) (1- c))
34
(push (code-char c) res))))
36
(defmacro char-included-p (bit-vector char-code)
37
`(= 1 (sbit ,bit-vector ,char-code)))
39
(defmacro safe-char-included-p (bit-vector char-code)
42
`(let* ((,g-bv ,bit-vector)
45
(and (< ,g-cc +uri-bit-vector-size+)
46
(char-included-p ,g-bv ,g-cc))))))
48
(defun make-char-bitvector (chars &key except iri)
49
;; Return a bitvector which has a 1 for each character represented in
50
;; CHARS, where the index is the char-code of the character. If EXCEPT
51
;; is non-nil, it should be a list of characters to exclude.
53
;; If IRI is non-nil, it should be either :ucschar or :iprivate.
54
;; Since the first two bits of the bitvector returned by this function
55
;; are unused (those characters are invalid for URIs and IRIs), we use
56
;; those bits for IRI validation. During IRI character validation,
57
;; characters outside the ASCII range are validated with either ucscharp
58
;; or iprivatep. IRI mode is indicated by .iri-mode. having a non-nil
60
(do* ((a (make-array +uri-bit-vector-size+
61
:element-type 'bit :initial-element 0))
62
(chars chars (cdr chars))
63
(c (car chars) (car chars)))
66
;; set the booleans for this bitvector, used in .looking-at
68
(:ucschar (setf (sbit a #.+bitvector-index-ucschar+) 1))
69
(:iprivate (setf (sbit a #.+bitvector-index-iprivate+) 1))))
71
(if* (and except (member c except :test #'eq))
73
else (setf (sbit a (char-code c)) 1))))
75
;; Lists of characters used to make the bit vectors. These lists are
76
;; pretty much straight out of the grammars.
77
(defparameter *alpha-chars*
78
'#.(append (generate-character-list #\A #\Z)
79
(generate-character-list #\a #\z)))
81
(defparameter *digit-chars* '#.(generate-character-list #\0 #\9))
83
(defparameter *hexdig-chars*
85
'#.(generate-character-list #\A #\F)
86
'#.(generate-character-list #\a #\f)))
88
(defparameter *alphanum-chars* (append *alpha-chars* *digit-chars*))
89
(defparameter *alphanum+-chars* (append *alphanum-chars* '(#\-)))
91
(defparameter *sub-delims-chars* '(#\! #\$ #\& #\' #\( #\) #\* #\+ #\, #\; #\=))
93
(defparameter *unreserved-chars*
94
(append *alpha-chars* *digit-chars* '(#\- #\. #\_ #\~)))
96
(defparameter *pchar-chars*
97
(append *unreserved-chars* *sub-delims-chars* '(#\: #\@)))
99
;; used in pathname to URI conversion:
100
(defparameter *pchar/-chars* (append *pchar-chars* '(#\/)))
102
(defparameter *urn-nss-chars* (append *pchar-chars* '(#\/)))
104
(defparameter *segment-nz-nc-chars* ;; pchar w/o #\:
105
(append *unreserved-chars* *sub-delims-chars* '(#\@)))
107
(defparameter *query-strict-chars* (append *pchar-chars* '(#\/ #\?)))
108
(defparameter *urn-query-chars* (append *pchar-chars* '(#\/)))
109
(defparameter *fragment-strict-chars* (append *pchar-chars* '(#\/ #\?)))
111
(defparameter *ipvfuture-chars*
112
(append *unreserved-chars* *sub-delims-chars* '(#\:)))
114
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
116
(defparameter *alpha-bitvector* (make-char-bitvector *alpha-chars*))
117
(defparameter *digit-bitvector* (make-char-bitvector *digit-chars*))
118
(defparameter *alphanum-bitvector* (make-char-bitvector *alphanum-chars*))
119
(defparameter *alphanum+-bitvector* (make-char-bitvector *alphanum+-chars*))
120
(defparameter *hexdig-bitvector* (make-char-bitvector *hexdig-chars*))
121
(defparameter *pchar-bitvector* (make-char-bitvector *pchar-chars*
123
(defparameter *urn-nss-bitvector* (make-char-bitvector *urn-nss-chars*
125
(defparameter *unreserved-bitvector* (make-char-bitvector *unreserved-chars*
128
;; used in pathname to URI conversion:
129
(defparameter *pchar/-bitvector* (make-char-bitvector *pchar/-chars*
132
(defparameter *userinfo-bitvector*
134
(append *unreserved-chars* *sub-delims-chars* '(#\:))
137
(defparameter *reg-name-bitvector*
138
(make-char-bitvector (append *unreserved-chars* *sub-delims-chars*)
141
(defparameter *scheme-bitvector*
142
(make-char-bitvector (append *alpha-chars* *digit-chars* '(#\+ #\- #\.))))
144
(defparameter *query-bitvector-strict*
145
(make-char-bitvector *query-strict-chars*
148
(defparameter *query-bitvector-non-strict*
149
(make-char-bitvector (append *query-strict-chars*
151
;; Too many websites/tools use this in URLs
156
;; See discussion in rfe15844. Decoding the query should not touch percent
157
;; encodings of #\+, #\= and #\&, because those are interpreted by
158
;; another specification (HTTP).
160
(defparameter *decode-query-strict-chars*
161
(append *unreserved-chars*
162
;; Instead of *sub-delims-chars*, this (which is just like
163
;; *sub-delims-chars*, except for the commented out characters):
164
'(#\! #\$ #\' #\( #\) #\* #\, #\;
169
(defparameter *decode-query-bitvector-strict*
170
(make-char-bitvector *decode-query-strict-chars* :iri :iprivate))
172
(defparameter *decode-query-bitvector-non-strict*
174
(append *decode-query-strict-chars*
176
;; Too many websites/tools use this in URLs
181
(defparameter *fragment-bitvector-strict*
182
(make-char-bitvector *fragment-strict-chars* :iri :ucschar))
184
(defparameter *fragment-bitvector-non-strict*
186
(append *fragment-strict-chars*
188
;; Too many websites/tools use these in URLs
192
(defparameter *segment-nz-nc-bitvector*
193
(make-char-bitvector *segment-nz-nc-chars* :iri :ucschar))
195
(defparameter *urn-query-bitvector*
196
;; Not sure which to use, :ucschar or :iprivate. The universe will
197
;; probably end before anyone figures it out.
198
(make-char-bitvector *urn-query-chars* :iri :iprivate))
200
(defparameter *ipvfuture-bitvector*
201
(make-char-bitvector *ipvfuture-chars* :iri :ucschar))
203
;; The part of a URI that can have percent encoding:
210
(defun percent-decode-string (string allowed-bitvector)
211
;; Return a new string based on STRING which has all percent encoded
212
;; pairs (%xx) turned into real characters. If ALLOWED-BITVECTOR is
213
;; non-nil, only characters that `match' this bitvector are converted.
214
;; (declare (type string string))
216
(max (length string))
217
(new-string (make-string max))
221
;; (nyi! "was formerly a call to EXCL package") - shrinks new-string vector to fit size?
222
(remove #\Nul new-string))
223
(declare (fixnum i max new-i))
224
(if* (char= #\% (setq ch (schar string i)))
225
then (when (> (+ i 3) max)
226
(error "Unsyntactic percent encoding at ~d in ~s." i string))
227
(setq ch (schar string (incf i)))
228
(setq ch2 (schar string (incf i)))
229
(when (not (and (setq chc (digit-char-p ch 16))
230
(setq chc2 (digit-char-p ch2 16))))
232
"Non-hexidecimal digits after % at ~d in ~s."
234
(let ((ci (the fixnum
235
(+ (the fixnum (* 16 (the fixnum chc)))
236
(the fixnum chc2)))))
237
(declare (fixnum ci))
238
(if* (safe-char-included-p allowed-bitvector ci)
239
then ;; OK to convert
240
(setf (schar new-string new-i)
242
else ;; leave percent encoded
243
(setf (schar new-string new-i) #\%)
244
(setf (schar new-string (incf new-i)) ch)
245
(setf (schar new-string (incf new-i)) ch2)))
246
else (setf (schar new-string new-i) ch))))
248
;; This is experimental work in progress.
250
(defun percent-decode-utf8-string (string allowed-bitvector)
251
;; like percent-decode-string, but handle UTF-8 encoded sequences
252
;;;; chars 0..127 use allowed-bitvector
253
;;;; chars > 127 use RFC 3629 grammar
255
(max (length string))
256
(new-string (make-string max))
260
(vec (make-array 4 :element-type '(unsigned-byte 8)))
261
(temps (make-string 1 :element-type 'character))
264
(excl::.primcall 'sys::shrink-svector new-string new-i)
266
(declare (fixnum i max new-i veci)
267
(type (simple-array (unsigned-byte 8) (4)) vec)
268
(dynamic-extent vec))
270
((char= #\% (setq ch (schar string i)))
271
(when (> (+ i 3) max)
273
"Unsyntactic percent encoding at ~d in ~s." i string))
274
(setq ch (schar string (incf i)))
275
(setq ch2 (schar string (incf i)))
276
(when (not (and (setq chc (digit-char-p ch 16))
277
(setq chc2 (digit-char-p ch2 16))))
279
"Non-hexidecimal digits after % at ~d in ~s."
281
(let ((cc (the fixnum
282
(+ (the fixnum (* 16 (the fixnum chc)))
283
(the fixnum chc2)))))
284
(declare (fixnum cc))
286
((<= cc #.+uri-bit-vector-size+)
287
(if* (char-included-p allowed-bitvector cc)
288
then ;; OK to convert
289
(setf (schar new-string new-i)
291
else ;; leave percent encoded
292
(setf (schar new-string new-i) #\%)
293
(setf (schar new-string (incf new-i)) ch)
294
(setf (schar new-string (incf new-i)) ch2)))
296
;; check for valid UTF-8 encoding (from RFC 2234):
297
;;;; UTF8-octets = *( UTF8-char )
298
;;;; UTF8-char = UTF8-1 / UTF8-2 / UTF8-3 / UTF8-4
299
;;;; UTF8-1 = %x00-7F
300
;;;; UTF8-2 = %xC2-DF UTF8-tail
301
;;;; UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
302
;;;; %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
303
;;;; UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
304
;;;; %xF4 %x80-8F 2( UTF8-tail )
305
;;;; UTF8-tail = %x80-BF
306
;; We have a little FSM here. `state' can be one of:
307
;; :start :: looking for markers for UTF8-{2,3,4}
308
;; :utf8-3a :: have UTF8-3, read %E0, look for %xA0-BF
309
;; :utf8-3b :: have UTF8-3, read %ED, look for %x80-9F
310
;; :utf8-4a :: have UTF8-4, read %F0, look for %x90-BF
311
;; :utf8-4b :: have UTF8-4, read %F4, look for %x80-8F
312
;; :utf8-tail3 :: look for 3( UTF8-tail )
313
;; :utf8-tail2 :: look for 2( UTF8-tail )
314
;; :utf8-tail1 :: look for 1( UTF8-tail )
318
(if* (<= #xC2 cc #xDF)
319
then (setf (aref vec 0) cc)
321
(setq state :utf8-tail1)
324
then (setf (aref vec 0) cc)
326
(setq state :utf8-3a)
327
elseif (or (<= #xE1 cc #xEC)
329
then (setf (aref vec 0) cc)
331
(setq state :utf8-tail2)
333
then (setf (aref vec 0) cc)
335
(setq state :utf8-3b)
338
then (setf (aref vec 0) cc)
340
(setq state :utf8-4a)
341
elseif (<= #xF1 cc #xF3)
342
then (setf (aref vec 0) cc)
344
(setq state :utf8-tail3)
346
then (setf (aref vec 0) cc)
348
(setq state :utf8-4b)
349
else (excl::.parse-error
351
"invalid UTF-8 encoding...FIXME")))
353
(if* (<= #xA0 cc #xBF)
354
then (setf (aref vec veci) cc)
356
(setq state :utf8-tail1)
357
else (error "invalid UTF8-3 2nd byte: ~x" cc)))
359
(if* (<= #x80 cc #x9F)
360
then (setf (aref vec veci) cc)
362
(setq state :utf8-tail3)
363
else (error "invalid UTF8-3 2nd byte: ~x" cc)))
365
(if* (<= #x90 cc #xBF)
366
then (setf (aref vec veci) cc)
368
(setq state :utf8-tail2)
369
else (error "invalid UTF8-4 2nd byte: ~x" cc)))
371
(if* (<= #x80 cc #x8F)
372
then (setf (aref vec veci) cc)
374
(setq state :utf8-tail2)
375
else (error "invalid UTF8-4 2nd byte: ~x" cc)))
377
(if* (<= #x80 cc #xBF)
378
then (setf (aref vec veci) cc)
380
(setq state :utf8-tail2)))
382
(if* (<= #x80 cc #xBF)
383
then (setf (aref vec veci) cc)
385
(setq state :utf8-tail1)))
387
(if* (<= #x80 cc #xBF)
388
then (setf (aref vec veci) cc)
391
(octets-to-string vec :external-format :utf-8
392
:end veci :string temps)
393
(setf (schar new-string new-i) (char temps 0)))
394
(t (error "internal error: bad state: ~s" state)))))))
397
(setf (schar new-string new-i) ch)))))
399
(defun percent-encode-string (string allowed-bitvector)
400
;; Return a new string based on STRING which has all characters which do
401
;; not match ALLOWED-BITVECTOR converted into percent encoded pairs (%xx).
402
;; Percent-encoded pairs in the string are skipped over, as it is assumed
403
;; they were required to be encoded.
405
;; Make a string as big as it possibly needs to be (3 times the original
406
;; size), and truncate it at the end.
407
;; (declare (type string string))
408
(declare (optimize (safety 1)))
409
;;(declare (:explain :calls :types))
410
(do* ((hexchars ;; RFC 3986 section 2.1 says use upper case:
412
(pct (char-code #\%))
413
(max (length string))
414
(new-max (* 3 max)) ;; worst case new size
415
(new-string (make-string new-max))
418
(ci ;; so the fixnum decl is true:
422
;; is it safe to delete all 0 chars here?
423
;; (nyi! "was previously a call to EXCL")
424
(remove #\Nul new-string))
425
(declare (fixnum pct max new-max i new-i ci))
426
(setq ci (char-code (setq c (schar string i))))
427
(if* (or (= ci pct) ;; skip %'s
428
(safe-char-included-p allowed-bitvector ci))
431
(setf (schar new-string new-i) c)
432
else ;; need to escape it
433
(let ((d1 (ash ci -4))
434
(d2 (logand ci #xf)))
435
(declare (fixnum d1 d2))
436
(setf (schar new-string (incf new-i)) #\%)
437
(setf (schar new-string (incf new-i)) (schar hexchars d1))
438
(setf (schar new-string (incf new-i)) (schar hexchars d2))))))
440
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
442
;; For efficiency, we do as few subseq's as possible. To achieve this, we
443
;; return, from various parser functions, the start/end pair encoded into a
444
;; fixnum. This means the limit for a URI string is limited to 16384 on a
445
;; 32-bit Lisp. It appears from searches that this is well above the
446
;; accepted maximum for URI strings.
449
;; The max array index is 1/2 of the available fixnum range.
450
(defparameter +uri-max-string-length+
451
#.(expt 2 (truncate (integer-length most-positive-fixnum) 2)))
452
(defparameter +uri-pack-shift+
453
#.(truncate (integer-length most-positive-fixnum) 2))
454
(defparameter +uri-unpack-shift+
455
#.(- (truncate (integer-length most-positive-fixnum) 2)))
456
(defparameter +uri-unpack-mask+
457
#.(1- (ash 1 (truncate (integer-length most-positive-fixnum) 2))))
459
;; This is used as a marker for the null string. It must be a fixnum
460
;; that can't be returned as an index into a string.
461
(defparameter *uri-null-marker* -1)
464
(defun check-xri-string (string)
466
;; 1. STRING is a simple string, and
467
;; 2. Two indices into STRING can packed into a single fixnum.
468
;; This is what xsubseq/val do.
470
(error "string must be a simple string."))
471
(or (< (length string) #.+uri-max-string-length+)
472
(error "string is larger than ~d characters."
473
#.+uri-max-string-length+)))
475
(defun xsubseq (start end)
476
;; Encode START and END into a fixnum.
477
(declare (fixnum start end) (optimize (safety 1)))
480
(ash end +uri-pack-shift+)))))
482
(defun xval (string i)
483
;; Return the subsequence of STRING given by I, which was encoded with
485
(declare (type (or fixnum null) i) (optimize (safety 1)))
488
((= i *uri-null-marker*) "")
489
(t (let ((start (the fixnum (logand i +uri-unpack-mask+)))
491
(ash i +uri-unpack-shift+))))
492
(declare (fixnum start end))
493
(if* (simple-string-p string)
494
then ;; This is a good bit faster than calling subseq
495
(do* ((len (the fixnum (- end start)))
496
(res (make-string len))
497
(src-index start (the fixnum (1+ src-index)))
498
(dst-index 0 (the fixnum (1+ dst-index))))
499
((= src-index end) res)
500
(declare (fixnum len src-index dst-index))
501
(setf (schar res dst-index) (schar string src-index)))
502
else (subseq string start end)))))))
504
(defun at-end-p (i end)
505
;; return T if index I is beyond the END of the string
508
;; This macro is very specialized and not hygenic. It is built for pure
510
(defmacro .looking-at (simple thing string index end char-equal)
511
;; INDEX and END are declared FIXNUM by our caller.
512
;; SIMPLE-STRING-P and SCHAR are much faster than STRINGP and CHAR.
513
;; For the details of what this function returns, see looking-at below.
514
(let ((stringp (if simple 'simple-string-p 'stringp))
515
(schar (if simple 'schar 'char))
517
(length (if simple 'sequence:length 'length))
524
(declare (fixnum ,len))
525
(if* (at-end-p ,index ,end)
527
elseif (characterp ,thing)
528
then ;; In this case, we ignore CHAR-EQUAL and always do the
529
;; character comparison with CHAR= (case sensitively).
530
(when (char= ,thing (,schar ,string ,index))
531
(the fixnum (1+ ,index)))
532
elseif (,stringp ,thing)
533
then (when (not (at-end-p
535
(setq ,len (the fixnum (,length ,thing))))
537
(do* ((,i ,index (the fixnum (1+ ,i)))
538
(,j 0 (the fixnum (1+ ,j)))
539
(,x ,len (the fixnum (1- ,x))))
540
((= 0 ,x) (+ ,index ,len))
541
(declare (fixnum ,i ,j ,x))
543
then (when (not (char-equal (,schar ,string ,i)
546
else (when (not (char= (,schar ,string ,i)
549
elseif (simple-bit-vector-p ,thing) ;; a LOT faster than bit-vector-p
550
then (let ((,c (char-code (,schar ,string ,index))))
551
(if* (< ,c +uri-bit-vector-size+)
552
then (when (char-included-p ,thing ,c)
553
(the fixnum (1+ ,index)))
554
elseif (and %iri-mode
556
;; If the ucschar or iprivate booleans are set,
557
;; then check for characters in those ranges.
558
(and (= 1 (sbit ,thing #.+bitvector-index-ucschar+))
560
(and (= 1 (sbit ,thing #.+bitvector-index-iprivate+))
562
then (the fixnum (1+ ,index))))
563
else (error "bad object: ~s." ,thing)))))
565
(defun ucscharp (code)
566
(declare (type fixnum code) (optimize (safety 1)))
567
;; This is straight from the grammer in RFC 3987, for ucschar.
568
(or (<= #x000A0 code #x0D7FF)
569
(<= #x0F900 code #x0FDCF)
570
(<= #x0FDF0 code #x0FFEF)
571
(<= #x10000 code #x1FFFD)
572
(<= #x20000 code #x2FFFD)
573
(<= #x30000 code #x3FFFD)
574
(<= #x40000 code #x4FFFD)
575
(<= #x50000 code #x5FFFD)
576
(<= #x60000 code #x6FFFD)
577
(<= #x70000 code #x7FFFD)
578
(<= #x80000 code #x8FFFD)
579
(<= #x90000 code #x9FFFD)
580
(<= #xA0000 code #xAFFFD)
581
(<= #xB0000 code #xBFFFD)
582
(<= #xC0000 code #xCFFFD)
583
(<= #xD0000 code #xDFFFD)
584
(<= #xE1000 code #xEFFFD)))
586
(defun iprivatep (code)
587
(declare (fixnum code) (optimize (safety 1)))
588
;; This is straight from the grammer in RFC 3987, for iprivate.
589
(or (<= #x00E000 code #x00F8FF)
590
(<= #x0F0000 code #x0FFFFD)
591
(<= #x100000 code #x10FFFD)))
593
;; Future optimization from rfr:
594
;; If THING is going to be a string very often,
595
;; then you might get a useful speed improvement by splitting this
596
;; again based on char-equal true/false. As it is, you're generating
597
;; code in .looking-at that checks the char-equal argument on every
599
(defun looking-at (thing string index end
600
;; optional because it is rarely given
601
&optional char-equal)
602
;; Return a new index into the parse buffer (STRING), if
603
;; an object equivalent to THING exists at index INDEX.
605
;; - bit vector: if a bit vector, then check that at character
606
;; code index for it, there is a `1'
607
;; - string: check that the string is in STRING starting at INDEX
608
;; - character: check that the character is in STRING starting at
610
;; If CHAR-EQUAL is non-nil, then do character comparisons
611
;; case insensitively with CHAR-EQUAL.
612
(declare (type fixnum index end) (optimize (safety 1)))
613
;; The simple-string version is much faster, so this is worth the
616
;; NOTE: .looking-at takes ONLY symbols. The macro is not hygenic.
617
(if* (simple-string-p string)
618
then (.looking-at t thing string index end char-equal)
619
else (.looking-at nil thing string index end char-equal)))
621
(defun scan-forward (string start end bitvector
623
;; Scan STRING using BITVECTOR for matching, starting from position
624
;; START, and going no farther than END.
625
;; Return the index of the first non-matching character, or nil if no
626
;; characters matched.
628
;; If BITVECTOR does not match, then call FUNC with three arguments
629
;; (STRING, <index>, and END). If the FUNC returns nil, then scanning
630
;; terminates and this function returns <index>, if it is > START.
631
(declare (type fixnum start end)
632
(type (or function null) func)
633
(optimize (safety 1)))
642
((looking-at bitvector string i end)
647
(if* (setq new-i (funcall func string i end))
648
then ;; FUNC return non-nil, advance I and keep going...
650
else ;; FUNC return NIL, we're done
652
then ;; Nothing matched => NIL:
654
else ;; Something matched => first index that didn't:
657
;; BITVECTOR didn't match. We're done.
659
then ;; Nothing matched:
661
else ;; Something matched, first index that didn't: