Coverage report: /home/ellis/.stash/quicklisp/dists/ultralisp/software/edicl-flexi-streams-20240429143708/decode.lisp

KindCoveredAll%
expression6295 2.0
branch136 2.8
Key
Not instrumented
Conditionalized out
Executed
Not executed
 
Both branches taken
One branch taken
Neither branch taken
1
 ;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: FLEXI-STREAMS; Base: 10 -*-
2
 ;;; $Header: /usr/local/cvsrep/flexi-streams/decode.lisp,v 1.35 2008/08/26 10:59:22 edi Exp $
3
 
4
 ;;; Copyright (c) 2005-2008, Dr. Edmund Weitz.  All rights reserved.
5
 
6
 ;;; Redistribution and use in source and binary forms, with or without
7
 ;;; modification, are permitted provided that the following conditions
8
 ;;; are met:
9
 
10
 ;;;   * Redistributions of source code must retain the above copyright
11
 ;;;     notice, this list of conditions and the following disclaimer.
12
 
13
 ;;;   * Redistributions in binary form must reproduce the above
14
 ;;;     copyright notice, this list of conditions and the following
15
 ;;;     disclaimer in the documentation and/or other materials
16
 ;;;     provided with the distribution.
17
 
18
 ;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
19
 ;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20
 ;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21
 ;;; ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
22
 ;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23
 ;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
24
 ;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25
 ;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26
 ;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27
 ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28
 ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
 
30
 (in-package :flexi-streams)
31
 
32
 (defun recover-from-encoding-error (external-format format-control &rest format-args)
33
   "Helper function used by OCTETS-TO-CHAR-CODE below to deal with
34
 encoding errors.  Checks if *SUBSTITUTION-CHAR* is not NIL and returns
35
 its character code in this case.  Otherwise signals an
36
 EXTERNAL-FORMAT-ENCODING-ERROR as determined by the arguments to this
37
 function and provides a corresponding USE-VALUE restart."
38
   (when *substitution-char*
39
     (return-from recover-from-encoding-error (char-code *substitution-char*)))
40
   (restart-case
41
       (apply #'signal-encoding-error external-format format-control format-args)
42
     (use-value (char)
43
       :report "Specify a character to be used instead."
44
       :interactive (lambda ()
45
                      (loop
46
                       (format *query-io* "Type a character: ")
47
                       (let ((line (read-line *query-io*)))
48
                         (when (= 1 (length line))
49
                           (return (list (char line 0)))))))
50
       (char-code char))))
51
 
52
 (defgeneric octets-to-char-code (format reader)
53
   (declare #.*standard-optimize-settings*)
54
   (:documentation "Converts a sequence of octets to a character code
55
 \(which is returned, or NIL in case of EOF) using the external format
56
 FORMAT.  The sequence is obtained by calling the function \(which must
57
 be a functional object) READER with no arguments which should return
58
 one octet per call.  In the case of EOF, READER should return NIL.
59
 
60
 The special variable *CURRENT-UNREADER* must be bound correctly
61
 whenever this function is called."))
62
 
63
 (defgeneric octets-to-string* (format sequence start end)
64
   (declare #.*standard-optimize-settings*)
65
   (:documentation "A generic function which dispatches on the external
66
 format and does the real work for OCTETS-TO-STRING."))
67
 
68
 (defmethod octets-to-string* :around (format (list list) start end)
69
   (declare #.*standard-optimize-settings*)
70
   (octets-to-string* format (coerce list 'vector) start end))
71
 
72
 (defmacro define-sequence-readers ((format-class) &body body)
73
   "Non-hygienic utility macro which defines methods for READ-SEQUENCE*
74
 and OCTETS-TO-STRING* for the class FORMAT-CLASS.  BODY is described
75
 in the docstring of DEFINE-CHAR-ENCODERS but can additionally contain
76
 a form \(UNGET <form>) which has to be replaced by the correct code to
77
 `unread' the octets for the character designated by <form>."
78
   (let* ((body `((block char-decoder
79
                    (locally
80
                      (declare #.*fixnum-optimize-settings*)
81
                      ,@body)))))
82
     `(progn
83
        (defmethod read-sequence* ((format ,format-class) flexi-input-stream sequence start end)
84
          (with-accessors ((position flexi-stream-position)
85
                           (bound flexi-stream-bound)
86
                           (octet-stack flexi-stream-octet-stack)
87
                           (last-octet flexi-stream-last-octet)
88
                           (last-char-code flexi-stream-last-char-code)
89
                           (stream flexi-stream-stream))
90
              flexi-input-stream
91
            (let* (buffer
92
                   (buffer-pos 0)
93
                   (buffer-end 0)
94
                   (index start)
95
                   donep
96
                   ;; whether we will later be able to rewind the stream if
97
                   ;; needed (to get rid of unused octets in the buffer)
98
                   (can-rewind-p (maybe-rewind stream 0))
99
                   (factor (encoding-factor format))
100
                   (integer-factor (floor factor))
101
                   ;; it's an interesting question whether it makes sense
102
                   ;; performance-wise to make RESERVE significantly bigger
103
                   ;; (and thus put potentially a lot more octets into
104
                   ;; OCTET-STACK), especially for UTF-8
105
                   (reserve (cond ((or (not (floatp factor))
106
                                       (not can-rewind-p)) 0)
107
                                  (t (ceiling (* (- factor integer-factor) (- end start)))))))
108
              (declare (fixnum buffer-pos buffer-end index integer-factor reserve)
109
                       (boolean can-rewind-p))
110
              (flet ((compute-fill-amount ()
111
                       "Computes the amount of octets we can savely read into
112
 the buffer without violating the stream's bound \(if there is one) and
113
 without potentially reading much more than we need \(unless we can
114
 rewind afterwards)."
115
                       (let ((minimum (min (the fixnum (+ (the fixnum (* integer-factor
116
                                                                         (the fixnum (- end index))))
117
                                                          reserve))
118
                                           +buffer-size+)))
119
                         (cond (bound (min minimum (- bound position)))
120
                               (t minimum))))
121
                     (fill-buffer (end)
122
                       "Tries to fill the buffer from BUFFER-POS to END and
123
 returns NIL if the buffer doesn't contain any new data."
124
                       (when donep
125
                         (return-from fill-buffer nil))
126
                       ;; put data from octet stack into buffer if there is any
127
                       (loop
128
                        (when (>= buffer-pos end)
129
                          (return))
130
                        (let ((next-octet (pop octet-stack)))
131
                          (cond (next-octet
132
                                 (setf (aref (the (array octet *) buffer) buffer-pos) (the octet next-octet))
133
                                 (incf buffer-pos))
134
                                (t (return)))))
135
                       (setq buffer-end (read-sequence buffer stream
136
                                                       :start buffer-pos
137
                                                       :end end))
138
                       ;; we reached EOF, so we remember this
139
                       (when (< buffer-end end)
140
                         (setq donep t))
141
                       ;; BUFFER-POS is only greater than zero if the buffer
142
                       ;; already contains unread data from the octet stack
143
                       ;; (see below), so we test for ZEROP here and do /not/
144
                       ;; compare with BUFFER-POS
145
                       (unless (zerop buffer-end)
146
                         (incf position buffer-end))))
147
                (let ((minimum (compute-fill-amount)))
148
                  (declare (fixnum minimum))
149
                  (setq buffer (make-octet-buffer minimum))
150
                  ;; fill buffer for the first time or return immediately if
151
                  ;; we don't succeed
152
                  (unless (fill-buffer minimum)
153
                    (return-from read-sequence* start)))
154
                (setq buffer-pos 0)
155
                (macrolet ((iterate (set-place)
156
                             "A very unhygienic macro to implement the
157
 actual iteration through the sequence including housekeeping for the
158
 flexi stream.  SET-PLACE is the place \(using the index INDEX) used to
159
 access the sequence."
160
                             `(flet ((leave ()
161
                                       "This is the function used to
162
 abort the LOOP iteration below."
163
                                       (when (> index start)
164
                                         (setq last-octet nil
165
                                               last-char-code ,(sublis '((index . (1- index))) set-place)))
166
                                       (return-from read-sequence* index)))
167
                                (loop
168
                                 (when (>= index end)
169
                                   ;; check if there are octets in the
170
                                   ;; buffer we didn't use - see
171
                                   ;; COMPUTE-FILL-AMOUNT above
172
                                   (let ((rest (- buffer-end buffer-pos)))
173
                                     (when (plusp rest)
174
                                       (or (and can-rewind-p
175
                                                (maybe-rewind stream rest))
176
                                           (loop
177
                                            (when (>= buffer-pos buffer-end)
178
                                              (return))
179
                                            (decf buffer-end)
180
                                            (push (aref (the (array octet *) buffer) buffer-end)
181
                                                  octet-stack)))))
182
                                   (leave))
183
                                 (let ((next-char-code
184
                                        (progn (symbol-macrolet
185
                                                   ((octet-getter
186
                                                     ;; this is the code to retrieve the next octet (or
187
                                                     ;; NIL) and to fill the buffer if needed
188
                                                     (block next-octet
189
                                                       (when (>= buffer-pos buffer-end)
190
                                                         (setq buffer-pos 0)
191
                                                         (unless (fill-buffer (compute-fill-amount))
192
                                                           (return-from next-octet)))
193
                                                       (prog1
194
                                                           (aref (the (array octet *) buffer) buffer-pos)
195
                                                         (incf buffer-pos)))))
196
                                                 (macrolet ((unget (form)
197
                                                              `(unread-char% ,form flexi-input-stream)))
198
                                                   ,',@body)))))
199
                                   (unless next-char-code
200
                                     (leave))
201
                                   (setf ,set-place (code-char next-char-code))
202
                                   (incf index))))))
203
                  (etypecase sequence
204
                    (string (iterate (char sequence index)))
205
                    (array (iterate (aref sequence index)))
206
                    (list (iterate (nth index sequence)))))))))
207
        (defmethod octets-to-string* ((format ,format-class) sequence start end)
208
          (declare #.*standard-optimize-settings*)
209
          (declare (fixnum start end))
210
          (let* ((i start)
211
                 (string-length (compute-number-of-chars format sequence start end))
212
                 (string (make-array string-length :element-type 'char*)))
213
            (declare (fixnum i string-length))
214
            (loop for j of-type fixnum from 0 below string-length
215
                  do (setf (schar string j)
216
                           (code-char (macrolet ((unget (form)
217
                                                   `(decf i (character-length format ,form))))
218
                                        (symbol-macrolet ((octet-getter (and (< i end)
219
                                                                             (prog1
220
                                                                                 (the octet (aref sequence i))
221
                                                                               (incf i)))))
222
                                          ,@body))))
223
                  finally (return string)))))))
224
 
225
 (defmacro define-char-decoders ((lf-format-class cr-format-class crlf-format-class) &body body)
226
   "Non-hygienic utility macro which defines several decoding-related
227
 methods for the classes LF-FORMAT-CLASS, CR-FORMAT-CLASS, and
228
 CRLF-FORMAT-CLASS where it is assumed that CR-FORMAT-CLASS is the same
229
 encoding as LF-FORMAT-CLASS but with CR instead of LF line endings and
230
 similar for CRLF-FORMAT-CLASS, i.e. LF-FORMAT-CLASS is the base class.
231
 BODY is a code template for the code to read octets and return one
232
 character code.  BODY must contain a symbol OCTET-GETTER representing
233
 the form which is used to obtain the next octet."
234
   (let* ((body (with-unique-names (char-code)
235
                  `((let ((,char-code (progn ,@body)))
236
                      (when (and ,char-code
237
                                 (or (<= #xd8 (logand* #x00ff (ash* ,char-code -8)) #xdf)
238
                                     (> ,char-code #x10ffff)))
239
                        (recover-from-encoding-error format "Illegal code point ~A \(#x~:*~X)." ,char-code))
240
                      ,char-code)))))
241
     `(progn
242
        (defmethod octets-to-char-code ((format ,lf-format-class) reader)
243
          (declare #.*fixnum-optimize-settings*)
244
          (declare (function reader))
245
          (symbol-macrolet ((octet-getter (funcall reader)))
246
            ,@(sublis '((char-decoder . octets-to-char-code))
247
                      body)))
248
        (define-sequence-readers (,lf-format-class) ,@body)
249
        (define-sequence-readers (,cr-format-class)
250
          ,(with-unique-names (char-code)
251
             `(let ((,char-code (progn ,@body)))
252
                (case ,char-code
253
                  (#.+cr+ #.(char-code #\Newline))
254
                  (otherwise ,char-code)))))
255
        (define-sequence-readers  (,crlf-format-class)
256
          ,(with-unique-names (char-code next-char-code get-char-code)
257
             `(flet ((,get-char-code () ,@body))
258
                (let ((,char-code (,get-char-code)))
259
                  (case ,char-code
260
                    (#.+cr+
261
                     (let ((,next-char-code (,get-char-code)))
262
                       (case ,next-char-code
263
                         (#.+lf+ #.(char-code #\Newline))
264
                         ;; we saw a CR but no LF afterwards, but then the data
265
                         ;; ended, so we just return #\Return
266
                         ((nil) +cr+)
267
                         ;; if the character we peeked at wasn't a
268
                         ;; linefeed character we unread its constituents
269
                         (otherwise (unget (code-char ,next-char-code))
270
                                    ,char-code))))
271
                    (otherwise ,char-code)))))))))
272
 
273
 (define-char-decoders (flexi-latin-1-format flexi-cr-latin-1-format flexi-crlf-latin-1-format)
274
   octet-getter)
275
 
276
 (define-char-decoders (flexi-ascii-format flexi-cr-ascii-format flexi-crlf-ascii-format)
277
   (when-let (octet octet-getter)
278
     (if (> (the octet octet) 127)
279
       (recover-from-encoding-error format
280
                                    "No character which corresponds to octet #x~X." octet)
281
       octet)))
282
 
283
 (define-char-decoders (flexi-8-bit-format flexi-cr-8-bit-format flexi-crlf-8-bit-format)
284
   (with-accessors ((decoding-table external-format-decoding-table))
285
       format
286
     (when-let (octet octet-getter)
287
       (let ((char-code (aref (the (simple-array char-code-integer *) decoding-table)
288
                              (the octet octet))))
289
         (if (or (null char-code)
290
                 (= (the char-code-integer char-code) 65533))
291
           (recover-from-encoding-error format
292
                                        "No character which corresponds to octet #x~X." octet)
293
           char-code)))))
294
 
295
 (define-char-decoders (flexi-utf-8-format flexi-cr-utf-8-format flexi-crlf-utf-8-format)
296
   (let (first-octet-seen)
297
     (declare (boolean first-octet-seen))
298
     (macrolet ((read-next-byte ()
299
                  '(prog1
300
                       (or octet-getter
301
                           (cond (first-octet-seen
302
                                  (return-from char-decoder
303
                                    (recover-from-encoding-error format
304
                                                                 "End of data while in UTF-8 sequence.")))
305
                                 (t (return-from char-decoder nil))))
306
                     (setq first-octet-seen t))))
307
       (flet ((recover-from-overlong-sequence (value)
308
                (restart-case
309
                    (recover-from-encoding-error format "`Overlong' UTF-8 sequence for code point #x~X."
310
                                                 value)                 
311
                  (accept-overlong-sequence ()
312
                    :report "Accept the code point and continue."
313
                    value))))
314
         (let ((octet (read-next-byte)))
315
           (declare (type octet octet))
316
           (block utf-8-sequence
317
             (multiple-value-bind (start count)
318
                 (cond ((not (logbitp 7 octet))
319
                        ;; avoid the overlong checks below
320
                        (return-from utf-8-sequence octet))
321
                       ((= #b11000000 (logand* octet #b11100000))
322
                        (values (logand* octet #b00011111) 1))
323
                       ((= #b11100000 (logand* octet #b11110000))
324
                        (values (logand* octet #b00001111) 2))
325
                       ((= #b11110000 (logand* octet #b11111000))
326
                        (values (logand* octet #b00000111) 3))
327
                       (t (return-from char-decoder
328
                            (recover-from-encoding-error format
329
                                                         "Unexpected value #x~X at start of UTF-8 sequence."
330
                                                         octet))))
331
               (declare (fixnum count))
332
               (loop for result of-type code-point
333
                     = start then (+ (ash* result 6)
334
                                     (logand* octet #b111111))
335
                     repeat count
336
                     for octet of-type octet = (read-next-byte)
337
                     unless (= #b10000000 (logand* octet #b11000000))
338
                     do (return-from char-decoder
339
                          (recover-from-encoding-error format
340
                                                       "Unexpected value #x~X in UTF-8 sequence." octet))
341
                     finally (return (cond ((< result (ecase count
342
                                                        (1 #x00080)
343
                                                        (2 #x00800)
344
                                                        (3 #x10000)))
345
                                            (recover-from-overlong-sequence result))
346
                                           (t result)))))))))))
347
 
348
 (define-char-decoders (flexi-utf-16-le-format flexi-cr-utf-16-le-format flexi-crlf-utf-16-le-format)
349
   (let (first-octet-seen)
350
     (declare (boolean first-octet-seen))
351
     (macrolet ((read-next-byte ()
352
                  '(prog1
353
                       (or octet-getter
354
                           (cond (first-octet-seen
355
                                  (return-from char-decoder
356
                                    (recover-from-encoding-error format
357
                                                                 "End of data while in UTF-16 sequence.")))
358
                                 (t (return-from char-decoder nil))))
359
                     (setq first-octet-seen t))))
360
       (flet ((read-next-word ()
361
                (+ (the octet (read-next-byte))
362
                   (ash* (the octet (read-next-byte)) 8))))
363
         (declare (inline read-next-word))
364
         (let ((word (read-next-word)))
365
           (declare (type (unsigned-byte 16) word))
366
           (cond ((<= #xd800 word #xdfff)
367
                  (let ((next-word (read-next-word)))
368
                    (declare (type (unsigned-byte 16) next-word))
369
                    (unless (<= #xdc00 next-word #xdfff)
370
                      (return-from char-decoder
371
                        (recover-from-encoding-error format
372
                                                     "Unexpected UTF-16 word #x~X following #x~X."
373
                                                     next-word word)))
374
                    (+ (ash* (logand* #b1111111111 word) 10)
375
                       (logand* #b1111111111 next-word)
376
                       #x10000)))
377
                 (t word)))))))
378
 
379
 (define-char-decoders (flexi-utf-16-be-format flexi-cr-utf-16-be-format flexi-crlf-utf-16-be-format)
380
   (let (first-octet-seen)
381
     (declare (boolean first-octet-seen))
382
     (macrolet ((read-next-byte ()
383
                  '(prog1
384
                       (or octet-getter
385
                           (cond (first-octet-seen
386
                                  (return-from char-decoder
387
                                    (recover-from-encoding-error format
388
                                                                 "End of data while in UTF-16 sequence.")))
389
                                 (t (return-from char-decoder nil))))
390
                     (setq first-octet-seen t))))
391
       (flet ((read-next-word ()
392
                (+ (ash* (the octet (read-next-byte)) 8)
393
                   (the octet (read-next-byte)))))
394
         (declare (inline read-next-word))
395
         (let ((word (read-next-word)))
396
           (declare (type (unsigned-byte 16) word))
397
           (cond ((<= #xd800 word #xdfff)
398
                  (let ((next-word (read-next-word)))
399
                    (declare (type (unsigned-byte 16) next-word))
400
                    (unless (<= #xdc00 next-word #xdfff)
401
                      (return-from char-decoder
402
                        (recover-from-encoding-error format
403
                                                     "Unexpected UTF-16 word #x~X following #x~X."
404
                                                     next-word word)))
405
                    (+ (ash* (logand* #b1111111111 word) 10)
406
                       (logand* #b1111111111 next-word)
407
                       #x10000)))
408
                 (t word)))))))
409
 (define-char-decoders (flexi-gbk-format flexi-cr-gbk-format flexi-crlf-gbk-format)
410
   (when-let (octet octet-getter)
411
     (cond ((<= (the octet octet) #x7f) octet)
412
           ((=  (the octet octet) #x80) #x20ac)
413
           ((=  (the octet octet) #xff) #xf8f5)
414
           (t (let ((next-byte octet-getter))
415
                (if (null next-byte)
416
                  (recover-from-encoding-error format
417
                                               "End of data while in GBK sequence.")
418
                  (let ((word (+ (ash* (the octet octet) 8)
419
                                 (the octet next-byte))))
420
                    (declare (type (unsigned-byte 16) word))
421
                    (let ((octet (or (get-multibyte-mapper *gbk-to-ucs-special-table* word)
422
                                     (get-multibyte-mapper *gbk-to-ucs-table* word))))
423
                      (if octet
424
                        octet
425
                        (recover-from-encoding-error format
426
                                                     "No character which corresponds to octet #x~X."
427
                                                     word))))))))))
428
 
429
 (define-char-decoders (flexi-utf-32-le-format flexi-cr-utf-32-le-format flexi-crlf-utf-32-le-format)
430
   (let (first-octet-seen)
431
     (declare (boolean first-octet-seen))
432
     (macrolet ((read-next-byte ()
433
                  '(prog1
434
                       (or octet-getter
435
                           (cond (first-octet-seen
436
                                  (return-from char-decoder
437
                                    (recover-from-encoding-error format
438
                                                                 "End of data while in UTF-32 sequence.")))
439
                                 (t (return-from char-decoder nil))))
440
                     (setq first-octet-seen t))))
441
       (loop for count of-type fixnum from 0 to 24 by 8
442
             for octet of-type octet = (read-next-byte)
443
             sum (ash* octet count)))))
444
 
445
 (define-char-decoders (flexi-utf-32-be-format flexi-cr-utf-32-be-format flexi-crlf-utf-32-be-format)
446
   (let (first-octet-seen)
447
     (declare (boolean first-octet-seen))
448
     (macrolet ((read-next-byte ()
449
                  '(prog1
450
                       (or octet-getter
451
                           (cond (first-octet-seen
452
                                  (return-from char-decoder
453
                                    (recover-from-encoding-error format
454
                                                                 "End of data while in UTF-32 sequence.")))
455
                                 (t (return-from char-decoder nil))))
456
                     (setq first-octet-seen t))))
457
       (loop for count of-type fixnum from 24 downto 0 by 8
458
             for octet of-type octet = (read-next-byte)
459
             sum (ash* octet count)))))
460
 
461
 (defmethod octets-to-char-code ((format flexi-cr-mixin) reader)
462
   (declare #.*fixnum-optimize-settings*)
463
   (declare (ignore reader))
464
   (let ((char-code (call-next-method)))
465
     (case char-code
466
       (#.+cr+ #.(char-code #\Newline))
467
       (otherwise char-code))))
468
 
469
 (defmethod octets-to-char-code ((format flexi-crlf-mixin) reader)
470
   (declare #.*fixnum-optimize-settings*)
471
   (declare (function *current-unreader*))
472
   (declare (ignore reader))
473
   (let ((char-code (call-next-method)))
474
     (case char-code
475
       (#.+cr+
476
        (let ((next-char-code (call-next-method)))
477
          (case next-char-code
478
            (#.+lf+ #.(char-code #\Newline))
479
            ;; we saw a CR but no LF afterwards, but then the data
480
            ;; ended, so we just return #\Return
481
            ((nil) +cr+)
482
            ;; if the character we peeked at wasn't a
483
            ;; linefeed character we unread its constituents
484
            (otherwise (funcall *current-unreader* (code-char next-char-code))
485
                       char-code))))
486
       (otherwise char-code))))
487