Coverage report: /home/ellis/.stash/quicklisp/dists/ultralisp/software/edicl-flexi-streams-20240429143708/decode.lisp
Kind | Covered | All | % |
expression | 6 | 295 | 2.0 |
branch | 1 | 36 | 2.8 |
Key
Not instrumented
Conditionalized out
Executed
Not executed
Both branches taken
One branch taken
Neither branch taken
1
;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: FLEXI-STREAMS; Base: 10 -*-
2
;;; $Header: /usr/local/cvsrep/flexi-streams/decode.lisp,v 1.35 2008/08/26 10:59:22 edi Exp $
4
;;; Copyright (c) 2005-2008, Dr. Edmund Weitz. All rights reserved.
6
;;; Redistribution and use in source and binary forms, with or without
7
;;; modification, are permitted provided that the following conditions
10
;;; * Redistributions of source code must retain the above copyright
11
;;; notice, this list of conditions and the following disclaimer.
13
;;; * Redistributions in binary form must reproduce the above
14
;;; copyright notice, this list of conditions and the following
15
;;; disclaimer in the documentation and/or other materials
16
;;; provided with the distribution.
18
;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
19
;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20
;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21
;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
22
;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23
;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
24
;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25
;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26
;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27
;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28
;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
(in-package :flexi-streams)
32
(defun recover-from-encoding-error (external-format format-control &rest format-args)
33
"Helper function used by OCTETS-TO-CHAR-CODE below to deal with
34
encoding errors. Checks if *SUBSTITUTION-CHAR* is not NIL and returns
35
its character code in this case. Otherwise signals an
36
EXTERNAL-FORMAT-ENCODING-ERROR as determined by the arguments to this
37
function and provides a corresponding USE-VALUE restart."
38
(when *substitution-char*
39
(return-from recover-from-encoding-error (char-code *substitution-char*)))
41
(apply #'signal-encoding-error external-format format-control format-args)
43
:report "Specify a character to be used instead."
44
:interactive (lambda ()
46
(format *query-io* "Type a character: ")
47
(let ((line (read-line *query-io*)))
48
(when (= 1 (length line))
49
(return (list (char line 0)))))))
52
(defgeneric octets-to-char-code (format reader)
53
(declare #.*standard-optimize-settings*)
54
(:documentation "Converts a sequence of octets to a character code
55
\(which is returned, or NIL in case of EOF) using the external format
56
FORMAT. The sequence is obtained by calling the function \(which must
57
be a functional object) READER with no arguments which should return
58
one octet per call. In the case of EOF, READER should return NIL.
60
The special variable *CURRENT-UNREADER* must be bound correctly
61
whenever this function is called."))
63
(defgeneric octets-to-string* (format sequence start end)
64
(declare #.*standard-optimize-settings*)
65
(:documentation "A generic function which dispatches on the external
66
format and does the real work for OCTETS-TO-STRING."))
68
(defmethod octets-to-string* :around (format (list list) start end)
69
(declare #.*standard-optimize-settings*)
70
(octets-to-string* format (coerce list 'vector) start end))
72
(defmacro define-sequence-readers ((format-class) &body body)
73
"Non-hygienic utility macro which defines methods for READ-SEQUENCE*
74
and OCTETS-TO-STRING* for the class FORMAT-CLASS. BODY is described
75
in the docstring of DEFINE-CHAR-ENCODERS but can additionally contain
76
a form \(UNGET <form>) which has to be replaced by the correct code to
77
`unread' the octets for the character designated by <form>."
78
(let* ((body `((block char-decoder
80
(declare #.*fixnum-optimize-settings*)
83
(defmethod read-sequence* ((format ,format-class) flexi-input-stream sequence start end)
84
(with-accessors ((position flexi-stream-position)
85
(bound flexi-stream-bound)
86
(octet-stack flexi-stream-octet-stack)
87
(last-octet flexi-stream-last-octet)
88
(last-char-code flexi-stream-last-char-code)
89
(stream flexi-stream-stream))
96
;; whether we will later be able to rewind the stream if
97
;; needed (to get rid of unused octets in the buffer)
98
(can-rewind-p (maybe-rewind stream 0))
99
(factor (encoding-factor format))
100
(integer-factor (floor factor))
101
;; it's an interesting question whether it makes sense
102
;; performance-wise to make RESERVE significantly bigger
103
;; (and thus put potentially a lot more octets into
104
;; OCTET-STACK), especially for UTF-8
105
(reserve (cond ((or (not (floatp factor))
106
(not can-rewind-p)) 0)
107
(t (ceiling (* (- factor integer-factor) (- end start)))))))
108
(declare (fixnum buffer-pos buffer-end index integer-factor reserve)
109
(boolean can-rewind-p))
110
(flet ((compute-fill-amount ()
111
"Computes the amount of octets we can savely read into
112
the buffer without violating the stream's bound \(if there is one) and
113
without potentially reading much more than we need \(unless we can
115
(let ((minimum (min (the fixnum (+ (the fixnum (* integer-factor
116
(the fixnum (- end index))))
119
(cond (bound (min minimum (- bound position)))
122
"Tries to fill the buffer from BUFFER-POS to END and
123
returns NIL if the buffer doesn't contain any new data."
125
(return-from fill-buffer nil))
126
;; put data from octet stack into buffer if there is any
128
(when (>= buffer-pos end)
130
(let ((next-octet (pop octet-stack)))
132
(setf (aref (the (array octet *) buffer) buffer-pos) (the octet next-octet))
135
(setq buffer-end (read-sequence buffer stream
138
;; we reached EOF, so we remember this
139
(when (< buffer-end end)
141
;; BUFFER-POS is only greater than zero if the buffer
142
;; already contains unread data from the octet stack
143
;; (see below), so we test for ZEROP here and do /not/
144
;; compare with BUFFER-POS
145
(unless (zerop buffer-end)
146
(incf position buffer-end))))
147
(let ((minimum (compute-fill-amount)))
148
(declare (fixnum minimum))
149
(setq buffer (make-octet-buffer minimum))
150
;; fill buffer for the first time or return immediately if
152
(unless (fill-buffer minimum)
153
(return-from read-sequence* start)))
155
(macrolet ((iterate (set-place)
156
"A very unhygienic macro to implement the
157
actual iteration through the sequence including housekeeping for the
158
flexi stream. SET-PLACE is the place \(using the index INDEX) used to
159
access the sequence."
161
"This is the function used to
162
abort the LOOP iteration below."
163
(when (> index start)
165
last-char-code ,(sublis '((index . (1- index))) set-place)))
166
(return-from read-sequence* index)))
169
;; check if there are octets in the
170
;; buffer we didn't use - see
171
;; COMPUTE-FILL-AMOUNT above
172
(let ((rest (- buffer-end buffer-pos)))
174
(or (and can-rewind-p
175
(maybe-rewind stream rest))
177
(when (>= buffer-pos buffer-end)
180
(push (aref (the (array octet *) buffer) buffer-end)
183
(let ((next-char-code
184
(progn (symbol-macrolet
186
;; this is the code to retrieve the next octet (or
187
;; NIL) and to fill the buffer if needed
189
(when (>= buffer-pos buffer-end)
191
(unless (fill-buffer (compute-fill-amount))
192
(return-from next-octet)))
194
(aref (the (array octet *) buffer) buffer-pos)
195
(incf buffer-pos)))))
196
(macrolet ((unget (form)
197
`(unread-char% ,form flexi-input-stream)))
199
(unless next-char-code
201
(setf ,set-place (code-char next-char-code))
204
(string (iterate (char sequence index)))
205
(array (iterate (aref sequence index)))
206
(list (iterate (nth index sequence)))))))))
207
(defmethod octets-to-string* ((format ,format-class) sequence start end)
208
(declare #.*standard-optimize-settings*)
209
(declare (fixnum start end))
211
(string-length (compute-number-of-chars format sequence start end))
212
(string (make-array string-length :element-type 'char*)))
213
(declare (fixnum i string-length))
214
(loop for j of-type fixnum from 0 below string-length
215
do (setf (schar string j)
216
(code-char (macrolet ((unget (form)
217
`(decf i (character-length format ,form))))
218
(symbol-macrolet ((octet-getter (and (< i end)
220
(the octet (aref sequence i))
223
finally (return string)))))))
225
(defmacro define-char-decoders ((lf-format-class cr-format-class crlf-format-class) &body body)
226
"Non-hygienic utility macro which defines several decoding-related
227
methods for the classes LF-FORMAT-CLASS, CR-FORMAT-CLASS, and
228
CRLF-FORMAT-CLASS where it is assumed that CR-FORMAT-CLASS is the same
229
encoding as LF-FORMAT-CLASS but with CR instead of LF line endings and
230
similar for CRLF-FORMAT-CLASS, i.e. LF-FORMAT-CLASS is the base class.
231
BODY is a code template for the code to read octets and return one
232
character code. BODY must contain a symbol OCTET-GETTER representing
233
the form which is used to obtain the next octet."
234
(let* ((body (with-unique-names (char-code)
235
`((let ((,char-code (progn ,@body)))
236
(when (and ,char-code
237
(or (<= #xd8 (logand* #x00ff (ash* ,char-code -8)) #xdf)
238
(> ,char-code #x10ffff)))
239
(recover-from-encoding-error format "Illegal code point ~A \(#x~:*~X)." ,char-code))
242
(defmethod octets-to-char-code ((format ,lf-format-class) reader)
243
(declare #.*fixnum-optimize-settings*)
244
(declare (function reader))
245
(symbol-macrolet ((octet-getter (funcall reader)))
246
,@(sublis '((char-decoder . octets-to-char-code))
248
(define-sequence-readers (,lf-format-class) ,@body)
249
(define-sequence-readers (,cr-format-class)
250
,(with-unique-names (char-code)
251
`(let ((,char-code (progn ,@body)))
253
(#.+cr+ #.(char-code #\Newline))
254
(otherwise ,char-code)))))
255
(define-sequence-readers (,crlf-format-class)
256
,(with-unique-names (char-code next-char-code get-char-code)
257
`(flet ((,get-char-code () ,@body))
258
(let ((,char-code (,get-char-code)))
261
(let ((,next-char-code (,get-char-code)))
262
(case ,next-char-code
263
(#.+lf+ #.(char-code #\Newline))
264
;; we saw a CR but no LF afterwards, but then the data
265
;; ended, so we just return #\Return
267
;; if the character we peeked at wasn't a
268
;; linefeed character we unread its constituents
269
(otherwise (unget (code-char ,next-char-code))
271
(otherwise ,char-code)))))))))
273
(define-char-decoders (flexi-latin-1-format flexi-cr-latin-1-format flexi-crlf-latin-1-format)
276
(define-char-decoders (flexi-ascii-format flexi-cr-ascii-format flexi-crlf-ascii-format)
277
(when-let (octet octet-getter)
278
(if (> (the octet octet) 127)
279
(recover-from-encoding-error format
280
"No character which corresponds to octet #x~X." octet)
283
(define-char-decoders (flexi-8-bit-format flexi-cr-8-bit-format flexi-crlf-8-bit-format)
284
(with-accessors ((decoding-table external-format-decoding-table))
286
(when-let (octet octet-getter)
287
(let ((char-code (aref (the (simple-array char-code-integer *) decoding-table)
289
(if (or (null char-code)
290
(= (the char-code-integer char-code) 65533))
291
(recover-from-encoding-error format
292
"No character which corresponds to octet #x~X." octet)
295
(define-char-decoders (flexi-utf-8-format flexi-cr-utf-8-format flexi-crlf-utf-8-format)
296
(let (first-octet-seen)
297
(declare (boolean first-octet-seen))
298
(macrolet ((read-next-byte ()
301
(cond (first-octet-seen
302
(return-from char-decoder
303
(recover-from-encoding-error format
304
"End of data while in UTF-8 sequence.")))
305
(t (return-from char-decoder nil))))
306
(setq first-octet-seen t))))
307
(flet ((recover-from-overlong-sequence (value)
309
(recover-from-encoding-error format "`Overlong' UTF-8 sequence for code point #x~X."
311
(accept-overlong-sequence ()
312
:report "Accept the code point and continue."
314
(let ((octet (read-next-byte)))
315
(declare (type octet octet))
316
(block utf-8-sequence
317
(multiple-value-bind (start count)
318
(cond ((not (logbitp 7 octet))
319
;; avoid the overlong checks below
320
(return-from utf-8-sequence octet))
321
((= #b11000000 (logand* octet #b11100000))
322
(values (logand* octet #b00011111) 1))
323
((= #b11100000 (logand* octet #b11110000))
324
(values (logand* octet #b00001111) 2))
325
((= #b11110000 (logand* octet #b11111000))
326
(values (logand* octet #b00000111) 3))
327
(t (return-from char-decoder
328
(recover-from-encoding-error format
329
"Unexpected value #x~X at start of UTF-8 sequence."
331
(declare (fixnum count))
332
(loop for result of-type code-point
333
= start then (+ (ash* result 6)
334
(logand* octet #b111111))
336
for octet of-type octet = (read-next-byte)
337
unless (= #b10000000 (logand* octet #b11000000))
338
do (return-from char-decoder
339
(recover-from-encoding-error format
340
"Unexpected value #x~X in UTF-8 sequence." octet))
341
finally (return (cond ((< result (ecase count
345
(recover-from-overlong-sequence result))
348
(define-char-decoders (flexi-utf-16-le-format flexi-cr-utf-16-le-format flexi-crlf-utf-16-le-format)
349
(let (first-octet-seen)
350
(declare (boolean first-octet-seen))
351
(macrolet ((read-next-byte ()
354
(cond (first-octet-seen
355
(return-from char-decoder
356
(recover-from-encoding-error format
357
"End of data while in UTF-16 sequence.")))
358
(t (return-from char-decoder nil))))
359
(setq first-octet-seen t))))
360
(flet ((read-next-word ()
361
(+ (the octet (read-next-byte))
362
(ash* (the octet (read-next-byte)) 8))))
363
(declare (inline read-next-word))
364
(let ((word (read-next-word)))
365
(declare (type (unsigned-byte 16) word))
366
(cond ((<= #xd800 word #xdfff)
367
(let ((next-word (read-next-word)))
368
(declare (type (unsigned-byte 16) next-word))
369
(unless (<= #xdc00 next-word #xdfff)
370
(return-from char-decoder
371
(recover-from-encoding-error format
372
"Unexpected UTF-16 word #x~X following #x~X."
374
(+ (ash* (logand* #b1111111111 word) 10)
375
(logand* #b1111111111 next-word)
379
(define-char-decoders (flexi-utf-16-be-format flexi-cr-utf-16-be-format flexi-crlf-utf-16-be-format)
380
(let (first-octet-seen)
381
(declare (boolean first-octet-seen))
382
(macrolet ((read-next-byte ()
385
(cond (first-octet-seen
386
(return-from char-decoder
387
(recover-from-encoding-error format
388
"End of data while in UTF-16 sequence.")))
389
(t (return-from char-decoder nil))))
390
(setq first-octet-seen t))))
391
(flet ((read-next-word ()
392
(+ (ash* (the octet (read-next-byte)) 8)
393
(the octet (read-next-byte)))))
394
(declare (inline read-next-word))
395
(let ((word (read-next-word)))
396
(declare (type (unsigned-byte 16) word))
397
(cond ((<= #xd800 word #xdfff)
398
(let ((next-word (read-next-word)))
399
(declare (type (unsigned-byte 16) next-word))
400
(unless (<= #xdc00 next-word #xdfff)
401
(return-from char-decoder
402
(recover-from-encoding-error format
403
"Unexpected UTF-16 word #x~X following #x~X."
405
(+ (ash* (logand* #b1111111111 word) 10)
406
(logand* #b1111111111 next-word)
409
(define-char-decoders (flexi-gbk-format flexi-cr-gbk-format flexi-crlf-gbk-format)
410
(when-let (octet octet-getter)
411
(cond ((<= (the octet octet) #x7f) octet)
412
((= (the octet octet) #x80) #x20ac)
413
((= (the octet octet) #xff) #xf8f5)
414
(t (let ((next-byte octet-getter))
416
(recover-from-encoding-error format
417
"End of data while in GBK sequence.")
418
(let ((word (+ (ash* (the octet octet) 8)
419
(the octet next-byte))))
420
(declare (type (unsigned-byte 16) word))
421
(let ((octet (or (get-multibyte-mapper *gbk-to-ucs-special-table* word)
422
(get-multibyte-mapper *gbk-to-ucs-table* word))))
425
(recover-from-encoding-error format
426
"No character which corresponds to octet #x~X."
429
(define-char-decoders (flexi-utf-32-le-format flexi-cr-utf-32-le-format flexi-crlf-utf-32-le-format)
430
(let (first-octet-seen)
431
(declare (boolean first-octet-seen))
432
(macrolet ((read-next-byte ()
435
(cond (first-octet-seen
436
(return-from char-decoder
437
(recover-from-encoding-error format
438
"End of data while in UTF-32 sequence.")))
439
(t (return-from char-decoder nil))))
440
(setq first-octet-seen t))))
441
(loop for count of-type fixnum from 0 to 24 by 8
442
for octet of-type octet = (read-next-byte)
443
sum (ash* octet count)))))
445
(define-char-decoders (flexi-utf-32-be-format flexi-cr-utf-32-be-format flexi-crlf-utf-32-be-format)
446
(let (first-octet-seen)
447
(declare (boolean first-octet-seen))
448
(macrolet ((read-next-byte ()
451
(cond (first-octet-seen
452
(return-from char-decoder
453
(recover-from-encoding-error format
454
"End of data while in UTF-32 sequence.")))
455
(t (return-from char-decoder nil))))
456
(setq first-octet-seen t))))
457
(loop for count of-type fixnum from 24 downto 0 by 8
458
for octet of-type octet = (read-next-byte)
459
sum (ash* octet count)))))
461
(defmethod octets-to-char-code ((format flexi-cr-mixin) reader)
462
(declare #.*fixnum-optimize-settings*)
463
(declare (ignore reader))
464
(let ((char-code (call-next-method)))
466
(#.+cr+ #.(char-code #\Newline))
467
(otherwise char-code))))
469
(defmethod octets-to-char-code ((format flexi-crlf-mixin) reader)
470
(declare #.*fixnum-optimize-settings*)
471
(declare (function *current-unreader*))
472
(declare (ignore reader))
473
(let ((char-code (call-next-method)))
476
(let ((next-char-code (call-next-method)))
478
(#.+lf+ #.(char-code #\Newline))
479
;; we saw a CR but no LF afterwards, but then the data
480
;; ended, so we just return #\Return
482
;; if the character we peeked at wasn't a
483
;; linefeed character we unread its constituents
484
(otherwise (funcall *current-unreader* (code-char next-char-code))
486
(otherwise char-code))))