Coverage report: /home/ellis/.stash/quicklisp/dists/ultralisp/software/cl-babel-babel-20240610131823/src/enc-unicode.lisp
Kind | Covered | All | % |
expression | 0 | 130 | 0.0 |
branch | 0 | 6 | 0.0 |
Key
Not instrumented
Conditionalized out
Executed
Not executed
Both branches taken
One branch taken
Neither branch taken
1
;;;; -*- Mode: lisp; indent-tabs-mode: nil -*-
3
;;; enc-unicode.lisp --- Unicode encodings.
5
;;; Copyright (C) 2007, Luis Oliveira <loliveira@common-lisp.net>
7
;;; Permission is hereby granted, free of charge, to any person
8
;;; obtaining a copy of this software and associated documentation
9
;;; files (the "Software"), to deal in the Software without
10
;;; restriction, including without limitation the rights to use, copy,
11
;;; modify, merge, publish, distribute, sublicense, and/or sell copies
12
;;; of the Software, and to permit persons to whom the Software is
13
;;; furnished to do so, subject to the following conditions:
15
;;; The above copyright notice and this permission notice shall be
16
;;; included in all copies or substantial portions of the Software.
18
;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19
;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20
;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21
;;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22
;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23
;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24
;;; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25
;;; DEALINGS IN THE SOFTWARE.
27
;;; This implementation is largely based on OpenMCL's l1-unicode.lisp
28
;;; Copyright (C) 2006 Clozure Associates and contributors.
30
(in-package #:babel-encodings)
32
(eval-when (:compile-toplevel :load-toplevel :execute)
33
(defconstant +repl+ #xfffd "Unicode replacement character code point.")
34
(defconstant +byte-order-mark-code+ #xfeff)
35
(defconstant +swapped-byte-order-mark-code+ #xfffe)
36
(defconstant +swapped-byte-order-mark-code-32+ #xfffe0000))
38
;;; Some convenience macros adding FIXNUM declarations.
39
(defmacro f-ash (integer count) `(the fixnum (ash ,integer ,count)))
40
(defmacro f-logior (&rest integers) `(the fixnum (logior ,@integers)))
41
(defmacro f-logand (&rest integers) `(the fixnum (logand ,@integers)))
42
(defmacro f-logxor (&rest integers) `(the fixnum (logxor ,@integers)))
46
(define-character-encoding :utf-8
47
"An 8-bit, variable-length character encoding in which
48
character code points in the range #x00-#x7f can be encoded in a
49
single octet; characters with larger code values can be encoded
52
:literal-char-code-limit #x80
53
:bom-encoding #(#xef #xbb #xbf)
54
:default-replacement #xfffd)
56
(define-condition invalid-utf8-starter-byte (character-decoding-error)
58
(:documentation "Signalled when an invalid UTF-8 starter byte is found."))
60
(define-condition invalid-utf8-continuation-byte (character-decoding-error)
63
"Signalled when an invalid UTF-8 continuation byte is found."))
65
(define-condition overlong-utf8-sequence (character-decoding-error)
67
(:documentation "Signalled upon overlong UTF-8 sequences."))
69
(define-octet-counter :utf-8 (getter type)
70
`(named-lambda utf-8-octet-counter (seq start end max)
71
(declare (type ,type seq) (fixnum start end max))
72
(loop with noctets fixnum = 0
73
for i fixnum from start below end
74
for code of-type code-point = (,getter seq i) do
75
(let ((new (+ (cond ((< code #x80) 1)
80
(if (and (plusp max) (> new max))
83
finally (return (values noctets i)))))
85
(define-code-point-counter :utf-8 (getter type)
86
`(named-lambda utf-8-code-point-counter (seq start end max)
87
(declare (type ,type seq) (fixnum start end max))
88
(loop with nchars fixnum = 0
91
;; check for invalid continuation bytes
92
(macrolet ((invalid-cb-p (n)
93
`(and (< (+ i ,n) end)
94
(not (< #x7f (,',getter seq (+ i ,n)) #xc0)))))
95
;; wrote this code with LET instead of FOR because CLISP's
96
;; LOOP doesn't like WHILE clauses before FOR clauses.
97
(let* ((octet (,getter seq i))
98
(next-i (+ i (cond ((or (< octet #xc0) (invalid-cb-p 1)) 1)
99
((or (< octet #xe0) (invalid-cb-p 2)) 2)
100
((or (< octet #xf0) (invalid-cb-p 3)) 3)
101
((or (< octet #xf8) (invalid-cb-p 4)) 4)
102
((or (< octet #xfc) (invalid-cb-p 5)) 5)
104
(declare (type ub8 octet) (fixnum next-i))
107
;; Should we add restarts to this error, we'll have
108
;; to figure out a way to communicate with the
109
;; decoder since we probably want to do something
110
;; about it right here when we have a chance to
111
;; change the count or something. (Like an
112
;; alternative replacement character or perhaps the
113
;; existence of this error so that the decoder
114
;; doesn't have to check for it on every iteration
117
;; FIXME: The data for this error is not right.
118
(decoding-error (vector octet) :utf-8 seq i
119
nil 'end-of-input-in-character)
120
(return (values (1+ nchars) end)))
122
(setq nchars (1+ nchars)
124
(when (and (plusp max) (= nchars max))
125
(return (values nchars i)))))))
128
(return (values nchars i))))))
130
(define-encoder :utf-8 (getter src-type setter dest-type)
131
`(named-lambda utf-8-encoder (src start end dest d-start)
132
(declare (type ,src-type src)
133
(type ,dest-type dest)
134
(fixnum start end d-start))
135
(loop with di fixnum = d-start
136
for i fixnum from start below end
137
for code of-type code-point = (,getter src i) do
138
(macrolet ((set-octet (offset value)
139
`(,',setter ,value dest (the fixnum (+ di ,offset)))))
147
(set-octet 0 (logior #xc0 (f-ash code -6)))
148
(set-octet 1 (logior #x80 (f-logand code #x3f)))
152
(set-octet 0 (logior #xe0 (f-ash code -12)))
153
(set-octet 1 (logior #x80 (f-logand #x3f (f-ash code -6))))
154
(set-octet 2 (logior #x80 (f-logand code #x3f)))
158
(set-octet 0 (logior #xf0 (f-logand #x07 (f-ash code -18))))
159
(set-octet 1 (logior #x80 (f-logand #x3f (f-ash code -12))))
160
(set-octet 2 (logior #x80 (f-logand #x3f (f-ash code -6))))
161
(set-octet 3 (logior #x80 (logand code #x3f)))
163
finally (return (the fixnum (- di d-start))))))
165
(define-decoder :utf-8 (getter src-type setter dest-type)
166
`(named-lambda utf-8-decoder (src start end dest d-start)
167
(declare (type ,src-type src)
168
(type ,dest-type dest)
169
(fixnum start end d-start))
170
(let ((u2 0) (u3 0) (u4 0) (u5 0) (u6 0))
171
(declare (type ub8 u2 u3 u4 u5 u6))
172
(loop for di fixnum from d-start
173
for i fixnum from start below end
174
for u1 of-type ub8 = (,getter src i) do
175
;; Note: CONSUME-OCTET doesn't check if I is being
176
;; incremented past END. We're assuming that END has
177
;; been calculated with the CODE-POINT-POINTER above that
181
`(let ((next-i (incf i)))
183
;; FIXME: data for this error is incomplete.
184
;; and signalling this error twice
185
(return-from setter-block
186
(decoding-error nil :utf-8 src i +repl+
187
'end-of-input-in-character))
188
(,',getter src next-i))))
189
(handle-error (n &optional (c 'character-decoding-error))
191
(vector ,@(subseq '(u1 u2 u3 u4 u5 u6) 0 n))
192
:utf-8 src (1+ (- i ,n)) +repl+ ',c))
193
(handle-error-if-icb (var n)
194
`(when (not (< #x7f ,var #xc0))
196
(return-from setter-block
197
(handle-error ,n invalid-utf8-continuation-byte)))))
201
((< u1 #x80) u1) ; 1 octet
203
(handle-error 1 invalid-utf8-starter-byte))
205
(setq u2 (consume-octet))
206
(handle-error-if-icb u2 1)
209
(handle-error 2 overlong-utf8-sequence))
210
((< u1 #xe0) ; 2 octets
211
(logior (f-ash (f-logand #x1f u1) 6)
214
(setq u3 (consume-octet))
215
(handle-error-if-icb u3 2)
217
((and (= u1 #xe0) (< u2 #xa0))
218
(handle-error 3 overlong-utf8-sequence))
219
((< u1 #xf0) ; 3 octets
220
(let ((start (f-logior (f-ash (f-logand u1 #x0f) 12)
221
(f-ash (f-logand u2 #x3f) 6))))
222
(if (<= #xd800 start #xdfc0)
223
(handle-error 3 character-out-of-range)
224
(logior start (f-logand u3 #x3f)))))
226
(setq u4 (consume-octet))
227
(handle-error-if-icb u4 3)
229
((and (= u1 #xf0) (< u2 #x90))
230
(handle-error 4 overlong-utf8-sequence))
232
(if (or (> u1 #xf4) (and (= u1 #xf4) (> u2 #x8f)))
233
(handle-error 4 character-out-of-range)
234
(f-logior (f-ash (f-logand u1 7) 18)
235
(f-ash (f-logxor u2 #x80) 12)
236
(f-ash (f-logxor u3 #x80) 6)
237
(f-logxor u4 #x80))))
238
;; from here on we'll be getting either
239
;; invalid continuation bytes or overlong
240
;; 5-byte or 6-byte sequences.
242
(setq u5 (consume-octet))
243
(handle-error-if-icb u5 4)
245
((and (= u1 #xf8) (< u2 #x88))
246
(handle-error 5 overlong-utf8-sequence))
248
(handle-error 5 character-out-of-range))
250
(setq u6 (consume-octet))
251
(handle-error-if-icb u6 5)
253
((and (= u1 #xfc) (< u2 #x84))
254
(handle-error 6 overlong-utf8-sequence))
256
(handle-error 6 character-out-of-range)
259
finally (return (the fixnum (- di d-start)))))))
263
;;; The following excerpt from a linux-utf8 message by Markus Kuhn is
264
;;; the closest thing to a UTF-8B specification:
266
;;; <http://mail.nl.linux.org/linux-utf8/2000-07/msg00040.html>
268
;;; "D) Emit a malformed UTF-16 sequence for every byte in a malformed
271
;;; All the previous options for converting malformed UTF-8 sequences
272
;;; to UTF-16 destroy information. This can be highly undesirable in
273
;;; applications such as text file editors, where guaranteed binary
274
;;; transparency is a desireable feature. (E.g., I frequently edit
275
;;; executable code or graphic files with the Emacs text editor and I
276
;;; hate the idea that my editor might automatically make U+FFFD
277
;;; substitutions at locations that I haven't even edited when I save
280
;;; I therefore suggested 1999-11-02 on the unicode@xxxxxxxxxxx
281
;;; mailing list the following approach. Instead of using U+FFFD,
282
;;; simply encode malformed UTF-8 sequences as malformed UTF-16
283
;;; sequences. Malformed UTF-8 sequences consist excludively of the
284
;;; bytes 0x80 - 0xff, and each of these bytes can be represented
285
;;; using a 16-bit value from the UTF-16 low-half surrogate zone
286
;;; U+DC80 to U+DCFF. Thus, the overlong "K" (U+004B) 0xc1 0x8b from
287
;;; the above example would be represented in UTF-16 as U+DCC1
288
;;; U+DC8B. If we simply make sure that every UTF-8 encoded surrogate
289
;;; character is also treated like a malformed sequence, then there
290
;;; is no way that a single high-half surrogate could precede the
291
;;; encoded malformed sequence and cause a valid UTF-16 sequence to
294
;;; This way 100% binary transparent UTF-8 -> UTF-16 -> UTF-8
295
;;; round-trip compatibility can be achieved quite easily.
297
;;; On an output device, a lonely low-half surrogate character should
298
;;; be treated just like a character outside the adopted subset of
299
;;; representable characters, that is for the end user, the display
300
;;; would look exactly like with semantics B), i.e. one symbol per
301
;;; byte of a malformed sequence. However in contrast to semantics
302
;;; B), no information is thrown away, and a cut&paste in an editor
303
;;; or terminal emulator will be guaranteed to reconstruct the
304
;;; original byte sequence. This should greatly reduce the incidence
305
;;; of accidental corruption of binary data by UTF-8 -> UTF-16 ->
306
;;; UTF-8 conversion round trips."
308
(define-character-encoding :utf-8b
309
"An 8-bit, variable-length character encoding in which
310
character code points in the range #x00-#x7f can be encoded in a
311
single octet; characters with larger code values can be encoded
312
in 2 to 4 bytes. Invalid UTF-8 sequences are encoded with #xDCXX
313
code points for each invalid byte."
314
:max-units-per-char 4
315
:literal-char-code-limit #x80
316
:bom-encoding #(#xef #xbb #xbf)
317
:default-replacement nil)
319
;;; TODO: reuse the :UTF-8 octet counter through a simple macro.
320
(define-octet-counter :utf-8b (getter type)
321
`(named-lambda utf-8b-octet-counter (seq start end max)
322
(declare (type ,type seq) (fixnum start end max))
323
(loop with noctets fixnum = 0
324
for i fixnum from start below end
325
for code of-type code-point = (,getter seq i) do
326
(let ((new (+ (cond ((< code #x80) 1)
328
((<= #xdc80 code #xdcff) 1)
332
(if (and (plusp max) (> new max))
335
finally (return (values noctets i)))))
337
(define-code-point-counter :utf-8b (getter type)
338
`(named-lambda utf-8b-code-point-counter (seq start end max)
339
(declare (type ,type seq) (fixnum start end max))
340
(loop with nchars fixnum = 0
341
with i fixnum = start
343
;; wrote this code with LET instead of FOR because CLISP's
344
;; LOOP doesn't like WHILE clauses before FOR clauses.
345
(let* ((octet (,getter seq i))
346
(noctets (cond ((< octet #x80) 1)
350
(declare (type ub8 octet) (fixnum noctets))
352
((> (+ i noctets) end)
353
;; If this error is suppressed these last few bytes
354
;; will be encoded as raw bytes later.
355
(decoding-error (vector octet) :utf-8 seq i
356
nil 'end-of-input-in-character)
357
(return (values (+ nchars (- end i)) end)))
359
;; FIXME: clean this mess up.
361
(u2 (if (>= noctets 2) (,getter seq (1+ i)) 0))
362
(u3 (if (>= noctets 3) (,getter seq (+ i 2)) 0))
363
(u4 (if (= noctets 4) (,getter seq (+ i 3)) 0))
364
(inc (or (and (> noctets 1)
367
(not (logior u2 #x40)))
369
(not (and (< (f-logxor u2 #x80) #x40)
370
(< (f-logxor u3 #x80) #x40)
371
(or (>= u1 #xe1) (>= u2 #xa0))
372
(or (/= u1 #xed) (< u2 #xa0) (> u2 #xbf)))))
375
(and (< (f-logxor u2 #x80) #x40)
376
(< (f-logxor u3 #x80) #x40)
377
(< (f-logxor u4 #x80) #x40)
378
(or (>= u1 #xf1) (>= u2 #x90))))))))
379
(let ((new-nchars (if inc (+ nchars noctets) (1+ nchars))))
380
(when (and (plusp max) (> new-nchars max))
381
(return (values nchars i)))
383
(setq nchars new-nchars))))))
386
(return (values nchars i))))))
388
;;; TODO: reuse the :UTF-8 encoder with through a simple macro.
389
(define-encoder :utf-8b (getter src-type setter dest-type)
390
`(named-lambda utf-8b-encoder (src start end dest d-start)
391
(declare (type ,src-type src)
392
(type ,dest-type dest)
393
(fixnum start end d-start))
394
(loop with di fixnum = d-start
395
for i fixnum from start below end
396
for code of-type code-point = (,getter src i) do
397
(macrolet ((set-octet (offset value)
398
`(,',setter ,value dest (the fixnum (+ di ,offset)))))
406
(set-octet 0 (logior #xc0 (f-ash code -6)))
407
(set-octet 1 (logior #x80 (f-logand code #x3f)))
409
;; 1 octet (invalid octet)
410
((<= #xdc80 code #xdcff)
411
(set-octet 0 (f-logand code #xff))
415
(set-octet 0 (logior #xe0 (f-ash code -12)))
416
(set-octet 1 (logior #x80 (f-logand #x3f (f-ash code -6))))
417
(set-octet 2 (logior #x80 (f-logand code #x3f)))
421
(set-octet 0 (logior #xf0 (f-logand #x07 (f-ash code -18))))
422
(set-octet 1 (logior #x80 (f-logand #x3f (f-ash code -12))))
423
(set-octet 2 (logior #x80 (f-logand #x3f (f-ash code -6))))
424
(set-octet 3 (logand #x3f code))
426
finally (return (the fixnum (- di d-start))))))
428
(define-decoder :utf-8b (getter src-type setter dest-type)
429
`(named-lambda utf-8b-decoder (src start end dest d-start)
430
(declare (type ,src-type src)
431
(type ,dest-type dest)
432
(fixnum start end d-start))
433
(let ((u2 0) (u3 0) (u4 0))
434
(declare (type ub8 u2 u3 u4))
435
(loop for di fixnum from d-start
436
for i fixnum from start below end
437
for u1 of-type ub8 = (,getter src i) do
438
;; Unlike the UTF-8 version, this version of
439
;; CONSUME-OCTET needs to check if I is being incremented
440
;; past END because we might have trailing binary
445
(encode-raw-octets ,n)
446
(,',getter src (incf i))))
447
(encode-raw-octets (n)
449
,@(loop for i below n and var in '(u1 u2 u3 u4)
450
collect `(,',setter (logior #xdc00 ,var) dest di)
453
(return-from set-body))))
456
((< u1 #x80) ; 1 octet
459
(setq u2 (consume-octet 1))
461
((< u1 #xe0) ; 2 octets
462
(if (< (f-logxor u2 #x80) #x40)
463
(logior (f-ash (f-logand #x1f u1) 6)
465
(encode-raw-octets 2)))
467
(setq u3 (consume-octet 2))
469
((< u1 #xf0) ; 3 octets
470
(if (and (< (f-logxor u2 #x80) #x40)
471
(< (f-logxor u3 #x80) #x40)
472
(or (>= u1 #xe1) (>= u2 #xa0)))
473
(let ((start (f-logior (f-ash (f-logand u1 #x0f) 12)
474
(f-ash (f-logand u2 #x3f) 6))))
475
(if (<= #xd800 start #xdfc0)
476
(encode-raw-octets 3)
477
(logior start (f-logand u3 #x3f))))
478
(encode-raw-octets 3)))
480
(setq u4 (consume-octet 3))
481
(if (and (< (f-logxor u2 #x80) #x40)
482
(< (f-logxor u3 #x80) #x40)
483
(< (f-logxor u4 #x80) #x40)
484
(or (>= u1 #xf1) (>= u2 #x90)))
486
(f-logior (f-ash (f-logand u1 7) 18)
487
(f-ash (f-logxor u2 #x80) 12))
488
(f-logior (f-ash (f-logxor u3 #x80) 6)
490
(encode-raw-octets 4)))))))
491
(t (encode-raw-octets 1)))
493
finally (return (the fixnum (- di d-start)))))))
497
;;; TODO: add a way to pass some info at compile-time telling us that,
498
;;; for example, the maximum code-point will always be < #x10000 in
499
;;; which case we could simply return (* 2 (- end start)).
500
(defmacro utf16-octet-counter (getter type)
501
`(named-lambda utf-16-octet-counter (seq start end max)
502
(declare (type ,type seq) (fixnum start end max))
503
(loop with noctets fixnum = 0
504
for i fixnum from start below end
505
for code of-type code-point = (,getter seq i)
506
do (let ((new (the fixnum (+ (if (< code #x10000) 2 4) noctets))))
507
(if (and (plusp max) (> new max))
510
finally (return (values noctets i)))))
512
(defmacro utf-16-combine-surrogate-pairs (u1 u2)
513
`(the (unsigned-byte 21)
515
(the (unsigned-byte 20)
517
(the (unsigned-byte 20)
518
(ash (the (unsigned-byte 10) (- ,u1 #xd800)) 10))
519
(the (unsigned-byte 10)
522
(defmacro define-utf-16 (name &optional endianness)
523
(check-type endianness (or null (eql :be) (eql :le)))
524
(check-type name keyword)
525
(let ((swap-var (gensym "SWAP"))
526
(code-point-counter-name
527
(format-symbol t '#:~a-code-point-counter (string name)))
528
(encoder-name (format-symbol t '#:~a-encoder (string name)))
529
(decoder-name (format-symbol t '#:~a-decoder (string name))))
530
(labels ((make-bom-check-form (end start getter seq)
531
(if (null endianness)
533
(when (> ,,end ,,start)
534
(case (,,getter ,,seq ,,start 2 :ne)
535
(#.+byte-order-mark-code+ (incf ,,start 2) nil)
536
(#.+swapped-byte-order-mark-code+ (incf ,,start 2) t)
537
(t #+little-endian t)))))
539
(make-getter-form (getter src i)
541
(:le ``(,,getter ,,src ,,i 2 :le))
542
(:be ``(,,getter ,,src ,,i 2 :be))
544
(,,getter ,,src ,,i 2 :re)
545
(,,getter ,,src ,,i 2 :ne)))))
546
(make-setter-form (setter code dest di)
548
(:be ``(,,setter ,,code ,,dest ,,di 2 :be))
549
(:le ``(,,setter ,,code ,,dest ,,di 2 :le))
550
(t ``(,,setter ,,code ,,dest ,,di 2 :ne)))))
552
(define-octet-counter ,name (getter type)
553
`(utf16-octet-counter ,getter ,type))
554
(define-code-point-counter ,name (getter type)
555
`(named-lambda ,',code-point-counter-name (seq start end max)
556
(declare (type ,type seq) (fixnum start end max))
557
(let* ,,(make-bom-check-form ''end ''start 'getter ''seq)
558
(loop with count fixnum = 0
559
with i fixnum = start
560
while (<= i (- end 2)) do
561
(let* ((code ,,(make-getter-form 'getter ''seq ''i))
562
(next-i (+ i (if (or (< code #xd800) (>= code #xdc00))
565
(declare (type (unsigned-byte 16) code) (fixnum next-i))
569
(vector (,getter seq i) (,getter seq (1+ i)))
570
,',name seq i nil 'end-of-input-in-character)
571
(return (values count i)))
575
(when (and (plusp max) (= count max))
576
(return (values count i))))))
579
(return (values count i)))))))
580
(define-encoder ,name (getter src-type setter dest-type)
581
`(named-lambda ,',encoder-name (src start end dest d-start)
582
(declare (type ,src-type src)
583
(type ,dest-type dest)
584
(fixnum start end d-start))
585
(loop with di fixnum = d-start
586
for i fixnum from start below end
587
for code of-type code-point = (,getter src i)
588
for high-bits fixnum = (- code #x10000) do
589
(cond ((< high-bits 0)
590
,,(make-setter-form 'setter ''code ''dest ''di)
594
'setter ''(logior #xd800 (f-ash high-bits -10))
597
'setter ''(logior #xdc00 (f-logand high-bits #x3ff))
600
finally (return (the fixnum (- di d-start))))))
601
(define-decoder ,name (getter src-type setter dest-type)
602
`(named-lambda ,',decoder-name (src start end dest d-start)
603
(declare (type ,src-type src)
604
(type ,dest-type dest)
605
(fixnum start end d-start))
606
(let ,,(make-bom-check-form ''end ''start 'getter ''src)
607
(loop with i fixnum = start
608
for di fixnum from d-start
610
(let ((u1 ,,(make-getter-form 'getter ''src ''i)))
611
(declare (type (unsigned-byte 16) u1))
614
((or (< u1 #xd800) (>= u1 #xe000)) ; 2 octets
616
((< u1 #xdc00) ; 4 octets
617
(let ((u2 ,,(make-getter-form 'getter ''src ''i)))
618
(declare (type (unsigned-byte 16) u2))
620
(if (and (>= u2 #xdc00) (< u2 #xe000))
621
(utf-16-combine-surrogate-pairs u1 u2)
623
(vector (,getter src (- i 4))
624
(,getter src (- i 3))
625
(,getter src (- i 2))
626
(,getter src (- i 1)))
627
,',name src i +repl+))))
629
(decoding-error (vector (,getter src (- i 2))
630
(,getter src (- i 1)))
631
,',name src i +repl+)))
633
finally (return (the fixnum (- di d-start)))))))
636
(define-character-encoding :utf-16
637
"A 16-bit, variable-length encoding in which characters with
638
code points less than #x10000 can be encoded in a single 16-bit
639
word and characters with larger codes can be encoded in a pair of
640
16-bit words. The endianness of the encoded data is indicated by
641
the endianness of a byte-order-mark character (#\u+feff)
642
prepended to the data; in the absence of such a character on
643
input, the data is assumed to be in big-endian order. Output is
644
written in native byte-order with a leading byte-order mark."
645
:max-units-per-char 2
647
:native-endianness t ; not necessarily true when decoding
648
:decode-literal-code-unit-limit #xd800
649
:encode-literal-code-unit-limit #x10000
650
:use-bom #+big-endian :utf-16be #+little-endian :utf-16le
651
:bom-encoding #+big-endian #(#xfe #xff) #+little-endian #(#xff #xfe)
653
:default-replacement #xfffd
654
:ambiguous #+little-endian t #+big-endian nil)
656
(define-utf-16 :utf-16)
658
(define-character-encoding :utf-16le
659
"A 16-bit, variable-length encoding in which characters with
660
code points less than #x10000 can be encoded in a single 16-bit
661
word and characters with larger codes can be encoded in a pair of
662
16-bit words. The data is assumed to be in little-endian order. Output is
663
written in little-endian byte-order without a leading byte-order mark."
664
:aliases '(:utf-16/le)
665
:max-units-per-char 2
667
:native-endianness #+little-endian t #+big-endian nil
668
:decode-literal-code-unit-limit #xd800
669
:encode-literal-code-unit-limit #x10000
671
:default-replacement #xfffd)
673
(define-utf-16 :utf-16le :le)
675
(define-character-encoding :utf-16be
676
"A 16-bit, variable-length encoding in which characters with
677
code points less than #x10000 can be encoded in a single 16-bit
678
word and characters with larger codes can be encoded in a pair of
679
16-bit words. The data is assumed to be in big-endian order. Output is
680
written in big-endian byte-order without a leading byte-order mark."
681
:aliases '(:utf-16/be)
682
:max-units-per-char 2
684
:native-endianness #+little-endian nil #+big-endian t
685
:decode-literal-code-unit-limit #xd800
686
:encode-literal-code-unit-limit #x10000
688
:default-replacement #xfffd)
690
(define-utf-16 :utf-16be :be)
692
(defmacro define-ucs (name bytes &optional endianness (limit #x110000))
693
(check-type name keyword)
694
(check-type bytes (or (eql 2) (eql 4)))
695
(check-type endianness (or null (eql :le) (eql :be)))
696
(let ((swap-var (gensym "SWAP"))
697
(code-point-counter-name
698
(format-symbol t '#:~a-code-point-counter (string name)))
700
(format-symbol t '#:~a-encoder (string name)))
702
(format-symbol t '#:~a-decoder (string name))))
703
(labels ((make-bom-check-form (end start getter src)
704
(if (null endianness)
705
``(when (not (zerop (- ,,end ,,start)))
706
(case (,,getter ,,src 0 ,',bytes :ne)
707
(#.+byte-order-mark-code+
708
(incf ,,start ,',bytes) nil)
709
(#.+swapped-byte-order-mark-code-32+
710
(incf ,,start ,',bytes) t)
711
(t #+little-endian t)))
713
(make-setter-form (setter code dest di)
714
``(,,setter ,,code ,,dest ,,di ,',bytes
715
,',(or endianness :ne)))
716
(make-getter-form (getter src i)
717
(if (null endianness)
719
(,,getter ,,src ,,i ,',bytes :re)
720
(,,getter ,,src ,,i ,',bytes :ne))
721
``(,,getter ,,src ,,i ,',bytes ,',endianness))))
723
(define-code-point-counter ,name (getter type)
724
`(named-lambda ,',code-point-counter-name (seq start end max)
725
(declare (type ,type seq) (fixnum start end max))
727
,,(make-bom-check-form ''end ''start 'getter ''seq)
728
(multiple-value-bind (count rem)
729
(floor (- end start) ,',bytes)
731
((and (plusp max) (> count max))
732
(values max (the fixnum (+ start (* ,',bytes max)))))
734
;; check for incomplete last character
736
(let ((vector (make-array ,',bytes :fill-pointer 0)))
738
(vector-push (,getter seq (+ i (- end rem))) vector))
739
(decoding-error vector ,',name seq (the fixnum (- end rem)) nil
740
'end-of-input-in-character)
742
(values count end))))))
743
(define-encoder ,name (getter src-type setter dest-type)
744
`(named-lambda ,',encoder-name (src start end dest d-start)
745
(declare (type ,src-type src)
746
(type ,dest-type dest)
747
(fixnum start end d-start))
748
(loop for i fixnum from start below end
749
and di fixnum from d-start by ,',bytes
750
for code of-type code-point = (,getter src i)
751
do (if (>= code ,',limit)
752
(encoding-error code ,',name src i +repl+)
753
,,(make-setter-form 'setter ''code ''dest ''di))
754
finally (return (the fixnum (- di d-start))))))
755
(define-decoder ,name (getter src-type setter dest-type)
756
`(named-lambda ,',decoder-name (src start end dest d-start)
757
(declare (type ,src-type src)
758
(type ,dest-type dest)
759
(fixnum start end d-start))
760
(let ((,',swap-var ,,(make-bom-check-form ''end ''start 'getter ''src)))
761
(declare (ignorable ,',swap-var))
762
(loop for i fixnum from start below end by ,',bytes
764
do (,setter (let ((unit ,,(make-getter-form 'getter ''src ''i)))
765
(if (>= unit ,',limit)
767
(vector (,getter src i)
768
(,getter src (+ i 1))
770
``((,getter src (+ i 2))
771
(,getter src (+ i 3)))))
773
'character-out-of-range)
776
finally (return (the fixnum (- di d-start)))))))
781
(define-character-encoding :utf-32
782
"A 32-bit, fixed-length encoding in which all Unicode
783
characters can be encoded in a single 32-bit word. The
784
endianness of the encoded data is indicated by the endianness of
785
a byte-order-mark character (#\u+feff) prepended to the data; in
786
the absence of such a character on input, input data is assumed
787
to be in big-endian order. Output is written in native byte
788
order with a leading byte-order mark."
790
:max-units-per-char 1
792
:native-endianness t ; not necessarily true when decoding
793
:literal-char-code-limit #x110000
794
:use-bom #+little-endian :utf-32le #+big-endian :utf-32be
796
#+big-endian #(#x00 #x00 #xfe #xff)
797
#+little-endian #(#xff #xfe #x00 #x00)
798
:nul-encoding #(0 0 0 0)
799
:ambiguous #+little-endian t #+big-endian nil)
801
(define-ucs :utf-32 4)
803
(define-character-encoding :utf-32le
804
"A 32-bit, fixed-length encoding in which all Unicode
805
characters can be encoded in a single 32-bit word. Input data is assumed
806
to be in little-endian order. Output is also written in little-endian byte
807
order without a leading byte-order mark."
808
:max-units-per-char 1
810
:aliases '(:utf-32/le :ucs-4le :ucs-4/le)
811
:native-endianness #+little-endian t #+big-endian nil
812
:literal-char-code-limit #x110000
813
:nul-encoding #(0 0 0 0))
815
(define-ucs :utf-32le 4 :le)
817
(define-character-encoding :utf-32be
818
"A 32-bit, fixed-length encoding in which all Unicode
819
characters can be encoded in a single 32-bit word. Input data is assumed
820
to be in big-endian order. Output is also written in big-endian byte
821
order without a leading byte-order mark."
822
:max-units-per-char 1
824
:aliases '(:utf-32/be :ucs-4be :ucs-4/be)
825
:native-endianness #+little-endian nil #+big-endian t
826
:literal-char-code-limit #x110000
827
:nul-encoding #(0 0 0 0))
829
(define-ucs :utf-32be 4 :be)
833
(define-character-encoding :ucs-2
834
"A 16-bit, fixed-length encoding in which all Unicode
835
characters can be encoded in a single 16-bit word. The
836
endianness of the encoded data is indicated by the endianness of
837
a byte-order-mark character (#\u+feff) prepended to the data; in
838
the absence of such a character on input, input data is assumed
839
to be in big-endian order. Output is written in native byte
840
order with a leading byte-order mark."
842
:max-units-per-char 1
844
:native-endianness t ; not necessarily true when decoding
845
:literal-char-code-limit #x10000
846
:use-bom #+little-endian :ucs-2le #+big-endian :ucs-2be
848
#+big-endian #(#xfe #xff)
849
#+little-endian #(#xff #xfe)
851
:ambiguous #+little-endian t #+big-endian nil)
853
(define-ucs :ucs-2 2 nil #x10000)
855
(define-character-encoding :ucs-2le
856
"A 16-bit, fixed-length encoding in which all Unicode
857
characters can be encoded in a single 16-bit word. Input data is assumed
858
to be in little-endian order. Output is also written in little-endian byte
859
order without a leading byte-order mark."
860
:max-units-per-char 1
862
:aliases '(:ucs-2/le)
863
:native-endianness #+little-endian t #+big-endian nil
864
:literal-char-code-limit #x10000
865
:nul-encoding #(0 0))
867
(define-ucs :ucs-2le 2 :le #x10000)
869
(define-character-encoding :ucs-2be
870
"A 16-bit, fixed-length encoding in which all Unicode
871
characters can be encoded in a single 16-bit word. Input data is assumed
872
to be in big-endian order. Output is also written in big-endian byte
873
order without a leading byte-order mark."
874
:max-units-per-char 1
876
:aliases '(:ucs-2/be)
877
:native-endianness #+little-endian nil #+big-endian t
878
:literal-char-code-limit #x10000
879
:nul-encoding #(0 0))
881
(define-ucs :ucs-2be 2 :be #x10000)