Coverage report: /home/ellis/.stash/quicklisp/dists/ultralisp/software/cl-babel-babel-20240610131823/src/encodings.lisp
Kind | Covered | All | % |
expression | 43 | 216 | 19.9 |
branch | 2 | 6 | 33.3 |
Key
Not instrumented
Conditionalized out
Executed
Not executed
Both branches taken
One branch taken
Neither branch taken
1
;;;; -*- Mode: lisp; indent-tabs-mode: nil -*-
3
;;; encodings.lisp --- Character encodings and mappings.
5
;;; Copyright (C) 2007, Luis Oliveira <loliveira@common-lisp.net>
7
;;; Permission is hereby granted, free of charge, to any person
8
;;; obtaining a copy of this software and associated documentation
9
;;; files (the "Software"), to deal in the Software without
10
;;; restriction, including without limitation the rights to use, copy,
11
;;; modify, merge, publish, distribute, sublicense, and/or sell copies
12
;;; of the Software, and to permit persons to whom the Software is
13
;;; furnished to do so, subject to the following conditions:
15
;;; The above copyright notice and this permission notice shall be
16
;;; included in all copies or substantial portions of the Software.
18
;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19
;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20
;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21
;;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22
;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23
;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24
;;; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25
;;; DEALINGS IN THE SOFTWARE.
27
(in-package #:babel-encodings)
29
;;;; Character Encodings
31
(defclass character-encoding ()
32
((name :initarg :name :reader enc-name
33
:initform (error "Must specify a NAME for this character encoding."))
34
;; Most of these documentation strings are taken from OpenMCL.
36
:initarg :documentation :reader enc-documentation :initform nil)
37
;; A non-exhaustive list of aliases for the encoding.
38
(aliases :initarg :aliases :initform nil :reader enc-aliases)
39
;; Specified in bits. Usually 8, 16 or 32.
41
:initarg :code-unit-size :reader enc-code-unit-size :initform 8)
43
:initarg :max-units-per-char :reader enc-max-units-per-char :initform 1)
44
;; If NIL, it is necessary to swap 16- and 32-bit units.
46
:initarg :native-endianness :reader enc-native-endianness :initform t)
47
;; Code units less than this value map to themselves on input.
48
(decode-literal-code-unit-limit
49
:initarg :decode-literal-code-unit-limit :initform 0
50
:reader enc-decode-literal-code-unit-limit)
51
;; Code points less than this value map to themselves on output.
52
(encode-literal-code-unit-limit
53
:initarg :encode-literal-code-unit-limit :initform 0
54
:reader enc-encode-literal-code-unit-limit)
55
;; Defines whether it is necessary to prepend a byte-order-mark to
56
;; determine the endianness.
57
(use-bom :initarg :use-bom :initform nil :reader enc-use-bom)
58
;; How the byte-order-mark should be encoded, specified as a
59
;; sequence of octets. NIL if it cannot be encoded.
61
:initarg :bom-encoding :reader enc-bom-encoding :initform nil)
62
;; How should NUL be encoded, specified as sequence of octets.
64
:initarg :nul-encoding :reader enc-nul-encoding :initform #(0))
65
;; Preferred replacement character code point.
67
:initarg :default-replacement :reader enc-default-replacement
69
;; Does VALID-STRING => OCTETS => STRING2 guarantee a valid
70
;; STRING2? UTF-{16,32} on little-endian plaforms don't because
71
;; they assume different endianness on each direction.
73
:initarg :ambiguous :reader ambiguous-encoding-p :initform nil)))
75
;;; I'm too lazy to write all the identical limits twice.
76
(defmethod initialize-instance :after ((enc character-encoding)
77
&key literal-char-code-limit)
78
(when literal-char-code-limit
79
(setf (slot-value enc 'encode-literal-code-unit-limit)
80
literal-char-code-limit)
81
(setf (slot-value enc 'decode-literal-code-unit-limit)
82
literal-char-code-limit)))
85
(defmethod describe-object ((enc character-encoding) s)
86
"Prints out the name, aliases and documentation slots of a
87
character encoding object."
88
(with-slots (name aliases documentation) enc
89
(format s "~&~S" name)
91
(format s " [Aliases:~{ ~S~}]" aliases))
92
(format s "~&~A~%~%" documentation))
95
(defvar *supported-character-encodings* nil)
97
(defun list-character-encodings ()
98
"List of keyword symbols denoting supported character
99
encodings. This list does not include aliases."
100
*supported-character-encodings*)
102
(defvar *character-encodings* (make-hash-table :test 'eq))
104
(defvar *default-character-encoding* :utf-8
105
"Special variable used to determine the default character
108
(defun get-character-encoding (name)
109
"Lookups the character encoding denoted by the keyword symbol
110
NAME. Signals an error if one is not found. If NAME is already
111
a CHARACTER-ENCONDING object, it is returned unmodified."
112
(when (typep name 'character-encoding)
113
(return-from get-character-encoding name))
114
(when (eq name :default)
115
(setq name *default-character-encoding*))
116
(or (gethash name *character-encodings*)
117
(error "Unknown character encoding: ~S" name)))
119
(defmethod ambiguous-encoding-p ((encoding symbol))
120
(ambiguous-encoding-p (get-character-encoding encoding)))
122
(defun notice-character-encoding (enc)
123
(pushnew (enc-name enc) *supported-character-encodings*)
124
(dolist (kw (cons (enc-name enc) (enc-aliases enc)))
125
(setf (gethash kw *character-encodings*) enc))
128
(defmacro define-character-encoding (name docstring &body options)
129
`(notice-character-encoding
130
(make-instance 'character-encoding :name ,name ,@options
131
:documentation ,docstring)))
135
;;; TODO: describe what mappings are
137
(defun make-fixed-width-counter (getter type &optional (unit-size-in-bits 8))
138
(declare (ignore getter type))
139
(check-type unit-size-in-bits positive-fixnum)
140
(let ((unit-size-in-bytes (/ unit-size-in-bits 8)))
141
`(named-lambda fixed-width-counter (seq start end max)
142
(declare (ignore seq) (fixnum start end max))
143
;; XXX: the result can be bigger than a fixnum when (> unit-size
144
;; 1) and we don't want that to happen. Possible solution: signal
145
;; a warning (hmm, make that an actual error) and truncate.
147
(let ((count (the fixnum (min (floor max ,unit-size-in-bytes)
148
(the fixnum (- end start))))))
149
(values (the fixnum (* count ,unit-size-in-bytes))
150
(the fixnum (+ start count))))
151
(values (the fixnum (* (the fixnum (- end start))
152
,unit-size-in-bytes))
153
(the fixnum end))))))
155
;;; Useful to develop new encodings incrementally starting with octet
156
;;; and code-unit counters.
157
(defun make-dummy-coder (sg st ds dt)
158
(declare (ignore sg st ds dt))
159
`(named-lambda dummy-coder (src s e dest i)
160
(declare (ignore src s e dest i))
161
(error "this encoder/decoder hasn't been implemented yet")))
163
;;; TODO: document here
165
;;; ENCODER -- (lambda (src-getter src-type dest-setter dest-type) ...)
166
;;; DECODER -- (lambda (src-getter src-type dest-setter dest-type) ...)
168
;;; OCTET-COUNTER -- (lambda (getter type) ...)
169
;;; CODE-POINT-COUNTER -- (lambda (getter type) ...)
170
(defclass abstract-mapping ()
171
((encoder-factory :accessor encoder-factory :initform 'make-dummy-coder)
172
(decoder-factory :accessor decoder-factory :initform 'make-dummy-coder)
173
(octet-counter-factory :accessor octet-counter-factory
174
:initform 'make-fixed-width-counter)
175
(code-point-counter-factory :accessor code-point-counter-factory
176
:initform 'make-fixed-width-counter)))
178
;;; TODO: document these
180
;;; ENCODER -- (lambda (src start end dest d-start) ...)
181
;;; DECODER -- (lambda (src start end dest d-start) ...)
183
;;; OCTET-COUNTER -- (lambda (seq start end max-octets) ...)
184
;;; CODE-POINT-COUNTER -- (lambda (seq start end max-chars) ...)
185
;;; => N-CHARS NEW-END
186
;;; (important: describe NEW-END)
187
(defclass concrete-mapping ()
188
((encoder :accessor encoder)
189
(decoder :accessor decoder)
190
(octet-counter :accessor octet-counter)
191
(code-point-counter :accessor code-point-counter)))
193
(defparameter *abstract-mappings* (make-hash-table :test 'eq))
195
(defun get-abstract-mapping (encoding)
196
(gethash encoding *abstract-mappings*))
198
(defun (setf get-abstract-mapping) (value encoding)
199
(setf (gethash encoding *abstract-mappings*) value))
201
(defun %register-mapping-part (encoding slot-name fn)
202
(let ((mapping (get-abstract-mapping encoding)))
204
(setq mapping (make-instance 'abstract-mapping))
205
(setf (get-abstract-mapping encoding) mapping))
206
(setf (slot-value mapping slot-name) fn)))
208
;;; See enc-*.lisp for example usages of these 4 macros.
210
(defmacro define-encoder (encoding (sa st da dt) &body body)
211
`(%register-mapping-part ,encoding 'encoder-factory
212
(named-lambda encoder (,sa ,st ,da ,dt)
215
(defmacro define-decoder (encoding (sa st da dt) &body body)
216
`(%register-mapping-part ,encoding 'decoder-factory
217
(named-lambda decoder (,sa ,st ,da ,dt)
220
(defmacro define-octet-counter (encoding (acc type) &body body)
221
`(%register-mapping-part ,encoding 'octet-counter-factory
222
(named-lambda octet-counter-factory (,acc ,type)
225
(defmacro define-code-point-counter (encoding (acc type) &body body)
226
`(%register-mapping-part ,encoding 'code-point-counter-factory
227
(named-lambda code-point-counter (,acc ,type)
230
(defun instantiate-encoder (encoding am octet-seq-getter octet-seq-type
231
code-point-seq-setter code-point-seq-type)
232
(declare (ignore encoding))
233
(funcall (encoder-factory am)
236
code-point-seq-setter
237
code-point-seq-type))
239
(defun instantiate-decoder (encoding am octet-seq-getter octet-seq-type
240
code-point-seq-setter code-point-seq-type)
241
(declare (ignore encoding))
242
(funcall (decoder-factory am)
245
code-point-seq-setter
246
code-point-seq-type))
248
(defun instantiate-code-point-counter (encoding am octet-seq-getter
250
(declare (ignore encoding))
251
(funcall (code-point-counter-factory am)
255
(defun instantiate-octet-counter (encoding am code-point-seq-getter
257
(if (= 1 (enc-max-units-per-char encoding))
258
(make-fixed-width-counter code-point-seq-getter code-point-seq-type
259
(enc-code-unit-size encoding))
260
(funcall (octet-counter-factory am)
261
code-point-seq-getter
262
code-point-seq-type)))
264
;;; Expands into code generated by the available abstract mappings
265
;;; that will be compiled into concrete mappings. This is used in
266
;;; e.g. strings.lisp to define mappings between strings and
267
;;; (unsigned-byte 8) vectors.
269
;;; For each encoding funcall the abstract mappings at macro-expansion
270
;;; time with the src/dest accessors and types to generate the
271
;;; appropriate code for the concrete mappings. These functions are
272
;;; then saved in their respective slots of the CONCRETE-MAPPING
274
(defmacro instantiate-concrete-mappings
275
(&key (encodings (hash-table-keys *abstract-mappings*))
276
(optimize '((speed 3) (debug 0) (compilation-speed 0)))
277
octet-seq-getter octet-seq-setter octet-seq-type
278
code-point-seq-getter code-point-seq-setter code-point-seq-type
279
(instantiate-decoders t))
280
`(let ((ht (make-hash-table :test 'eq)))
281
(declare (optimize ,@optimize)
282
#+sbcl (sb-ext:muffle-conditions sb-ext:compiler-note))
283
(flet ((notice-mapping (encoding-name cm)
284
(let* ((encoding (get-character-encoding encoding-name))
285
(aliases (enc-aliases encoding)))
286
(dolist (kw (cons (enc-name encoding) aliases))
287
(setf (gethash kw ht) cm)))))
288
,@(loop for encoding-name in encodings
289
for encoding = (get-character-encoding encoding-name)
290
for am = (gethash encoding-name *abstract-mappings*)
292
`(let ((cm (make-instance 'concrete-mapping)))
294
,(instantiate-encoder encoding am
295
code-point-seq-getter
299
,(when instantiate-decoders
302
,(instantiate-decoder encoding am
305
code-point-seq-setter
306
code-point-seq-type))
307
(setf (code-point-counter cm)
308
,(instantiate-code-point-counter
309
encoding am octet-seq-getter octet-seq-type))))
310
(setf (octet-counter cm)
311
,(instantiate-octet-counter encoding am
312
code-point-seq-getter
313
code-point-seq-type))
314
(notice-mapping ,encoding-name cm))))
320
(defun pprint-instantiate-concrete-mappings
321
(&key (encodings (hash-table-keys *abstract-mappings*))
322
(optimize '((debug 3) (safety 3)))
323
(octet-seq-setter 'ub-set) (octet-seq-getter 'ub-get)
324
(octet-seq-type '(simple-array (unsigned-byte 8) (*)))
325
(code-point-seq-setter 'string-set)
326
(code-point-seq-getter 'string-get)
327
(code-point-seq-type 'simple-unicode-string))
328
(let ((encodings (ensure-list encodings))
329
(*package* (find-package :babel-encodings))
330
(*print-case* :downcase))
333
`(instantiate-concrete-mappings
334
:encodings ,encodings
336
:octet-seq-getter ,octet-seq-getter
337
:octet-seq-setter ,octet-seq-setter
338
:octet-seq-type ,octet-seq-type
339
:code-point-seq-getter ,code-point-seq-getter
340
:code-point-seq-setter ,code-point-seq-setter
341
:code-point-seq-type ,code-point-seq-type))))
344
;;;; Utilities used in enc-*.lisp
346
(defconstant +default-substitution-code-point+ #x1a
347
"Default ASCII substitution character code point used in case of an encoding/decoding error.")
349
;;; We're converting between objects of the (UNSIGNED-BYTE 8) and
350
;;; (MOD #x110000) types which are aliased here to UB8 and CODE-POINT
352
(deftype ub8 () '(unsigned-byte 8))
353
(deftype code-point () '(mod #x110000))
355
;;; Utility macro around DEFINE-ENCODER that takes care of most of the
356
;;; work need to deal with an 8-bit, fixed-width character encoding.
358
;;; BODY will be inside a loop and its return value will placed in the
359
;;; destination buffer. BODY will be surounded by lexical BLOCK which
360
;;; will have the ENCODING's name, usually a keyword. It handles all
361
;;; sorts of type declarations.
363
;;; See enc-ascii.lisp for a simple usage example.
364
(defmacro define-unibyte-encoder (encoding (code) &body body)
365
(with-unique-names (s-getter s-type d-setter d-type
366
src start end dest d-start i di)
367
`(define-encoder ,encoding (,s-getter ,s-type ,d-setter ,d-type)
368
`(named-lambda ,',(symbolicate encoding '#:-unibyte-encoder)
369
(,',src ,',start ,',end ,',dest ,',d-start)
370
(declare (type ,,s-type ,',src)
371
(type ,,d-type ,',dest)
372
(fixnum ,',start ,',end ,',d-start))
373
(loop for ,',i fixnum from ,',start below ,',end
374
and ,',di fixnum from ,',d-start do
377
;; this should probably be a function...
378
((handle-error (&optional (c ''character-encoding-error))
380
,',',code ,',',encoding ,',',src ,',',i
381
+default-substitution-code-point+ ,c)))
382
(let ((,',code (,,s-getter ,',src ,',i)))
383
(declare (type code-point ,',code))
384
(block ,',encoding ,@',body)))
386
finally (return (the fixnum (- ,',di ,',d-start))))))))
388
;;; The decoder version of the above macro.
389
(defmacro define-unibyte-decoder (encoding (octet) &body body)
390
(with-unique-names (s-getter s-type d-setter d-type
391
src start end dest d-start i di)
392
`(define-decoder ,encoding (,s-getter ,s-type ,d-setter ,d-type)
393
`(named-lambda ,',(symbolicate encoding '#:-unibyte-encoder)
394
(,',src ,',start ,',end ,',dest ,',d-start)
395
(declare (type ,,s-type ,',src)
396
(type ,,d-type ,',dest)
397
(fixnum ,',start ,',end ,',d-start))
398
(loop for ,',i fixnum from ,',start below ,',end
399
and ,',di fixnum from ,',d-start do
402
;; this should probably be a function...
403
((handle-error (&optional (c ''character-decoding-error))
405
(vector ,',',octet) ,',',encoding ,',',src ,',',i
406
+default-substitution-code-point+ ,c)))
407
(let ((,',octet (,,s-getter ,',src ,',i)))
408
(declare (type ub8 ,',octet))
409
(block ,',encoding ,@',body)))
411
finally (return (the fixnum (- ,',di ,',d-start))))))))
413
;;;; Error Conditions
415
;;; For now, we don't define any actual restarts. The only mechanism
416
;;; for "restarting" a coding error is the
417
;;; *SUPPRESS-CHARACTER-CODING-ERRORS* special variable which, when
418
;;; bound to T (the default), suppresses any error and uses a default
419
;;; replacement character instead.
421
;;; If it turns out that other more options are necessary, possible
422
;;; alternative approaches include:
424
;;; a) use a *REPLACEMENT-CHARACTER* special variable that lets us
425
;;; pick our own replacement character. The encoder must do
426
;;; additional work to check if this is character is encodable.
428
;;; b) offer a restart to pick a replacement character. Same
429
;;; problem as above.
431
;;; Both approaches pose encoding problems when dealing with a
432
;;; variable-width encodings because different replacement characters
433
;;; will need different numbers of octets. This is not a problem for
434
;;; UTF but will be a problem for the CJK charsets. Approach (a) is
435
;;; nevertheless easier since the replacement character is known in
436
;;; advance and therefore the octet-counter can account for it.
438
;;; For more complex restarts like SBCL's -- that'll let you specify
439
;;; _several_ replacement characters for a single character error --
440
;;; will probably need extra support code outside the encoder/decoder
441
;;; (i.e. in the string-to-octets function, for example) since the
442
;;; encoders/decoders deal with pre-allocated fixed-length buffers.
444
;;; SBCL has ASCII-specific (MALFORMED-ASCII) and UTF8-specific
445
;;; errors. Why? Do we want to add some of those too?
447
;;; FIXME: We used to deal with this with an extra ERRORP argument for
448
;;; encoders, decoders, etc... Still undecided on the best way to do
449
;;; it. We could also use a simple restart instead of this...
451
;;; In any case, this is not for the users to bind and it's not
452
;;; exported from the BABEL package.
453
(defvar *suppress-character-coding-errors* nil
454
"If non-NIL, encoding or decoding errors are suppressed and the
455
the current character encoding's default replacement character is
458
;;; All of Babel's error conditions are subtypes of
459
;;; CHARACTER-CODING-ERROR. This error hierarchy is based on SBCL's.
460
(define-condition character-coding-error (error)
461
((buffer :initarg :buffer :reader character-coding-error-buffer)
462
(position :initarg :position :reader character-coding-error-position)
463
(encoding :initarg :encoding :reader character-coding-error-encoding)))
465
(define-condition character-encoding-error (character-coding-error)
466
((code :initarg :code :reader character-encoding-error-code))
467
(:report (lambda (c s)
468
(format s "Unable to encode character code point ~A as ~S."
469
(character-encoding-error-code c)
470
(character-coding-error-encoding c)))))
472
(declaim (inline encoding-error))
473
(defun encoding-error (code enc buf pos &optional
474
(sub +default-substitution-code-point+)
475
(e 'character-encoding-error))
476
(unless *suppress-character-coding-errors*
477
(error e :encoding enc :buffer buf :position pos :code code))
480
(define-condition character-decoding-error (character-coding-error)
481
((octets :initarg :octets :reader character-decoding-error-octets))
482
(:report (lambda (c s)
483
(format s "Illegal ~S character starting at position ~D."
484
(character-coding-error-encoding c)
485
(character-coding-error-position c)))))
487
(define-condition end-of-input-in-character (character-decoding-error)
489
(:documentation "Signalled by DECODERs or CODE-POINT-COUNTERs
490
of variable-width character encodings."))
492
(define-condition character-out-of-range (character-decoding-error)
495
"Signalled when the character being decoded is out of range."))
497
(declaim (inline decoding-error))
498
(defun decoding-error (octets enc buf pos &optional
499
(sub +default-substitution-code-point+)
500
(e 'character-decoding-error))
501
(unless *suppress-character-coding-errors*
502
(error e :octets octets :encoding enc :buffer buf :position pos))