Coverage report: /home/ellis/.stash/quicklisp/dists/ultralisp/software/cl-babel-babel-20240610131823/src/strings.lisp
Kind | Covered | All | % |
expression | 8 | 323 | 2.5 |
branch | 1 | 8 | 12.5 |
Key
Not instrumented
Conditionalized out
Executed
Not executed
Both branches taken
One branch taken
Neither branch taken
1
;;;; -*- Mode: lisp; indent-tabs-mode: nil -*-
3
;;; strings.lisp --- Conversions between strings and UB8 vectors.
5
;;; Copyright (C) 2007, Luis Oliveira <loliveira@common-lisp.net>
7
;;; Permission is hereby granted, free of charge, to any person
8
;;; obtaining a copy of this software and associated documentation
9
;;; files (the "Software"), to deal in the Software without
10
;;; restriction, including without limitation the rights to use, copy,
11
;;; modify, merge, publish, distribute, sublicense, and/or sell copies
12
;;; of the Software, and to permit persons to whom the Software is
13
;;; furnished to do so, subject to the following conditions:
15
;;; The above copyright notice and this permission notice shall be
16
;;; included in all copies or substantial portions of the Software.
18
;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19
;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20
;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21
;;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
22
;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
23
;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24
;;; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25
;;; DEALINGS IN THE SOFTWARE.
29
;;; The usefulness of this string/octets interface of Babel's is very
30
;;; limited on Lisps with 8-bit characters which will in effect only
31
;;; support the latin-1 subset of Unicode. That is, all encodings are
32
;;; supported but we can only store the first 256 code points in Lisp
33
;;; strings. Support for using other 8-bit encodings for strings on
34
;;; these Lisps could be added with an extra encoding/decoding step.
35
;;; Supporting other encodings with larger code units would be silly
36
;;; (it would break expectations about common string operations) and
37
;;; better done with something like Closure's runes.
39
;;; Can we handle unicode fully?
40
(eval-when (:compile-toplevel :load-toplevel :execute)
41
;; The EVAL is just here to avoid warnings...
42
(case (eval char-code-limit)
43
(#x100 (pushnew '8-bit-chars *features*))
44
(#x10000 (pushnew 'ucs-2-chars *features*))
46
;; This is here mostly because if the CHAR-CODE-LIMIT is bigger
47
;; than #x11000, strange things might happen but we probably
48
;; shouldn't descriminate against other, smaller, values.
49
(t (error "Strange CHAR-CODE-LIMIT (#x~X), bailing out."
52
;;; Adapted from Ironclad. TODO: check if it's worthwhile adding
53
;;; implementation-specific accessors such as SAP-REF-* for SBCL.
54
(defmacro ub-get (vector index &optional (bytes 1) (endianness :ne))
55
(let ((big-endian (member endianness
56
'(:be #+big-endian :ne #+little-endian :re))))
57
(once-only (vector index)
59
,(1- (ash 1 (* 8 bytes)))
61
,@(loop for i from 0 below bytes
62
for offset = (if big-endian i (- bytes i 1))
63
for shift = (if big-endian
66
collect `(ash (aref ,vector (+ ,index ,offset)) ,shift)))))))
68
(defmacro ub-set (value vector index &optional (bytes 1) (endianness :ne))
69
(let ((big-endian (member endianness
70
'(:be #+big-endian :ne #+little-endian :re))))
72
,@(loop for i from 1 to bytes
73
for offset = (if big-endian (- bytes i) (1- i)) collect
74
`(setf (aref ,vector (+ ,index ,offset))
75
(ldb (byte 8 ,(* 8 (1- i))) ,value)))
78
(defmacro string-get (string index)
79
`(char-code (schar ,string ,index)))
81
(defmacro string-set (code string index)
82
`(setf (schar ,string ,index) (code-char ,code)))
84
;;; SIMPLE-BASE-STRING would also be a subtype of SIMPLE-STRING so we
85
;;; don't use that because on SBCL BASE-CHARs can only hold ASCII.
86
;;; Also, with (> SPEED SAFETY) (setf (schar base-str n) big-char)
87
;;; will quietly work, sort of.
89
;;; XXX: test this on various lisps.
91
(defconstant unicode-char-code-limit
93
"An alias for CL:CHAR-CODE-LIMIT which might be lower than
94
#x110000 on some Lisps.")
96
(deftype unicode-char ()
97
"This character type can hold any characters whose CHAR-CODEs
98
are less than UNICODE-CHAR-CODE-LIMIT."
99
#+lispworks 'lw:simple-char
100
#-lispworks 'character)
102
(deftype simple-unicode-string ()
103
"Alias for (SIMPLE-ARRAY UNICODE-CHAR (*))."
104
'(simple-array unicode-char (*)))
106
(deftype unicode-string ()
107
"Alias for (VECTOR UNICODE-CHAR *)."
108
'(vector unicode-char *))
110
(defparameter *string-vector-mappings*
111
(instantiate-concrete-mappings
112
;; :optimize ((speed 3) (safety 0) (debug 0) (compilation-speed 0))
113
:octet-seq-setter ub-set
114
:octet-seq-getter ub-get
115
:octet-seq-type (simple-array (unsigned-byte 8) (*))
116
:code-point-seq-setter string-set
117
:code-point-seq-getter string-get
118
:code-point-seq-type simple-unicode-string))
121
(defparameter *simple-base-string-vector-mappings*
122
(instantiate-concrete-mappings
123
;; :optimize ((speed 3) (safety 0) (debug 0) (compilation-speed 0))
124
:instantiate-decoders nil
125
:octet-seq-setter ub-set
126
:octet-seq-getter ub-get
127
:octet-seq-type (simple-array (unsigned-byte 8) (*))
128
:code-point-seq-setter string-set
129
:code-point-seq-getter string-get
130
:code-point-seq-type simple-base-string))
132
;;; Do we want a more a specific error condition here?
133
(defun check-vector-bounds (vector start end)
134
(unless (<= 0 start end (length vector))
135
(error "Invalid start (~A) and end (~A) values for vector of length ~A."
136
start end (length vector))))
138
(defmacro with-simple-vector (((v vector) (s start) (e end)) &body body)
139
"If VECTOR is a displaced or adjustable array, binds V to the
140
underlying simple vector, adds an adequate offset to START and
141
END and binds those offset values to S and E. Otherwise, if
142
VECTOR is already a simple array, it's simply bound to V with no
145
START and END are unchecked and assumed to be within bounds.
147
Note that in some Lisps, a slow copying implementation is
148
necessary to obtain a simple vector thus V will be bound to a
149
copy of VECTOR coerced to a simple-vector. Therefore, you
150
shouldn't attempt to modify V."
152
`(sb-kernel:with-array-data ((,v ,vector) (,s ,start) (,e ,end))
155
`(lisp::with-array-data ((,v ,vector) (,s ,start) (,e ,end))
158
(with-unique-names (offset)
159
`(multiple-value-bind (,v ,offset)
160
(ccl::array-data-and-offset ,vector)
161
(let ((,s (+ ,start ,offset))
162
(,e (+ ,end ,offset)))
165
(with-unique-names (offset)
166
`(excl::with-underlying-simple-vector (,vector ,v ,offset)
167
(let ((,e (+ ,end ,offset))
168
(,s (+ ,start ,offset)))
170
;; slow, copying implementation
171
#-(or sbcl cmu scl openmcl allegro)
173
`(funcall (if (adjustable-array-p ,vector)
174
#'call-with-array-data/copy
175
#'call-with-array-data/fast)
177
(lambda (,v ,s ,e) ,@body))))
179
#-(or sbcl cmu scl openmcl allegro)
182
(defun array-data-and-offset (array)
183
(loop with offset = 0 do
184
(multiple-value-bind (displaced-to index-offset)
185
(array-displacement array)
186
(when (null displaced-to)
187
(return-from array-data-and-offset
188
(values array offset)))
189
(incf offset index-offset)
190
(setf array displaced-to))))
192
(defun call-with-array-data/fast (vector start end fn)
193
(multiple-value-bind (data offset)
194
(array-data-and-offset vector)
195
(funcall fn data (+ offset start) (+ offset end))))
197
(defun call-with-array-data/copy (vector start end fn)
198
(funcall fn (replace (make-array (- end start) :element-type
199
(array-element-type vector))
200
vector :start2 start :end2 end)
203
(defmacro with-checked-simple-vector (((v vector) (s start) (e end)) &body body)
204
"Like WITH-SIMPLE-VECTOR but bound-checks START and END."
205
(once-only (vector start)
206
`(let ((,e (or ,end (length ,vector))))
207
(check-vector-bounds ,vector ,start ,e)
208
(with-simple-vector ((,v ,vector) (,s ,start) (,e ,e))
211
;;; Future features these functions should have:
214
;;; * specify target vector/string + offset
215
;;; * documentation :)
217
(declaim (inline octets-to-string string-to-octets string-size-in-octets
218
vector-size-in-chars concatenate-strings-to-octets
221
(defun octets-to-string (vector &key (start 0) end
222
(errorp (not *suppress-character-coding-errors*))
223
(encoding *default-character-encoding*))
224
(check-type vector (vector (unsigned-byte 8)))
225
(with-checked-simple-vector ((vector vector) (start start) (end end))
226
(declare (type (simple-array (unsigned-byte 8) (*)) vector))
227
(let ((*suppress-character-coding-errors* (not errorp))
228
(mapping (lookup-mapping *string-vector-mappings* encoding)))
229
(multiple-value-bind (size new-end)
230
(funcall (code-point-counter mapping) vector start end -1)
231
;; TODO we could optimize ASCII here: the result should
232
;; be a simple-base-string filled using code-char...
233
(let ((string (make-string size :element-type 'unicode-char)))
234
(funcall (decoder mapping) vector start new-end string 0)
237
(defun bom-vector (encoding use-bom)
238
(check-type use-bom (member :default t nil))
242
(let ((enc (typecase encoding
243
(external-format (external-format-encoding encoding))
244
(t (get-character-encoding encoding)))))
245
(if (or (eq use-bom t)
246
(and (eq use-bom :default) (enc-use-bom enc)))
247
;; VALUES avoids a "type assertion too complex to check" note.
248
(values (enc-bom-encoding enc))
251
(defun string-to-octets (string &key (encoding *default-character-encoding*)
252
(start 0) end (use-bom :default)
253
(errorp (not *suppress-character-coding-errors*)))
254
(declare (optimize (speed 3) (safety 2)))
255
(let ((*suppress-character-coding-errors* (not errorp)))
257
;; On some lisps (e.g. clisp and ccl) all strings are BASE-STRING and all
258
;; characters are BASE-CHAR. So, only enable this optimization for
263
(setf end (length string)))
264
(check-vector-bounds string start end)
265
(let* ((mapping (lookup-mapping *simple-base-string-vector-mappings*
267
(bom (bom-vector encoding use-bom))
268
(bom-length (length bom))
269
;; OPTIMIZE: we could use the (length string) information here
270
;; because it's a simple-base-string where each character <= 127
273
(funcall (the function (octet-counter mapping))
274
string start end -1))
276
:element-type '(unsigned-byte 8))))
278
(funcall (the function (encoder mapping))
279
string start end result bom-length)
282
;; FIXME: we shouldn't really need that coercion to UNICODE-STRING
283
;; but we kind of because it's declared all over. To avoid that,
284
;; we'd need different types for input and output strings. Or maybe
285
;; this is not a problem; figure that out.
286
(with-checked-simple-vector ((string (coerce string 'unicode-string))
287
(start start) (end end))
288
(declare (type simple-unicode-string string))
289
(let* ((mapping (lookup-mapping *string-vector-mappings* encoding))
290
(bom (bom-vector encoding use-bom))
291
(bom-length (length bom))
294
(funcall (the function (octet-counter mapping))
295
string start end -1))
297
:element-type '(unsigned-byte 8))))
299
(funcall (the function (encoder mapping))
300
string start end result bom-length)
303
(defun concatenate-strings-to-octets (encoding &rest strings)
304
"Optimized equivalent of
305
\(string-to-octets \(apply #'concatenate 'string strings)
307
(declare (dynamic-extent strings))
308
(let* ((mapping (lookup-mapping *string-vector-mappings* encoding))
309
(octet-counter (octet-counter mapping))
313
:key (lambda (string)
314
(funcall octet-counter
315
string 0 (length string) -1))))
316
:element-type '(unsigned-byte 8)))
318
(declare (type array-index current-index))
319
(dolist (string strings)
320
(check-type string string)
321
(with-checked-simple-vector ((string (coerce string 'unicode-string))
322
(start 0) (end (length string)))
323
(declare (type simple-unicode-string string))
325
(funcall (encoder mapping)
326
string start end vector current-index))))
329
(defun string-size-in-octets (string &key (start 0) end (max -1 maxp)
330
(errorp (not *suppress-character-coding-errors*))
331
(encoding *default-character-encoding*))
332
(check-type string string)
333
(with-checked-simple-vector ((string (coerce string 'unicode-string))
334
(start start) (end end))
335
(declare (type simple-unicode-string string))
336
(let ((mapping (lookup-mapping *string-vector-mappings* encoding))
337
(*suppress-character-coding-errors* (not errorp)))
338
(when maxp (assert (plusp max)))
339
(funcall (octet-counter mapping) string start end max))))
341
(defun vector-size-in-chars (vector &key (start 0) end (max -1 maxp)
342
(errorp (not *suppress-character-coding-errors*))
343
(encoding *default-character-encoding*))
344
(check-type vector (vector (unsigned-byte 8)))
345
(with-checked-simple-vector ((vector vector) (start start) (end end))
346
(declare (type (simple-array (unsigned-byte 8) (*)) vector))
347
(let ((mapping (lookup-mapping *string-vector-mappings* encoding))
348
(*suppress-character-coding-errors* (not errorp)))
349
(when maxp (assert (plusp max)))
350
(funcall (code-point-counter mapping) vector start end max))))
352
(declaim (notinline octets-to-string string-to-octets string-size-in-octets
353
vector-size-in-chars concatenate-strings-to-octets))