Coverage report: /home/ellis/.stash/quicklisp/dists/ultralisp/software/cl-babel-babel-20240610131823/src/enc-unicode.lisp

Kind	Covered	All	%
expression	0	130	0.0
branch	0	6	0.0

Key

Not instrumented

Conditionalized out

Executed

Not executed

Both branches taken

One branch taken

Neither branch taken

1

;;;; -*- Mode: lisp; indent-tabs-mode: nil -*-

2

;;;

3

;;; enc-unicode.lisp --- Unicode encodings.

4

;;;

5

6

;;;

7

;;; Permission is hereby granted, free of charge, to any person

8

;;; obtaining a copy of this software and associated documentation

9

;;; files (the "Software"), to deal in the Software without

10

;;; restriction, including without limitation the rights to use, copy,

11

;;; modify, merge, publish, distribute, sublicense, and/or sell copies

12

;;; of the Software, and to permit persons to whom the Software is

13

;;; furnished to do so, subject to the following conditions:

14

;;;

15

;;; The above copyright notice and this permission notice shall be

16

;;; included in all copies or substantial portions of the Software.

17

;;;

18

;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,

19

;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF

20

;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND

21

;;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT

22

;;; HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,

23

;;; WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

24

;;; OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER

25

;;; DEALINGS IN THE SOFTWARE.

26

27

;;; This implementation is largely based on OpenMCL's l1-unicode.lisp

28

29

30

(in-package #:babel-encodings)

31

32

(eval-when (:compile-toplevel :load-toplevel :execute)

33

(defconstant +repl+ #xfffd "Unicode replacement character code point.")

34

(defconstant +byte-order-mark-code+ #xfeff)

35

(defconstant +swapped-byte-order-mark-code+ #xfffe)

36

(defconstant +swapped-byte-order-mark-code-32+ #xfffe0000))

37

38

;;; Some convenience macros adding FIXNUM declarations.

39

(defmacro f-ash (integer count) `(the fixnum (ash ,integer ,count)))

40

(defmacro f-logior (&rest integers) `(the fixnum (logior ,@integers)))

41

(defmacro f-logand (&rest integers) `(the fixnum (logand ,@integers)))

42

(defmacro f-logxor (&rest integers) `(the fixnum (logxor ,@integers)))

43

44

;;;; UTF-8

45

46

(define-character-encoding :utf-8

47

"An 8-bit, variable-length character encoding in which

48

character code points in the range #x00-#x7f can be encoded in a

49

single octet; characters with larger code values can be encoded

50

in 2 to 4 bytes."

51

:max-units-per-char 4

52

:literal-char-code-limit #x80

53

:bom-encoding #(#xef #xbb #xbf)

54

:default-replacement #xfffd)

55

56

(define-condition invalid-utf8-starter-byte (character-decoding-error)

57

()

58

(:documentation "Signalled when an invalid UTF-8 starter byte is found."))

59

60

(define-condition invalid-utf8-continuation-byte (character-decoding-error)

61

()

62

(:documentation

63

"Signalled when an invalid UTF-8 continuation byte is found."))

64

65

(define-condition overlong-utf8-sequence (character-decoding-error)

66

()

67

(:documentation "Signalled upon overlong UTF-8 sequences."))

68

69

(define-octet-counter :utf-8 (getter type)

70

`(named-lambda utf-8-octet-counter (seq start end max)

71

(declare (type ,type seq) (fixnum start end max))

72

(loop with noctets fixnum = 0

73

for i fixnum from start below end

74

for code of-type code-point = (,getter seq i) do

75

(let ((new (+ (cond ((< code #x80) 1)

76

((< code #x800) 2)

77

((< code #x10000) 3)

78

(t 4))

79

noctets)))

80

(if (and (plusp max) (> new max))

81

(loop-finish)

82

(setq noctets new)))

83

finally (return (values noctets i)))))

84

85

(define-code-point-counter :utf-8 (getter type)

86

`(named-lambda utf-8-code-point-counter (seq start end max)

87

(declare (type ,type seq) (fixnum start end max))

88

(loop with nchars fixnum = 0

89

with i fixnum = start

90

while (< i end) do

91

;; check for invalid continuation bytes

92

(macrolet ((invalid-cb-p (n)

93

`(and (< (+ i ,n) end)

94

(not (< #x7f (,',getter seq (+ i ,n)) #xc0)))))

95

;; wrote this code with LET instead of FOR because CLISP's

96

;; LOOP doesn't like WHILE clauses before FOR clauses.

97

(let* ((octet (,getter seq i))

98

(next-i (+ i (cond ((or (< octet #xc0) (invalid-cb-p 1)) 1)

99

                                        ((or (< octet #xe0) (invalid-cb-p 2)) 2)

100

                                        ((or (< octet #xf0) (invalid-cb-p 3)) 3)

101

                                        ((or (< octet #xf8) (invalid-cb-p 4)) 4)

102

                                        ((or (< octet #xfc) (invalid-cb-p 5)) 5)

103

(t 6)))))

104

(declare (type ub8 octet) (fixnum next-i))

105

(cond

106

((> next-i end)

107

;; Should we add restarts to this error, we'll have

108

;; to figure out a way to communicate with the

109

;; decoder since we probably want to do something

110

;; about it right here when we have a chance to

111

;; change the count or something. (Like an

112

;; alternative replacement character or perhaps the

113

;; existence of this error so that the decoder

114

;; doesn't have to check for it on every iteration

115

;; like we do.)

116

;;

117

;; FIXME: The data for this error is not right.

118

(decoding-error (vector octet) :utf-8 seq i

119

nil 'end-of-input-in-character)

120

(return (values (1+ nchars) end)))

121

(t

122

(setq nchars (1+ nchars)

123

i next-i)

124

(when (and (plusp max) (= nchars max))

125

(return (values nchars i)))))))

126

finally (progn

127

(assert (= i end))

128

(return (values nchars i))))))

129

130

(define-encoder :utf-8 (getter src-type setter dest-type)

131

`(named-lambda utf-8-encoder (src start end dest d-start)

132

(declare (type ,src-type src)

133

(type ,dest-type dest)

134

(fixnum start end d-start))

135

(loop with di fixnum = d-start

136

for i fixnum from start below end

137

for code of-type code-point = (,getter src i) do

138

(macrolet ((set-octet (offset value)

139

`(,',setter ,value dest (the fixnum (+ di ,offset)))))

140

(cond

141

;; 1 octet

142

((< code #x80)

143

(set-octet 0 code)

144

(incf di))

145

;; 2 octets

146

((< code #x800)

147

(set-octet 0 (logior #xc0 (f-ash code -6)))

148

(set-octet 1 (logior #x80 (f-logand code #x3f)))

149

(incf di 2))

150

;; 3 octets

151

((< code #x10000)

152

(set-octet 0 (logior #xe0 (f-ash code -12)))

153

(set-octet 1 (logior #x80 (f-logand #x3f (f-ash code -6))))

154

(set-octet 2 (logior #x80 (f-logand code #x3f)))

155

(incf di 3))

156

;; 4 octets

157

(t

158

(set-octet 0 (logior #xf0 (f-logand #x07 (f-ash code -18))))

159

(set-octet 1 (logior #x80 (f-logand #x3f (f-ash code -12))))

160

(set-octet 2 (logior #x80 (f-logand #x3f (f-ash code -6))))

161

(set-octet 3 (logior #x80 (logand code #x3f)))

162

(incf di 4))))

163

finally (return (the fixnum (- di d-start))))))

164

165

(define-decoder :utf-8 (getter src-type setter dest-type)

166

`(named-lambda utf-8-decoder (src start end dest d-start)

167

(declare (type ,src-type src)

168

(type ,dest-type dest)

169

(fixnum start end d-start))

170

(let ((u2 0) (u3 0) (u4 0) (u5 0) (u6 0))

171

(declare (type ub8 u2 u3 u4 u5 u6))

172

(loop for di fixnum from d-start

173

for i fixnum from start below end

174

for u1 of-type ub8 = (,getter src i) do

175

;; Note: CONSUME-OCTET doesn't check if I is being

176

;; incremented past END. We're assuming that END has

177

;; been calculated with the CODE-POINT-POINTER above that

178

;; checks this.

179

(macrolet

180

((consume-octet ()

181

`(let ((next-i (incf i)))

182

(if (= next-i end)

183

;; FIXME: data for this error is incomplete.

184

;; and signalling this error twice

185

(return-from setter-block

186

(decoding-error nil :utf-8 src i +repl+

187

'end-of-input-in-character))

188

(,',getter src next-i))))

189

(handle-error (n &optional (c 'character-decoding-error))

190

`(decoding-error

191

(vector ,@(subseq '(u1 u2 u3 u4 u5 u6) 0 n))

192

:utf-8 src (1+ (- i ,n)) +repl+ ',c))

193

(handle-error-if-icb (var n)

194

`(when (not (< #x7f ,var #xc0))

195

(decf i)

196

(return-from setter-block

197

(handle-error ,n invalid-utf8-continuation-byte)))))

198

(,setter

199

(block setter-block

200

(cond

201

((< u1 #x80) u1) ; 1 octet

202

((< u1 #xc0)

203

(handle-error 1 invalid-utf8-starter-byte))

204

(t

205

(setq u2 (consume-octet))

206

(handle-error-if-icb u2 1)

207

(cond

208

((< u1 #xc2)

209

(handle-error 2 overlong-utf8-sequence))

210

((< u1 #xe0) ; 2 octets

211

(logior (f-ash (f-logand #x1f u1) 6)

212

(f-logxor u2 #x80)))

213

(t

214

(setq u3 (consume-octet))

215

(handle-error-if-icb u3 2)

216

(cond

217

((and (= u1 #xe0) (< u2 #xa0))

218

(handle-error 3 overlong-utf8-sequence))

219

((< u1 #xf0) ; 3 octets

220

(let ((start (f-logior (f-ash (f-logand u1 #x0f) 12)

221

                                                   (f-ash (f-logand u2 #x3f) 6))))

222

(if (<= #xd800 start #xdfc0)

223

(handle-error 3 character-out-of-range)

224

(logior start (f-logand u3 #x3f)))))

225

(t ; 4 octets

226

(setq u4 (consume-octet))

227

(handle-error-if-icb u4 3)

228

(cond

229

((and (= u1 #xf0) (< u2 #x90))

230

(handle-error 4 overlong-utf8-sequence))

231

((< u1 #xf8)

232

                               (if (or (> u1 #xf4) (and (= u1 #xf4) (> u2 #x8f)))

233

(handle-error 4 character-out-of-range)

234

(f-logior (f-ash (f-logand u1 7) 18)

235

                                             (f-ash (f-logxor u2 #x80) 12)

236

                                             (f-ash (f-logxor u3 #x80) 6)

237

(f-logxor u4 #x80))))

238

;; from here on we'll be getting either

239

;; invalid continuation bytes or overlong

240

;; 5-byte or 6-byte sequences.

241

(t

242

(setq u5 (consume-octet))

243

(handle-error-if-icb u5 4)

244

(cond

245

((and (= u1 #xf8) (< u2 #x88))

246

(handle-error 5 overlong-utf8-sequence))

247

((< u1 #xfc)

248

(handle-error 5 character-out-of-range))

249

(t

250

(setq u6 (consume-octet))

251

(handle-error-if-icb u6 5)

252

(cond

253

((and (= u1 #xfc) (< u2 #x84))

254

(handle-error 6 overlong-utf8-sequence))

255

(t

256

(handle-error 6 character-out-of-range)

257

)))))))))))))

258

dest di))

259

finally (return (the fixnum (- di d-start)))))))

260

261

;;;; UTF-8B

262

263

;;; The following excerpt from a linux-utf8 message by Markus Kuhn is

264

;;; the closest thing to a UTF-8B specification:

265

;;;

266

;;; <http://mail.nl.linux.org/linux-utf8/2000-07/msg00040.html>

267

;;;

268

;;; "D) Emit a malformed UTF-16 sequence for every byte in a malformed

269

;;; UTF-8 sequence

270

;;;

271

;;; All the previous options for converting malformed UTF-8 sequences

272

;;; to UTF-16 destroy information. This can be highly undesirable in

273

;;; applications such as text file editors, where guaranteed binary

274

;;; transparency is a desireable feature. (E.g., I frequently edit

275

;;; executable code or graphic files with the Emacs text editor and I

276

;;; hate the idea that my editor might automatically make U+FFFD

277

;;; substitutions at locations that I haven't even edited when I save

278

;;; the file again.)

279

;;;

280

;;; I therefore suggested 1999-11-02 on the unicode@xxxxxxxxxxx

281

;;; mailing list the following approach. Instead of using U+FFFD,

282

;;; simply encode malformed UTF-8 sequences as malformed UTF-16

283

;;; sequences. Malformed UTF-8 sequences consist excludively of the

284

;;; bytes 0x80 - 0xff, and each of these bytes can be represented

285

;;; using a 16-bit value from the UTF-16 low-half surrogate zone

286

;;; U+DC80 to U+DCFF. Thus, the overlong "K" (U+004B) 0xc1 0x8b from

287

;;; the above example would be represented in UTF-16 as U+DCC1

288

;;; U+DC8B. If we simply make sure that every UTF-8 encoded surrogate

289

;;; character is also treated like a malformed sequence, then there

290

;;; is no way that a single high-half surrogate could precede the

291

;;; encoded malformed sequence and cause a valid UTF-16 sequence to

292

;;; emerge.

293

;;;

294

;;; This way 100% binary transparent UTF-8 -> UTF-16 -> UTF-8

295

;;; round-trip compatibility can be achieved quite easily.

296

;;;

297

;;; On an output device, a lonely low-half surrogate character should

298

;;; be treated just like a character outside the adopted subset of

299

;;; representable characters, that is for the end user, the display

300

;;; would look exactly like with semantics B), i.e. one symbol per

301

;;; byte of a malformed sequence. However in contrast to semantics

302

;;; B), no information is thrown away, and a cut&paste in an editor

303

;;; or terminal emulator will be guaranteed to reconstruct the

304

;;; original byte sequence. This should greatly reduce the incidence

305

;;; of accidental corruption of binary data by UTF-8 -> UTF-16 ->

306

;;; UTF-8 conversion round trips."

307

308

(define-character-encoding :utf-8b

309

"An 8-bit, variable-length character encoding in which

310

character code points in the range #x00-#x7f can be encoded in a

311

single octet; characters with larger code values can be encoded

312

in 2 to 4 bytes. Invalid UTF-8 sequences are encoded with #xDCXX

313

code points for each invalid byte."

314

:max-units-per-char 4

315

:literal-char-code-limit #x80

316

:bom-encoding #(#xef #xbb #xbf)

317

:default-replacement nil)

318

319

;;; TODO: reuse the :UTF-8 octet counter through a simple macro.

320

(define-octet-counter :utf-8b (getter type)

321

`(named-lambda utf-8b-octet-counter (seq start end max)

322

(declare (type ,type seq) (fixnum start end max))

323

(loop with noctets fixnum = 0

324

for i fixnum from start below end

325

for code of-type code-point = (,getter seq i) do

326

(let ((new (+ (cond ((< code #x80) 1)

327

((< code #x800) 2)

328

((<= #xdc80 code #xdcff) 1)

329

((< code #x10000) 3)

330

(t 4))

331

noctets)))

332

(if (and (plusp max) (> new max))

333

(loop-finish)

334

(setq noctets new)))

335

finally (return (values noctets i)))))

336

337

(define-code-point-counter :utf-8b (getter type)

338

`(named-lambda utf-8b-code-point-counter (seq start end max)

339

(declare (type ,type seq) (fixnum start end max))

340

(loop with nchars fixnum = 0

341

with i fixnum = start

342

while (< i end) do

343

;; wrote this code with LET instead of FOR because CLISP's

344

;; LOOP doesn't like WHILE clauses before FOR clauses.

345

(let* ((octet (,getter seq i))

346

(noctets (cond ((< octet #x80) 1)

347

((< octet #xe0) 2)

348

((< octet #xf0) 3)

349

(t 4))))

350

(declare (type ub8 octet) (fixnum noctets))

351

(cond

352

((> (+ i noctets) end)

353

;; If this error is suppressed these last few bytes

354

;; will be encoded as raw bytes later.

355

(decoding-error (vector octet) :utf-8 seq i

356

nil 'end-of-input-in-character)

357

(return (values (+ nchars (- end i)) end)))

358

(t

359

;; FIXME: clean this mess up.

360

(let* ((u1 octet)

361

(u2 (if (>= noctets 2) (,getter seq (1+ i)) 0))

362

(u3 (if (>= noctets 3) (,getter seq (+ i 2)) 0))

363

(u4 (if (= noctets 4) (,getter seq (+ i 3)) 0))

364

(inc (or (and (> noctets 1)

365

(< u1 #xc2))

366

(and (= noctets 2)

367

(not (logior u2 #x40)))

368

(and (= noctets 3)

369

(not (and (< (f-logxor u2 #x80) #x40)

370

                                                (< (f-logxor u3 #x80) #x40)

371

                                                (or (>= u1 #xe1) (>= u2 #xa0))

372

                                                (or (/= u1 #xed) (< u2 #xa0) (> u2 #xbf)))))

373

(and (= noctets 4)

374

(not

375

(and (< (f-logxor u2 #x80) #x40)

376

(< (f-logxor u3 #x80) #x40)

377

(< (f-logxor u4 #x80) #x40)

378

                                            (or (>= u1 #xf1) (>= u2 #x90))))))))

379

(let ((new-nchars (if inc (+ nchars noctets) (1+ nchars))))

380

(when (and (plusp max) (> new-nchars max))

381

(return (values nchars i)))

382

(incf i noctets)

383

(setq nchars new-nchars))))))

384

finally (progn

385

(assert (= i end))

386

(return (values nchars i))))))

387

388

;;; TODO: reuse the :UTF-8 encoder with through a simple macro.

389

(define-encoder :utf-8b (getter src-type setter dest-type)

390

`(named-lambda utf-8b-encoder (src start end dest d-start)

391

(declare (type ,src-type src)

392

(type ,dest-type dest)

393

(fixnum start end d-start))

394

(loop with di fixnum = d-start

395

for i fixnum from start below end

396

for code of-type code-point = (,getter src i) do

397

(macrolet ((set-octet (offset value)

398

`(,',setter ,value dest (the fixnum (+ di ,offset)))))

399

(cond

400

;; 1 octet

401

((< code #x80)

402

(set-octet 0 code)

403

(incf di))

404

;; 2 octets

405

((< code #x800)

406

(set-octet 0 (logior #xc0 (f-ash code -6)))

407

(set-octet 1 (logior #x80 (f-logand code #x3f)))

408

(incf di 2))

409

;; 1 octet (invalid octet)

410

((<= #xdc80 code #xdcff)

411

(set-octet 0 (f-logand code #xff))

412

(incf di))

413

;; 3 octets

414

((< code #x10000)

415

(set-octet 0 (logior #xe0 (f-ash code -12)))

416

(set-octet 1 (logior #x80 (f-logand #x3f (f-ash code -6))))

417

(set-octet 2 (logior #x80 (f-logand code #x3f)))

418

(incf di 3))

419

;; 4 octets

420

(t

421

(set-octet 0 (logior #xf0 (f-logand #x07 (f-ash code -18))))

422

(set-octet 1 (logior #x80 (f-logand #x3f (f-ash code -12))))

423

(set-octet 2 (logior #x80 (f-logand #x3f (f-ash code -6))))

424

(set-octet 3 (logand #x3f code))

425

(incf di 4))))

426

finally (return (the fixnum (- di d-start))))))

427

428

(define-decoder :utf-8b (getter src-type setter dest-type)

429

`(named-lambda utf-8b-decoder (src start end dest d-start)

430

(declare (type ,src-type src)

431

(type ,dest-type dest)

432

(fixnum start end d-start))

433

(let ((u2 0) (u3 0) (u4 0))

434

(declare (type ub8 u2 u3 u4))

435

(loop for di fixnum from d-start

436

for i fixnum from start below end

437

for u1 of-type ub8 = (,getter src i) do

438

;; Unlike the UTF-8 version, this version of

439

;; CONSUME-OCTET needs to check if I is being incremented

440

;; past END because we might have trailing binary

441

;; garbage.

442

(macrolet

443

((consume-octet (n)

444

`(if (= i (1- end))

445

(encode-raw-octets ,n)

446

(,',getter src (incf i))))

447

(encode-raw-octets (n)

448

`(progn

449

,@(loop for i below n and var in '(u1 u2 u3 u4)

450

collect `(,',setter (logior #xdc00 ,var) dest di)

451

unless (= i (1- n))

452

collect '(incf di))

453

(return-from set-body))))

454

(block set-body

455

(,setter (cond

456

((< u1 #x80) ; 1 octet

457

u1)

458

((>= u1 #xc2)

459

(setq u2 (consume-octet 1))

460

(cond

461

((< u1 #xe0) ; 2 octets

462

(if (< (f-logxor u2 #x80) #x40)

463

(logior (f-ash (f-logand #x1f u1) 6)

464

(f-logxor u2 #x80))

465

(encode-raw-octets 2)))

466

(t

467

(setq u3 (consume-octet 2))

468

(cond

469

((< u1 #xf0) ; 3 octets

470

(if (and (< (f-logxor u2 #x80) #x40)

471

                                             (< (f-logxor u3 #x80) #x40)

472

                                             (or (>= u1 #xe1) (>= u2 #xa0)))

473

                                        (let ((start (f-logior (f-ash (f-logand u1 #x0f) 12)

474

                                                               (f-ash (f-logand u2 #x3f) 6))))

475

(if (<= #xd800 start #xdfc0)

476

(encode-raw-octets 3)

477

                                              (logior start (f-logand u3 #x3f))))

478

(encode-raw-octets 3)))

479

(t ; 4 octets

480

(setq u4 (consume-octet 3))

481

(if (and (< (f-logxor u2 #x80) #x40)

482

                                             (< (f-logxor u3 #x80) #x40)

483

                                             (< (f-logxor u4 #x80) #x40)

484

                                             (or (>= u1 #xf1) (>= u2 #x90)))

485

(logior

486

                                         (f-logior (f-ash (f-logand u1 7) 18)

487

                                                   (f-ash (f-logxor u2 #x80) 12))

488

                                         (f-logior (f-ash (f-logxor u3 #x80) 6)

489

                                                   (f-logxor u4 #x80)))

490

(encode-raw-octets 4)))))))

491

(t (encode-raw-octets 1)))

492

dest di)))

493

finally (return (the fixnum (- di d-start)))))))

494

495

;;;; UTF-16

496

497

;;; TODO: add a way to pass some info at compile-time telling us that,

498

;;; for example, the maximum code-point will always be < #x10000 in

499

;;; which case we could simply return (* 2 (- end start)).

500

(defmacro utf16-octet-counter (getter type)

501

`(named-lambda utf-16-octet-counter (seq start end max)

502

(declare (type ,type seq) (fixnum start end max))

503

(loop with noctets fixnum = 0

504

for i fixnum from start below end

505

for code of-type code-point = (,getter seq i)

506

do (let ((new (the fixnum (+ (if (< code #x10000) 2 4) noctets))))

507

(if (and (plusp max) (> new max))

508

(loop-finish)

509

(setq noctets new)))

510

finally (return (values noctets i)))))

511

512

(defmacro utf-16-combine-surrogate-pairs (u1 u2)

513

`(the (unsigned-byte 21)

514

(+ #x10000

515

(the (unsigned-byte 20)

516

(logior

517

(the (unsigned-byte 20)

518

(ash (the (unsigned-byte 10) (- ,u1 #xd800)) 10))

519

(the (unsigned-byte 10)

520

(- ,u2 #xdc00)))))))

521

522

(defmacro define-utf-16 (name &optional endianness)

523

(check-type endianness (or null (eql :be) (eql :le)))

524

(check-type name keyword)

525

(let ((swap-var (gensym "SWAP"))

526

(code-point-counter-name

527

(format-symbol t '#:~a-code-point-counter (string name)))

528

(encoder-name (format-symbol t '#:~a-encoder (string name)))

529

(decoder-name (format-symbol t '#:~a-decoder (string name))))

530

(labels ((make-bom-check-form (end start getter seq)

531

(if (null endianness)

532

``((,',swap-var

533

(when (> ,,end ,,start)

534

(case (,,getter ,,seq ,,start 2 :ne)

535

(#.+byte-order-mark-code+ (incf ,,start 2) nil)

536

(#.+swapped-byte-order-mark-code+ (incf ,,start 2) t)

537

(t #+little-endian t)))))

538

'()))

539

(make-getter-form (getter src i)

540

(case endianness

541

(:le ``(,,getter ,,src ,,i 2 :le))

542

(:be ``(,,getter ,,src ,,i 2 :be))

543

(t ``(if ,',swap-var

544

(,,getter ,,src ,,i 2 :re)

545

(,,getter ,,src ,,i 2 :ne)))))

546

(make-setter-form (setter code dest di)

547

(case endianness

548

(:be ``(,,setter ,,code ,,dest ,,di 2 :be))

549

(:le ``(,,setter ,,code ,,dest ,,di 2 :le))

550

(t ``(,,setter ,,code ,,dest ,,di 2 :ne)))))

551

`(progn

552

(define-octet-counter ,name (getter type)

553

`(utf16-octet-counter ,getter ,type))

554

(define-code-point-counter ,name (getter type)

555

`(named-lambda ,',code-point-counter-name (seq start end max)

556

(declare (type ,type seq) (fixnum start end max))

557

(let* ,,(make-bom-check-form ''end ''start 'getter ''seq)

558

(loop with count fixnum = 0

559

with i fixnum = start

560

while (<= i (- end 2)) do

561

(let* ((code ,,(make-getter-form 'getter ''seq ''i))

562

(next-i (+ i (if (or (< code #xd800) (>= code #xdc00))

563

2

564

4))))

565

(declare (type (unsigned-byte 16) code) (fixnum next-i))

566

(cond

567

((> next-i end)

568

(decoding-error

569

(vector (,getter seq i) (,getter seq (1+ i)))

570

,',name seq i nil 'end-of-input-in-character)

571

(return (values count i)))

572

(t

573

(setq i next-i

574

count (1+ count))

575

(when (and (plusp max) (= count max))

576

(return (values count i))))))

577

finally (progn

578

(assert (= i end))

579

(return (values count i)))))))

580

(define-encoder ,name (getter src-type setter dest-type)

581

`(named-lambda ,',encoder-name (src start end dest d-start)

582

(declare (type ,src-type src)

583

(type ,dest-type dest)

584

(fixnum start end d-start))

585

(loop with di fixnum = d-start

586

for i fixnum from start below end

587

for code of-type code-point = (,getter src i)

588

for high-bits fixnum = (- code #x10000) do

589

(cond ((< high-bits 0)

590

,,(make-setter-form 'setter ''code ''dest ''di)

591

(incf di 2))

592

(t

593

,,(make-setter-form

594

'setter ''(logior #xd800 (f-ash high-bits -10))

595

''dest ''di)

596

,,(make-setter-form

597

'setter ''(logior #xdc00 (f-logand high-bits #x3ff))

598

''dest ''(+ di 2))

599

(incf di 4)))

600

finally (return (the fixnum (- di d-start))))))

601

(define-decoder ,name (getter src-type setter dest-type)

602

`(named-lambda ,',decoder-name (src start end dest d-start)

603

(declare (type ,src-type src)

604

(type ,dest-type dest)

605

(fixnum start end d-start))

606

(let ,,(make-bom-check-form ''end ''start 'getter ''src)

607

(loop with i fixnum = start

608

for di fixnum from d-start

609

until (= i end) do

610

(let ((u1 ,,(make-getter-form 'getter ''src ''i)))

611

(declare (type (unsigned-byte 16) u1))

612

(incf i 2)

613

(,setter (cond

614

((or (< u1 #xd800) (>= u1 #xe000)) ; 2 octets

615

u1)

616

((< u1 #xdc00) ; 4 octets

617

                                 (let ((u2 ,,(make-getter-form 'getter ''src ''i)))

618

(declare (type (unsigned-byte 16) u2))

619

(incf i 2)

620

(if (and (>= u2 #xdc00) (< u2 #xe000))

621

(utf-16-combine-surrogate-pairs u1 u2)

622

(decoding-error

623

(vector (,getter src (- i 4))

624

(,getter src (- i 3))

625

(,getter src (- i 2))

626

(,getter src (- i 1)))

627

,',name src i +repl+))))

628

(t

629

(decoding-error (vector (,getter src (- i 2))

630

                                                          (,getter src (- i 1)))

631

                                                  ,',name src i +repl+)))

632

dest di))

633

finally (return (the fixnum (- di d-start)))))))

634

',name))))

635

636

(define-character-encoding :utf-16

637

"A 16-bit, variable-length encoding in which characters with

638

code points less than #x10000 can be encoded in a single 16-bit

639

word and characters with larger codes can be encoded in a pair of

640

16-bit words. The endianness of the encoded data is indicated by

641

the endianness of a byte-order-mark character (#\u+feff)

642

prepended to the data; in the absence of such a character on

643

input, the data is assumed to be in big-endian order. Output is

644

written in native byte-order with a leading byte-order mark."

645

:max-units-per-char 2

646

:code-unit-size 16

647

:native-endianness t ; not necessarily true when decoding

648

:decode-literal-code-unit-limit #xd800

649

:encode-literal-code-unit-limit #x10000

650

:use-bom #+big-endian :utf-16be #+little-endian :utf-16le

651

:bom-encoding #+big-endian #(#xfe #xff) #+little-endian #(#xff #xfe)

652

:nul-encoding #(0 0)

653

:default-replacement #xfffd

654

:ambiguous #+little-endian t #+big-endian nil)

655

656

(define-utf-16 :utf-16)

657

658

(define-character-encoding :utf-16le

659

"A 16-bit, variable-length encoding in which characters with

660

code points less than #x10000 can be encoded in a single 16-bit

661

word and characters with larger codes can be encoded in a pair of

662

16-bit words. The data is assumed to be in little-endian order. Output is

663

written in little-endian byte-order without a leading byte-order mark."

664

:aliases '(:utf-16/le)

665

:max-units-per-char 2

666

:code-unit-size 16

667

:native-endianness #+little-endian t #+big-endian nil

668

:decode-literal-code-unit-limit #xd800

669

:encode-literal-code-unit-limit #x10000

670

:nul-encoding #(0 0)

671

:default-replacement #xfffd)

672

673

(define-utf-16 :utf-16le :le)

674

675

(define-character-encoding :utf-16be

676

"A 16-bit, variable-length encoding in which characters with

677

code points less than #x10000 can be encoded in a single 16-bit

678

word and characters with larger codes can be encoded in a pair of

679

16-bit words. The data is assumed to be in big-endian order. Output is

680

written in big-endian byte-order without a leading byte-order mark."

681

:aliases '(:utf-16/be)

682

:max-units-per-char 2

683

:code-unit-size 16

684

:native-endianness #+little-endian nil #+big-endian t

685

:decode-literal-code-unit-limit #xd800

686

:encode-literal-code-unit-limit #x10000

687

:nul-encoding #(0 0)

688

:default-replacement #xfffd)

689

690

(define-utf-16 :utf-16be :be)

691

692

(defmacro define-ucs (name bytes &optional endianness (limit #x110000))

693

(check-type name keyword)

694

(check-type bytes (or (eql 2) (eql 4)))

695

(check-type endianness (or null (eql :le) (eql :be)))

696

(let ((swap-var (gensym "SWAP"))

697

(code-point-counter-name

698

(format-symbol t '#:~a-code-point-counter (string name)))

699

(encoder-name

700

(format-symbol t '#:~a-encoder (string name)))

701

(decoder-name

702

(format-symbol t '#:~a-decoder (string name))))

703

(labels ((make-bom-check-form (end start getter src)

704

(if (null endianness)

705

``(when (not (zerop (- ,,end ,,start)))

706

(case (,,getter ,,src 0 ,',bytes :ne)

707

(#.+byte-order-mark-code+

708

(incf ,,start ,',bytes) nil)

709

(#.+swapped-byte-order-mark-code-32+

710

(incf ,,start ,',bytes) t)

711

(t #+little-endian t)))

712

'()))

713

(make-setter-form (setter code dest di)

714

``(,,setter ,,code ,,dest ,,di ,',bytes

715

,',(or endianness :ne)))

716

(make-getter-form (getter src i)

717

(if (null endianness)

718

``(if ,',swap-var

719

(,,getter ,,src ,,i ,',bytes :re)

720

(,,getter ,,src ,,i ,',bytes :ne))

721

``(,,getter ,,src ,,i ,',bytes ,',endianness))))

722

`(progn

723

(define-code-point-counter ,name (getter type)

724

`(named-lambda ,',code-point-counter-name (seq start end max)

725

(declare (type ,type seq) (fixnum start end max))

726

;; check for bom

727

,,(make-bom-check-form ''end ''start 'getter ''seq)

728

(multiple-value-bind (count rem)

729

(floor (- end start) ,',bytes)

730

(cond

731

((and (plusp max) (> count max))

732

(values max (the fixnum (+ start (* ,',bytes max)))))

733

(t

734

;; check for incomplete last character

735

(unless (zerop rem)

736

(let ((vector (make-array ,',bytes :fill-pointer 0)))

737

(dotimes (i rem)

738

(vector-push (,getter seq (+ i (- end rem))) vector))

739

(decoding-error vector ,',name seq (the fixnum (- end rem)) nil

740

'end-of-input-in-character)

741

(decf end rem)))

742

(values count end))))))

743

(define-encoder ,name (getter src-type setter dest-type)

744

`(named-lambda ,',encoder-name (src start end dest d-start)

745

(declare (type ,src-type src)

746

(type ,dest-type dest)

747

(fixnum start end d-start))

748

(loop for i fixnum from start below end

749

and di fixnum from d-start by ,',bytes

750

for code of-type code-point = (,getter src i)

751

do (if (>= code ,',limit)

752

(encoding-error code ,',name src i +repl+)

753

,,(make-setter-form 'setter ''code ''dest ''di))

754

finally (return (the fixnum (- di d-start))))))

755

(define-decoder ,name (getter src-type setter dest-type)

756

`(named-lambda ,',decoder-name (src start end dest d-start)

757

(declare (type ,src-type src)

758

(type ,dest-type dest)

759

(fixnum start end d-start))

760

(let ((,',swap-var ,,(make-bom-check-form ''end ''start 'getter ''src)))

761

(declare (ignorable ,',swap-var))

762

(loop for i fixnum from start below end by ,',bytes

763

and di from d-start

764

do (,setter (let ((unit ,,(make-getter-form 'getter ''src ''i)))

765

(if (>= unit ,',limit)

766

(decoding-error

767

(vector (,getter src i)

768

                                                  (,getter src (+ i 1))

769

                                                  ,@,(if (= bytes 4)

770

                                                         ``((,getter src (+ i 2))

771

                                                            (,getter src (+ i 3)))))

772

,',name src i +repl+

773

'character-out-of-range)

774

unit))

775

dest di)

776

finally (return (the fixnum (- di d-start)))))))

777

',name))))

778

779

;;;; UTF-32

780

781

(define-character-encoding :utf-32

782

"A 32-bit, fixed-length encoding in which all Unicode

783

characters can be encoded in a single 32-bit word. The

784

endianness of the encoded data is indicated by the endianness of

785

a byte-order-mark character (#\u+feff) prepended to the data; in

786

the absence of such a character on input, input data is assumed

787

to be in big-endian order. Output is written in native byte

788

order with a leading byte-order mark."

789

:aliases '(:ucs-4)

790

:max-units-per-char 1

791

:code-unit-size 32

792

:native-endianness t ; not necessarily true when decoding

793

:literal-char-code-limit #x110000

794

:use-bom #+little-endian :utf-32le #+big-endian :utf-32be

795

:bom-encoding

796

#+big-endian #(#x00 #x00 #xfe #xff)

797

#+little-endian #(#xff #xfe #x00 #x00)

798

:nul-encoding #(0 0 0 0)

799

:ambiguous #+little-endian t #+big-endian nil)

800

801

(define-ucs :utf-32 4)

802

803

(define-character-encoding :utf-32le

804

"A 32-bit, fixed-length encoding in which all Unicode

805

characters can be encoded in a single 32-bit word. Input data is assumed

806

to be in little-endian order. Output is also written in little-endian byte

807

order without a leading byte-order mark."

808

:max-units-per-char 1

809

:code-unit-size 32

810

:aliases '(:utf-32/le :ucs-4le :ucs-4/le)

811

:native-endianness #+little-endian t #+big-endian nil

812

:literal-char-code-limit #x110000

813

:nul-encoding #(0 0 0 0))

814

815

(define-ucs :utf-32le 4 :le)

816

817

(define-character-encoding :utf-32be

818

"A 32-bit, fixed-length encoding in which all Unicode

819

characters can be encoded in a single 32-bit word. Input data is assumed

820

to be in big-endian order. Output is also written in big-endian byte

821

order without a leading byte-order mark."

822

:max-units-per-char 1

823

:code-unit-size 32

824

:aliases '(:utf-32/be :ucs-4be :ucs-4/be)

825

:native-endianness #+little-endian nil #+big-endian t

826

:literal-char-code-limit #x110000

827

:nul-encoding #(0 0 0 0))

828

829

(define-ucs :utf-32be 4 :be)

830

831

;; UCS-2

832

833

(define-character-encoding :ucs-2

834

"A 16-bit, fixed-length encoding in which all Unicode

835

characters can be encoded in a single 16-bit word. The

836

endianness of the encoded data is indicated by the endianness of

837

a byte-order-mark character (#\u+feff) prepended to the data; in

838

the absence of such a character on input, input data is assumed

839

to be in big-endian order. Output is written in native byte

840

order with a leading byte-order mark."

841

:aliases '(:ucs-2)

842

:max-units-per-char 1

843

:code-unit-size 16

844

:native-endianness t ; not necessarily true when decoding

845

:literal-char-code-limit #x10000

846

:use-bom #+little-endian :ucs-2le #+big-endian :ucs-2be

847

:bom-encoding

848

#+big-endian #(#xfe #xff)

849

#+little-endian #(#xff #xfe)

850

:nul-encoding #(0 0)

851

:ambiguous #+little-endian t #+big-endian nil)

852

853

(define-ucs :ucs-2 2 nil #x10000)

854

855

(define-character-encoding :ucs-2le

856

"A 16-bit, fixed-length encoding in which all Unicode

857

characters can be encoded in a single 16-bit word. Input data is assumed

858

to be in little-endian order. Output is also written in little-endian byte

859

order without a leading byte-order mark."

860

:max-units-per-char 1

861

:code-unit-size 16

862

:aliases '(:ucs-2/le)

863

:native-endianness #+little-endian t #+big-endian nil

864

:literal-char-code-limit #x10000

865

:nul-encoding #(0 0))

866

867

(define-ucs :ucs-2le 2 :le #x10000)

868

869

(define-character-encoding :ucs-2be

870

"A 16-bit, fixed-length encoding in which all Unicode

871

characters can be encoded in a single 16-bit word. Input data is assumed

872

to be in big-endian order. Output is also written in big-endian byte

873

order without a leading byte-order mark."

874

:max-units-per-char 1

875

:code-unit-size 16

876

:aliases '(:ucs-2/be)

877

:native-endianness #+little-endian nil #+big-endian t

878

:literal-char-code-limit #x10000

879

:nul-encoding #(0 0))

880

881

(define-ucs :ucs-2be 2 :be #x10000)