Coverage report: /home/ellis/.stash/quicklisp/dists/ultralisp/software/edicl-flexi-streams-20240429143708/decode.lisp

Kind	Covered	All	%
expression	6	295	2.0
branch	1	36	2.8

Key

Not instrumented

Conditionalized out

Executed

Not executed

Both branches taken

One branch taken

Neither branch taken

1

;;; -*- Mode: LISP; Syntax: COMMON-LISP; Package: FLEXI-STREAMS; Base: 10 -*-

2

;;; $Header: /usr/local/cvsrep/flexi-streams/decode.lisp,v 1.35 2008/08/26 10:59:22 edi Exp $

3

4

5

6

;;; Redistribution and use in source and binary forms, with or without

7

;;; modification, are permitted provided that the following conditions

8

;;; are met:

9

10

;;; * Redistributions of source code must retain the above copyright

11

;;; notice, this list of conditions and the following disclaimer.

12

13

;;; * Redistributions in binary form must reproduce the above

14

;;; copyright notice, this list of conditions and the following

15

;;; disclaimer in the documentation and/or other materials

16

;;; provided with the distribution.

17

18

;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED

19

;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED

20

;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

21

;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY

22

;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL

23

;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE

24

;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS

25

;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,

26

;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING

27

;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

28

;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

29

30

(in-package :flexi-streams)

31

32

(defun recover-from-encoding-error (external-format format-control &rest format-args)

33

"Helper function used by OCTETS-TO-CHAR-CODE below to deal with

34

encoding errors. Checks if *SUBSTITUTION-CHAR* is not NIL and returns

35

its character code in this case. Otherwise signals an

36

EXTERNAL-FORMAT-ENCODING-ERROR as determined by the arguments to this

37

function and provides a corresponding USE-VALUE restart."

38

(when *substitution-char*

39

(return-from recover-from-encoding-error (char-code *substitution-char*)))

40

(restart-case

41

(apply #'signal-encoding-error external-format format-control format-args)

42

(use-value (char)

43

:report "Specify a character to be used instead."

44

:interactive (lambda ()

45

(loop

46

(format *query-io* "Type a character: ")

47

(let ((line (read-line *query-io*)))

48

(when (= 1 (length line))

49

(return (list (char line 0)))))))

50

(char-code char))))

51

52

(defgeneric octets-to-char-code (format reader)

53

(declare #.*standard-optimize-settings*)

54

(:documentation "Converts a sequence of octets to a character code

55

\(which is returned, or NIL in case of EOF) using the external format

56

FORMAT. The sequence is obtained by calling the function \(which must

57

be a functional object) READER with no arguments which should return

58

one octet per call. In the case of EOF, READER should return NIL.

59

60

The special variable *CURRENT-UNREADER* must be bound correctly

61

whenever this function is called."))

62

63

(defgeneric octets-to-string* (format sequence start end)

64

(declare #.*standard-optimize-settings*)

65

(:documentation "A generic function which dispatches on the external

66

format and does the real work for OCTETS-TO-STRING."))

67

68

(defmethod octets-to-string* :around (format (list list) start end)

69

(declare #.*standard-optimize-settings*)

70

(octets-to-string* format (coerce list 'vector) start end))

71

72

(defmacro define-sequence-readers ((format-class) &body body)

73

"Non-hygienic utility macro which defines methods for READ-SEQUENCE*

74

and OCTETS-TO-STRING* for the class FORMAT-CLASS. BODY is described

75

in the docstring of DEFINE-CHAR-ENCODERS but can additionally contain

76

a form \(UNGET <form>) which has to be replaced by the correct code to

77

`unread' the octets for the character designated by <form>."

78

(let* ((body `((block char-decoder

79

(locally

80

(declare #.*fixnum-optimize-settings*)

81

,@body)))))

82

`(progn

83

(defmethod read-sequence* ((format ,format-class) flexi-input-stream sequence start end)

84

(with-accessors ((position flexi-stream-position)

85

(bound flexi-stream-bound)

86

(octet-stack flexi-stream-octet-stack)

87

(last-octet flexi-stream-last-octet)

88

(last-char-code flexi-stream-last-char-code)

89

(stream flexi-stream-stream))

90

flexi-input-stream

91

(let* (buffer

92

(buffer-pos 0)

93

(buffer-end 0)

94

(index start)

95

donep

96

;; whether we will later be able to rewind the stream if

97

;; needed (to get rid of unused octets in the buffer)

98

(can-rewind-p (maybe-rewind stream 0))

99

(factor (encoding-factor format))

100

(integer-factor (floor factor))

101

;; it's an interesting question whether it makes sense

102

;; performance-wise to make RESERVE significantly bigger

103

;; (and thus put potentially a lot more octets into

104

;; OCTET-STACK), especially for UTF-8

105

(reserve (cond ((or (not (floatp factor))

106

(not can-rewind-p)) 0)

107

                                  (t (ceiling (* (- factor integer-factor) (- end start)))))))

108

(declare (fixnum buffer-pos buffer-end index integer-factor reserve)

109

(boolean can-rewind-p))

110

(flet ((compute-fill-amount ()

111

"Computes the amount of octets we can savely read into

112

the buffer without violating the stream's bound \(if there is one) and

113

without potentially reading much more than we need \(unless we can

114

rewind afterwards)."

115

(let ((minimum (min (the fixnum (+ (the fixnum (* integer-factor

116

                                                                         (the fixnum (- end index))))

117

                                                          reserve))

118

+buffer-size+)))

119

(cond (bound (min minimum (- bound position)))

120

(t minimum))))

121

(fill-buffer (end)

122

"Tries to fill the buffer from BUFFER-POS to END and

123

returns NIL if the buffer doesn't contain any new data."

124

(when donep

125

(return-from fill-buffer nil))

126

;; put data from octet stack into buffer if there is any

127

(loop

128

(when (>= buffer-pos end)

129

(return))

130

(let ((next-octet (pop octet-stack)))

131

(cond (next-octet

132

                                 (setf (aref (the (array octet *) buffer) buffer-pos) (the octet next-octet))

133

(incf buffer-pos))

134

(t (return)))))

135

(setq buffer-end (read-sequence buffer stream

136

                                                       :start buffer-pos

137

                                                       :end end))

138

;; we reached EOF, so we remember this

139

(when (< buffer-end end)

140

(setq donep t))

141

;; BUFFER-POS is only greater than zero if the buffer

142

;; already contains unread data from the octet stack

143

;; (see below), so we test for ZEROP here and do /not/

144

;; compare with BUFFER-POS

145

(unless (zerop buffer-end)

146

(incf position buffer-end))))

147

(let ((minimum (compute-fill-amount)))

148

(declare (fixnum minimum))

149

(setq buffer (make-octet-buffer minimum))

150

;; fill buffer for the first time or return immediately if

151

;; we don't succeed

152

(unless (fill-buffer minimum)

153

(return-from read-sequence* start)))

154

(setq buffer-pos 0)

155

(macrolet ((iterate (set-place)

156

"A very unhygienic macro to implement the

157

actual iteration through the sequence including housekeeping for the

158

flexi stream. SET-PLACE is the place \(using the index INDEX) used to

159

access the sequence."

160

`(flet ((leave ()

161

"This is the function used to

162

abort the LOOP iteration below."

163

(when (> index start)

164

(setq last-octet nil

165

                                               last-char-code ,(sublis '((index . (1- index))) set-place)))

166

(return-from read-sequence* index)))

167

(loop

168

(when (>= index end)

169

;; check if there are octets in the

170

;; buffer we didn't use - see

171

;; COMPUTE-FILL-AMOUNT above

172

(let ((rest (- buffer-end buffer-pos)))

173

(when (plusp rest)

174

(or (and can-rewind-p

175

                                                (maybe-rewind stream rest))

176

(loop

177

                                            (when (>= buffer-pos buffer-end)

178

(return))

179

(decf buffer-end)

180

                                            (push (aref (the (array octet *) buffer) buffer-end)

181

octet-stack)))))

182

(leave))

183

(let ((next-char-code

184

(progn (symbol-macrolet

185

((octet-getter

186

                                                     ;; this is the code to retrieve the next octet (or

187

                                                     ;; NIL) and to fill the buffer if needed

188

                                                     (block next-octet

189

                                                       (when (>= buffer-pos buffer-end)

190

                                                         (setq buffer-pos 0)

191

                                                         (unless (fill-buffer (compute-fill-amount))

192

                                                           (return-from next-octet)))

193

(prog1

194

                                                           (aref (the (array octet *) buffer) buffer-pos)

195

                                                         (incf buffer-pos)))))

196

                                                 (macrolet ((unget (form)

197

                                                              `(unread-char% ,form flexi-input-stream)))

198

,',@body)))))

199

(unless next-char-code

200

(leave))

201

(setf ,set-place (code-char next-char-code))

202

(incf index))))))

203

(etypecase sequence

204

(string (iterate (char sequence index)))

205

(array (iterate (aref sequence index)))

206

(list (iterate (nth index sequence)))))))))

207

(defmethod octets-to-string* ((format ,format-class) sequence start end)

208

(declare #.*standard-optimize-settings*)

209

(declare (fixnum start end))

210

(let* ((i start)

211

(string-length (compute-number-of-chars format sequence start end))

212

(string (make-array string-length :element-type 'char*)))

213

(declare (fixnum i string-length))

214

(loop for j of-type fixnum from 0 below string-length

215

do (setf (schar string j)

216

(code-char (macrolet ((unget (form)

217

                                                   `(decf i (character-length format ,form))))

218

                                        (symbol-macrolet ((octet-getter (and (< i end)

219

                                                                             (prog1

220

                                                                                 (the octet (aref sequence i))

221

                                                                               (incf i)))))

222

,@body))))

223

finally (return string)))))))

224

225

(defmacro define-char-decoders ((lf-format-class cr-format-class crlf-format-class) &body body)

226

"Non-hygienic utility macro which defines several decoding-related

227

methods for the classes LF-FORMAT-CLASS, CR-FORMAT-CLASS, and

228

CRLF-FORMAT-CLASS where it is assumed that CR-FORMAT-CLASS is the same

229

encoding as LF-FORMAT-CLASS but with CR instead of LF line endings and

230

similar for CRLF-FORMAT-CLASS, i.e. LF-FORMAT-CLASS is the base class.

231

BODY is a code template for the code to read octets and return one

232

character code. BODY must contain a symbol OCTET-GETTER representing

233

the form which is used to obtain the next octet."

234

(let* ((body (with-unique-names (char-code)

235

`((let ((,char-code (progn ,@body)))

236

(when (and ,char-code

237

                                 (or (<= #xd8 (logand* #x00ff (ash* ,char-code -8)) #xdf)

238

(> ,char-code #x10ffff)))

239

                        (recover-from-encoding-error format "Illegal code point ~A \(#x~:*~X)." ,char-code))

240

,char-code)))))

241

`(progn

242

(defmethod octets-to-char-code ((format ,lf-format-class) reader)

243

(declare #.*fixnum-optimize-settings*)

244

(declare (function reader))

245

(symbol-macrolet ((octet-getter (funcall reader)))

246

,@(sublis '((char-decoder . octets-to-char-code))

247

body)))

248

(define-sequence-readers (,lf-format-class) ,@body)

249

(define-sequence-readers (,cr-format-class)

250

,(with-unique-names (char-code)

251

`(let ((,char-code (progn ,@body)))

252

(case ,char-code

253

(#.+cr+ #.(char-code #\Newline))

254

(otherwise ,char-code)))))

255

(define-sequence-readers (,crlf-format-class)

256

,(with-unique-names (char-code next-char-code get-char-code)

257

`(flet ((,get-char-code () ,@body))

258

(let ((,char-code (,get-char-code)))

259

(case ,char-code

260

(#.+cr+

261

(let ((,next-char-code (,get-char-code)))

262

(case ,next-char-code

263

(#.+lf+ #.(char-code #\Newline))

264

;; we saw a CR but no LF afterwards, but then the data

265

;; ended, so we just return #\Return

266

((nil) +cr+)

267

;; if the character we peeked at wasn't a

268

;; linefeed character we unread its constituents

269

(otherwise (unget (code-char ,next-char-code))

270

,char-code))))

271

(otherwise ,char-code)))))))))

272

273

(define-char-decoders (flexi-latin-1-format flexi-cr-latin-1-format flexi-crlf-latin-1-format)

274

octet-getter)

275

276

(define-char-decoders (flexi-ascii-format flexi-cr-ascii-format flexi-crlf-ascii-format)

277

(when-let (octet octet-getter)

278

(if (> (the octet octet) 127)

279

(recover-from-encoding-error format

280

                                    "No character which corresponds to octet #x~X." octet)

281

octet)))

282

283

(define-char-decoders (flexi-8-bit-format flexi-cr-8-bit-format flexi-crlf-8-bit-format)

284

(with-accessors ((decoding-table external-format-decoding-table))

285

format

286

(when-let (octet octet-getter)

287

(let ((char-code (aref (the (simple-array char-code-integer *) decoding-table)

288

(the octet octet))))

289

(if (or (null char-code)

290

(= (the char-code-integer char-code) 65533))

291

(recover-from-encoding-error format

292

                                        "No character which corresponds to octet #x~X." octet)

293

char-code)))))

294

295

(define-char-decoders (flexi-utf-8-format flexi-cr-utf-8-format flexi-crlf-utf-8-format)

296

(let (first-octet-seen)

297

(declare (boolean first-octet-seen))

298

(macrolet ((read-next-byte ()

299

'(prog1

300

(or octet-getter

301

(cond (first-octet-seen

302

(return-from char-decoder

303

(recover-from-encoding-error format

304

                                                                 "End of data while in UTF-8 sequence.")))

305

(t (return-from char-decoder nil))))

306

(setq first-octet-seen t))))

307

(flet ((recover-from-overlong-sequence (value)

308

(restart-case

309

                    (recover-from-encoding-error format "`Overlong' UTF-8 sequence for code point #x~X."

310

                                                 value)

311

(accept-overlong-sequence ()

312

:report "Accept the code point and continue."

313

value))))

314

(let ((octet (read-next-byte)))

315

(declare (type octet octet))

316

(block utf-8-sequence

317

(multiple-value-bind (start count)

318

(cond ((not (logbitp 7 octet))

319

;; avoid the overlong checks below

320

(return-from utf-8-sequence octet))

321

((= #b11000000 (logand* octet #b11100000))

322

(values (logand* octet #b00011111) 1))

323

((= #b11100000 (logand* octet #b11110000))

324

(values (logand* octet #b00001111) 2))

325

((= #b11110000 (logand* octet #b11111000))

326

(values (logand* octet #b00000111) 3))

327

(t (return-from char-decoder

328

(recover-from-encoding-error format

329

                                                         "Unexpected value #x~X at start of UTF-8 sequence."

330

                                                         octet))))

331

(declare (fixnum count))

332

(loop for result of-type code-point

333

= start then (+ (ash* result 6)

334

(logand* octet #b111111))

335

repeat count

336

for octet of-type octet = (read-next-byte)

337

unless (= #b10000000 (logand* octet #b11000000))

338

do (return-from char-decoder

339

(recover-from-encoding-error format

340

                                                       "Unexpected value #x~X in UTF-8 sequence." octet))

341

finally (return (cond ((< result (ecase count

342

                                                        (1 #x00080)

343

                                                        (2 #x00800)

344

                                                        (3 #x10000)))

345

                                            (recover-from-overlong-sequence result))

346

(t result)))))))))))

347

348

(define-char-decoders (flexi-utf-16-le-format flexi-cr-utf-16-le-format flexi-crlf-utf-16-le-format)

349

(let (first-octet-seen)

350

(declare (boolean first-octet-seen))

351

(macrolet ((read-next-byte ()

352

'(prog1

353

(or octet-getter

354

(cond (first-octet-seen

355

(return-from char-decoder

356

(recover-from-encoding-error format

357

                                                                 "End of data while in UTF-16 sequence.")))

358

(t (return-from char-decoder nil))))

359

(setq first-octet-seen t))))

360

(flet ((read-next-word ()

361

(+ (the octet (read-next-byte))

362

(ash* (the octet (read-next-byte)) 8))))

363

(declare (inline read-next-word))

364

(let ((word (read-next-word)))

365

(declare (type (unsigned-byte 16) word))

366

(cond ((<= #xd800 word #xdfff)

367

(let ((next-word (read-next-word)))

368

(declare (type (unsigned-byte 16) next-word))

369

(unless (<= #xdc00 next-word #xdfff)

370

(return-from char-decoder

371

(recover-from-encoding-error format

372

                                                     "Unexpected UTF-16 word #x~X following #x~X."

373

                                                     next-word word)))

374

(+ (ash* (logand* #b1111111111 word) 10)

375

(logand* #b1111111111 next-word)

376

#x10000)))

377

(t word)))))))

378

379

(define-char-decoders (flexi-utf-16-be-format flexi-cr-utf-16-be-format flexi-crlf-utf-16-be-format)

380

(let (first-octet-seen)

381

(declare (boolean first-octet-seen))

382

(macrolet ((read-next-byte ()

383

'(prog1

384

(or octet-getter

385

(cond (first-octet-seen

386

(return-from char-decoder

387

(recover-from-encoding-error format

388

                                                                 "End of data while in UTF-16 sequence.")))

389

(t (return-from char-decoder nil))))

390

(setq first-octet-seen t))))

391

(flet ((read-next-word ()

392

(+ (ash* (the octet (read-next-byte)) 8)

393

(the octet (read-next-byte)))))

394

(declare (inline read-next-word))

395

(let ((word (read-next-word)))

396

(declare (type (unsigned-byte 16) word))

397

(cond ((<= #xd800 word #xdfff)

398

(let ((next-word (read-next-word)))

399

(declare (type (unsigned-byte 16) next-word))

400

(unless (<= #xdc00 next-word #xdfff)

401

(return-from char-decoder

402

(recover-from-encoding-error format

403

                                                     "Unexpected UTF-16 word #x~X following #x~X."

404

                                                     next-word word)))

405

(+ (ash* (logand* #b1111111111 word) 10)

406

(logand* #b1111111111 next-word)

407

#x10000)))

408

(t word)))))))

409

(define-char-decoders (flexi-gbk-format flexi-cr-gbk-format flexi-crlf-gbk-format)

410

(when-let (octet octet-getter)

411

(cond ((<= (the octet octet) #x7f) octet)

412

((= (the octet octet) #x80) #x20ac)

413

((= (the octet octet) #xff) #xf8f5)

414

(t (let ((next-byte octet-getter))

415

(if (null next-byte)

416

(recover-from-encoding-error format

417

                                               "End of data while in GBK sequence.")

418

(let ((word (+ (ash* (the octet octet) 8)

419

(the octet next-byte))))

420

(declare (type (unsigned-byte 16) word))

421

(let ((octet (or (get-multibyte-mapper *gbk-to-ucs-special-table* word)

422

                                     (get-multibyte-mapper *gbk-to-ucs-table* word))))

423

(if octet

424

octet

425

(recover-from-encoding-error format

426

                                                     "No character which corresponds to octet #x~X."

427

word))))))))))

428

429

(define-char-decoders (flexi-utf-32-le-format flexi-cr-utf-32-le-format flexi-crlf-utf-32-le-format)

430

(let (first-octet-seen)

431

(declare (boolean first-octet-seen))

432

(macrolet ((read-next-byte ()

433

'(prog1

434

(or octet-getter

435

(cond (first-octet-seen

436

(return-from char-decoder

437

(recover-from-encoding-error format

438

                                                                 "End of data while in UTF-32 sequence.")))

439

(t (return-from char-decoder nil))))

440

(setq first-octet-seen t))))

441

(loop for count of-type fixnum from 0 to 24 by 8

442

for octet of-type octet = (read-next-byte)

443

sum (ash* octet count)))))

444

445

(define-char-decoders (flexi-utf-32-be-format flexi-cr-utf-32-be-format flexi-crlf-utf-32-be-format)

446

(let (first-octet-seen)

447

(declare (boolean first-octet-seen))

448

(macrolet ((read-next-byte ()

449

'(prog1

450

(or octet-getter

451

(cond (first-octet-seen

452

(return-from char-decoder

453

(recover-from-encoding-error format

454

                                                                 "End of data while in UTF-32 sequence.")))

455

(t (return-from char-decoder nil))))

456

(setq first-octet-seen t))))

457

(loop for count of-type fixnum from 24 downto 0 by 8

458

for octet of-type octet = (read-next-byte)

459

sum (ash* octet count)))))

460

461

(defmethod octets-to-char-code ((format flexi-cr-mixin) reader)

462

(declare #.*fixnum-optimize-settings*)

463

(declare (ignore reader))

464

(let ((char-code (call-next-method)))

465

(case char-code

466

(#.+cr+ #.(char-code #\Newline))

467

(otherwise char-code))))

468

469

(defmethod octets-to-char-code ((format flexi-crlf-mixin) reader)

470

(declare #.*fixnum-optimize-settings*)

471

(declare (function *current-unreader*))

472

(declare (ignore reader))

473

(let ((char-code (call-next-method)))

474

(case char-code

475

(#.+cr+

476

(let ((next-char-code (call-next-method)))

477

(case next-char-code

478

(#.+lf+ #.(char-code #\Newline))

479

;; we saw a CR but no LF afterwards, but then the data

480

;; ended, so we just return #\Return

481

((nil) +cr+)

482

;; if the character we peeked at wasn't a

483

;; linefeed character we unread its constituents

484

(otherwise (funcall *current-unreader* (code-char next-char-code))

485

char-code))))

486

(otherwise char-code))))

487