Coverage report: /home/ellis/comp/core/std/string.lisp
Kind | Covered | All | % |
expression | 307 | 574 | 53.5 |
branch | 26 | 44 | 59.1 |
Key
Not instrumented
Conditionalized out
Executed
Not executed
Both branches taken
One branch taken
Neither branch taken
1
;;; std/str.lisp --- String utilities
5
;; (defvar sb-unicode-syms
6
;; '(words lines sentences whitespace-p uppercase lowercase titlecase
7
;; word-break-class line-break-class sentence-break-class char-block
8
;; cased-p uppercase-p lowercase-p titlecase-p casefold
9
;; graphemes grapheme-break-class
10
;; bidi-mirroring-glyph bidi-class
11
;; normalize-string normalized-p default-ignorable-p
12
;; confusable-p hex-digit-p mirrored-p alphabetic-p math-p
13
;; decimal-value digit-value
14
;; unicode< unicode> unicode= unicode-equal
15
;; unicode<= unicode>=))
16
(in-package :std/string)
18
(defparameter *suppress-character-coding-errors* nil
19
"Suppress errors which may arise from character encoding/decoding.")
21
(defconstant +cr+ #\Return "Return char.")
22
(defconstant +lf+ #\Linefeed "Linefeed char.")
23
(sb-int:defconstant-eqx +crlf+ (coerce #(+cr+ +lf+) 'simple-array) #'equalp
24
"Character sequcne #(Return Linefeed) = '\\r\\n'")
26
;; (mapc (lambda (s) (export s)) sb-unicode-syms)
29
;; :include sb-unicode-syms)
30
(defparameter *omit-nulls* nil
31
"When Non-nil, omit null values returned by SSPLIT.")
33
(defvar *whitespaces* (list #\Backspace #\Tab #\Linefeed #\Newline #\Vt #\Page
34
#\Return #\Space #\Rubout
35
#+sbcl #\Next-Line #-sbcl (code-char 133)
37
"On some implementations, linefeed and newline represent the same character (code).")
39
(deftype string-designator ()
40
"A string designator type. A string designator is either a string, a symbol,
42
`(or symbol string character))
44
(defmacro string-byte-length (s)
45
"Return the number of bytes of the internal representation
48
(base-string (length ,s))
49
(string (* (length ,s) 4))))
51
(defun ssplit (separator s &key (omit-nulls *omit-nulls*))
52
"Split s into substring by separator (cl-ppcre takes a regex, we do not).
54
`limit' limits the number of elements returned (i.e. the string is
55
split at most `limit' - 1 times)."
56
;; cl-ppcre:split doesn't return a null string if the separator appears at the end of s.
57
(let* ((res (cl-ppcre:split separator s)))
59
(remove-if (lambda (it) (sequence:emptyp it)) res)
62
(defun collapse-whitespaces (s)
63
"Ensure there is only one space character between words.
65
(cl-ppcre:regex-replace-all "\\s+" s " "))
67
(defun trim (s &key (char-bag *whitespaces*))
68
"Removes all characters in `char-bag` (default: whitespaces) at the beginning and end of `s`.
69
If supplied, char-bag has to be a sequence (e.g. string or list of characters).
71
Examples: (trim \" foo \") => \"foo\"
72
(trim \"+-*foo-bar*-+\" :char-bag \"+-*\") => \"foo-bar\"
73
(trim \"afood\" :char-bag (str:concat \"a\" \"d\")) => \"foo\""
75
(string-trim char-bag s)))
77
;;; TODO 2023-08-27: camel snake kebab
79
(defun make-template-parser (start-delimiter end-delimiter &key (ignore-case nil))
80
"Returns a closure than can substitute variables
81
delimited by \"start-delimiter\" and \"end-delimiter\"
82
in a string, by the provided values."
83
(check-type start-delimiter string)
84
(check-type end-delimiter string)
85
(when (or (string= start-delimiter "")
86
(string= end-delimiter ""))
87
(error 'simple-type-error
88
:format-control "The empty string is not a valid delimiter."))
89
(let ((start-len (length start-delimiter))
90
(end-len (length end-delimiter))
95
(lambda (string values)
96
(check-type string string)
97
(unless (listp values)
98
(error 'simple-type-error
99
:format-control "values should be an association list"))
101
(with-output-to-string (stream)
102
(loop for prev = 0 then (+ j end-len)
103
for i = (search start-delimiter string)
104
then (search start-delimiter string :start2 j)
105
for j = (if i (search end-delimiter string :start2 i))
106
then (if i (search end-delimiter string :start2 i))
108
do (write-string (subseq string prev i) stream)
109
(let ((instance (rest (assoc (subseq string (+ i start-len) j)
113
(princ instance stream)
114
(write-string (subseq string i (+ j end-len)) stream)))
116
finally (write-string (subseq string prev) stream))))))
119
;;; Implementing an efficient string= case in Common Lisp
121
;;; 2015-11-15: Defknown don't have explicit-check in SBCL 1.3.0
122
;;; Remove the declaration. It's never useful the way we use
125
;;; 2015-11-15: Make this a real ASDF system for Xach
126
;;; I copied the system definition from Quicklisp and mangled as
129
;;; 2010-06-30: Tiny bugfix
130
;;; Widen the type declarations inside cases to allow vectors that
131
;;; have a length that's shorter than the total size (due to fill-
137
;;; In `<http://neverfriday.com/blog/?p=10>', OMouse asks how
138
;;; best to implement a `string= case' (in Scheme). I noted that
139
;;; naively iterating through the cases with `string=' at runtime
140
;;; is suboptimal. Seeing the problem as a simplistic pattern
141
;;; matching one makes an efficient solution obvious.
142
;;; Note that, unlike Haskell, both Scheme and CL have random-
143
;;; access on strings in O(1), something which I exploit to
144
;;; generate better code.
146
;;; This is also a pbook.el file (the pdf can be found at
147
;;; `<http://www.discontinuity.info/~pkhuong/string-case.pdf>' ).
148
;;; I'm new at this not-quite-illiterate programming thing, so
149
;;; please bear with me (: I'm also looking for comments on the
150
;;; formatting. I'm particularly iffy with the way keywords look
151
;;; like. It just looks really fuzzy when you're not really zoomed
152
;;; in (or reading it on paper).
154
;;; I usually don't use packages for throw-away code, but this looks
155
;;; like it could be useful to someone.
157
;;;# Some utility code
159
(defun split-tree (list &key (test 'eql) (key 'identity))
160
"Splits input list into sublists of elements
161
whose elements are all such that (key element)
163
It's assumed that test and key form an equality class.
164
(This is similar to groupBy)"
167
(cur-list (list (first list)))
168
(cur-key (funcall key (first list))))
169
(dolist (elt (rest list) (nreverse (cons (nreverse cur-list)
171
(let ((new-key (funcall key elt)))
172
(if (funcall test cur-key new-key)
175
(push (nreverse cur-list) lists)
176
(setf cur-list (list elt)
177
cur-key new-key))))))))
180
"Return a list of positive integers below N."
181
(loop for i below n collect i))
183
(defun hash-table->list (table &key (keep-keys t) (keep-values t))
184
"Saves the keys and/or values in table to a list.
185
As with hash table iterating functions, there is no
188
(maphash (cond ((and keep-keys
191
(push (cons k v) list)))
203
(defun all-equal (list &key (key 'identity) (test 'eql))
204
"Return Non-nil if all elements of LIST are equal according to KEY and TEST."
208
(let ((first-key (funcall key (first list))))
209
(every (lambda (element)
210
(funcall test first-key
211
(funcall key element)))
214
(defun split-at (list n)
215
"Split list in k lists of n elements (or less for the last list)"
216
(declare (type (and fixnum (integer (0))) n))
220
(declare (type (and fixnum unsigned-byte) counter))
221
(dolist (elt list (nreverse (if cur-list
222
(cons (nreverse cur-list)
226
(when (= (incf counter) n)
227
(push (nreverse cur-list) lists)
231
;;;# The string matching compiler per se
233
;;; I use special variables here because I find that
234
;;; preferable to introducing noise everywhere to thread
235
;;; these values through all the calls, especially
236
;;; when `*no-match-form*' is only used at the very end.
238
(defparameter *input-string* nil
239
"Symbol of the variable holding the input string")
241
(defparameter *no-match-form* nil
242
"Form to insert when no match is found.")
244
;;; The basic idea of the pattern matching process here is
245
;;; to first discriminate with the input string's length;
246
;;; once that is done, it is very easy to safely use random
247
;;; access until only one candidate string (pattern) remains.
248
;;; However, even if we determine that only one case might be
249
;;; a candidate, it might still be possible for another string
250
;;; (not in the set of cases) to match the criteria. So we also
251
;;; have to make sure that *all* the indices match. A simple
252
;;; way to do this would be to emit the remaining checks at the
253
;;; every end, when only one candidate is left. However, that
254
;;; can result in a lot of duplicate code, and some useless
255
;;; work on mismatches. Instead, the code generator always
256
;;; tries to find (new) indices for which all the candidates
257
;;; left in the branch share the same character, and then emits
258
;;; a guard, checking the character at that index as soon as possible.
260
;;; In my experience, there are two main problems when writing
261
;;; pattern matchers: how to decide what to test for at each
262
;;; fork, and how to ensure the code won't explode exponentially.
263
;;; Luckily, for our rather restricted pattern language (equality
264
;;; on strings), patterns can't overlap, and it's possible to guarantee
265
;;; that no candidate will ever be possible in both branches of a
268
;;; Due to the the latter guarantee, we have a simple fitness
269
;;; measure for tests: simply maximising the number of
270
;;; candidates in the smallest branch will make our search tree
271
;;; as balanced as possible. Of course, we don't know whether
272
;;; the subtrees will be balanced too, but I don't think it'll
273
;;; be much of an issue.
275
;;; Note that, if we had access, whether via annotations or profiling,
276
;;; to the probability of each case, the situation would be very
277
;;; different. In fact, on a pipelined machine where branch
278
;;; mispredictions are expensive, an unbalanced tree will yield
279
;;; better expected runtimes. There was a very interesting and rather
280
;;; sophisticated Google lecture on that topic on Google video
281
;;; (the speaker used markov chains to model dynamic predictors,
282
;;; for example), but I can't seem to find the URL.
284
;;; TODO: Find bounds on the size of the code!
286
(defun find-best-split (strings to-check)
287
"Iterate over all the indices left to check to find
288
which index (and which character) to test for equality
289
with, keeping the ones which result in the most balanced
291
(flet ((evaluate-split (i char)
292
"Just count all the matches and mismatches"
295
(dolist (string strings (min = /=))
296
(if (eql (aref string i) char)
299
(uniquify-chars (chars)
300
"Only keep one copy of each char in the list"
301
(mapcar 'first (split-tree (sort chars 'char<) :test #'eql))))
302
(let ((best-split 0) ; maximise size of smallest branch
305
(dolist (i to-check (values best-posn best-char))
306
(dolist (char (uniquify-chars (mapcar (lambda (string)
309
(let ((Z (evaluate-split i char)))
310
(when (> Z best-split)
313
best-char char))))))))
315
;;; We sometimes have to execute sequences of checks for
316
;;; equality. The natural way to express this is via a
317
;;; sequence of checks, wrapped in an `and'. However, that
318
;;; translates to a sequence of conditional branches, predicated
319
;;; on very short computations. On (not so) modern architectures,
320
;;; it'll be faster to coalesce a sequence of such checks together
321
;;; as straightline code (e.g. via `or' of `xor'), and only branch
322
;;; at the very end. The code doesn't become much more complex,
323
;;; and benchmarks have shown it to be beneficial (giving a speed
324
;;; up of 2-5% for both predictable and unpredictable workloads,
327
;;; Benchmarks (and experience) have shown that, instead of executing
328
;;; a cascade of comparison/conditional branch, it's slightly
329
;;; faster, both for predictable and unpredictable workloads,
330
;;; to `or' together a bunch of comparisons (e.g. `xor'). On a Core 2
331
;;; processor, it seems that doing so for sequences of around 4
332
;;; comparisons is the sweetspot. On perfectly predictable input,
333
;;; aborting early (on the first check) saves as much time as
334
;;; the 4 test/conditional branch add, compared to a sequence of
337
;;; Numeric char= abstracts out the xor check, and, on SBCL,
338
;;; is replaced by a short assembly sequence when the first
339
;;; argument is a constant. The declared return type is then
340
;;; wider than strictly necessary making it fit in a machine
341
;;; register, but not as a fixnum ensures that the compiler
342
;;; won't repeatedly convert the values to fixnums, when all
343
;;; we'll do is `or' them together and check for zero-ness.
344
;;; This function is the only place where the macro isn't
345
;;; generic over the elements stored in the cases. It shouldn't
346
;;; be too hard to implement a numeric-eql, which would
347
;;; restore genericity to the macro, while keeping the
351
;; (defknown numeric-char= (character character)
352
;; (unsigned-byte #. (1- sb-vm:n-machine-word-bits))
353
;; (movable foldable flushable))
355
;; (define-vop (numeric-char=)
356
;; (:args (x :scs (sb-vm::character-reg sb-vm::character-stack)
358
;; :load-if (not (location= x r))))
360
;; (:arg-types (:constant character) character)
361
;; (:results (r :scs (sb-vm::unsigned-reg)
362
;; :load-if (not (location= x r))))
363
;; (:result-types sb-vm::unsigned-num)
364
;; (:translate numeric-char=)
365
;; (:policy :fast-safe)
366
;; (:note "inline constant numeric-char=")
369
;; (sb-vm::inst #:xor r (char-code y)))))
371
(defun numeric-char= (x y)
372
"Return Non-nil if X and Y are equal numeric characters."
373
(declare (type character x y))
374
(logxor (char-code x)
377
;;; At each step, we may be able to find positions for which
378
;;; there can only be one character. If we emit the check for
379
;;; these positions as soon as possible, we avoid duplicating
380
;;; potentially a lot of code. Since benchmarks have shown
381
;;; it to be useful, this function implements the checks
382
;;; as a series of (zerop (logior (numeric-char= ...)...)),
383
;;; if there is more than one such check to emit.
385
(defun emit-common-checks (strings to-check)
386
(labels ((emit-char= (pairs)
387
(mapcar (lambda (pair)
388
(destructuring-bind (posn . char)
390
`(numeric-char= ,char
391
(aref ,*input-string* ,posn))))
393
(emit-checking-form (common-chars)
395
(let ((common-chars (sort common-chars '< :key 'car)))
396
#+ (and) `(and ,@(mapcar
398
(if (null (rest chunk))
399
(destructuring-bind ((posn . char))
402
(aref ,*input-string* ,posn)))
404
(logior ,@(emit-char= chunk)))))
405
(split-at common-chars 4)))
406
#+ (or) `(and ,@(mapcar
408
(destructuring-bind (posn . char)
411
(aref ,*input-string* ,posn))))
413
(let ((common-chars ())
415
(dolist (posn to-check (values (emit-checking-form common-chars)
416
(nreverse left-to-check)))
417
(if (all-equal strings :key (lambda (string)
419
(push (cons posn (aref (first strings) posn))
421
(push posn left-to-check))))))
423
;;; The driving function: First, emit any test that is
424
;;; common to all the candidates. If there's only one
425
;;; candidate, then we just have to execute the body;
426
;;; if not, we look for the `best' test and emit the
427
;;; corresponding code: execute the test, and recurse
428
;;; on the candidates that match the test and on those
431
(defun make-search-tree (strings bodies to-check)
432
(multiple-value-bind (guard to-check)
433
(emit-common-checks strings to-check)
434
(if (null (rest strings))
436
(assert (null to-check)) ; there shouldn't be anything left to check
439
(progn ,@(first bodies))
441
`(progn ,@(first bodies))))
442
(multiple-value-bind (posn char)
443
(find-best-split strings to-check)
444
(assert posn) ; this can only happen if all strings are equal
450
for string in strings
452
do (if (eql char (aref string posn))
454
(push string =strings)
457
(push string /=strings)
458
(push body /=bodies))))
459
(let ((tree `(if (eql ,char (aref ,*input-string* ,posn))
460
,(make-search-tree =strings =bodies
461
(remove posn to-check))
462
,(make-search-tree /=strings /=bodies
470
;;; Finally, we can glue it all together.
471
;;; To recapitulate, first, dispatch on string
472
;;; length, then execute a search tree for the
473
;;; few candidates left, and finally make sure
474
;;; the input string actually matches the one
475
;;; candidate left at the leaf.
477
(defun emit-string-case (cases input-var no-match)
478
(flet ((case-string-length (x)
480
(let ((*input-string* input-var)
481
(*no-match-form* no-match)
482
(cases-lists (split-tree (sort cases '<
483
:key #'case-string-length)
484
:key #'case-string-length)))
485
`(locally (declare (type vector ,input-var))
486
(case (length ,input-var)
487
,@(loop for cases in cases-lists
488
for length = (case-string-length (first cases))
490
;; arrays with fill pointers expose the total length
491
;; in their type, not the position of the fill-pointer.
492
;; The type below only applies to simple-arrays.
493
(locally (declare (type (or (not simple-array)
494
(simple-array * (,length)))
496
,(make-search-tree (mapcar 'first cases)
501
;;; Just wrapping the previous function in a macro,
502
;;; and adding some error checking (the rest of the code
503
;;; just assumes there won't be duplicate patterns).
504
;;; Note how we use a local function instead of passing
505
;;; the default form directly. This can save a lot on
506
;;; code size, especially when the default form is
509
(defmacro string-case ((string &key (default '(error "No match")))
511
"(string-case (string &key default)
513
case ::= string form*
515
Where t is the default case."
516
(let ((cases-table (make-hash-table :test 'equal)))
517
"Error checking cruft"
519
(assert (typep case '(cons (or string (eql t)))))
520
(let ((other-case (gethash (first case) cases-table)))
522
(warn "Duplicate string-case cases: ~A -> ~A or ~A~%"
526
(setf (gethash (first case) cases-table)
528
(let ((input-var (gensym "INPUT"))
529
(default-fn (gensym "ON-ERROR"))
530
(default-body (gethash t cases-table (list default))))
531
`(let ((,input-var ,string))
532
(flet ((,default-fn ()
534
,(emit-string-case (progn
535
(remhash t cases-table)
536
(hash-table->list cases-table))
540
(defvar *tab-width* 4
541
"The number of spaces to replace all #\Tab characters with in DETABIFY.")
543
;; pulled from SB-COVER
544
(defun detabify (string)
545
"Read STRING and replace all #\Tab characters with *TAB-WIDTH* spaces."
546
(with-output-to-string (stream)
547
(loop for char across string
550
do (if (eql char #\Tab)
551
(loop repeat (- *tab-width* (mod col *tab-width*))
552
do (write-char #\Space stream)
556
(when (eql char #\Newline)
557
;; Filter out empty last line
558
(when (eql i (1- (length string)))
561
(write-char char stream))))))
564
(defun remove-string (rem-string full-string &rest args &key &allow-other-keys)
565
"returns full-string with rem-string removed"
566
(let ((subst-point (apply 'search rem-string full-string args)))
569
(subseq full-string 0 subst-point)
570
(subseq full-string (+ subst-point (length rem-string))))
574
(defun make-growable-string (&optional (init ""))
575
"Make an adjustable string with a fill pointer.
576
Given INIT, a string, return an adjustable version of it with the fill pointer
579
(make-array (max 5 (length init))
580
:element-type 'character
582
:fill-pointer (length init))))
584
(replace string init))
585
(the string string)))
587
(defun nconcat (string &rest data)
588
"Destructively concatenate DATA (string designators) to STRING."
589
(declare (optimize speed)
590
(dynamic-extent string))
591
(unless (array-has-fill-pointer-p string)
592
(setf string (make-growable-string string)))
593
(labels ((conc (string x)
596
(vector-push-extend x string))
598
(let ((len (length x)))
599
(loop for c of-type character across x do
600
(vector-push-extend c string len))))
601
(t (conc string (string x))))))
602
(dolist (x data string)
605
(define-modify-macro nconcatf (&rest data) nconcat)
607
(defun char-range (char1 char2)
608
(loop for i from (char-code char1) to (char-code char2)
609
collect (code-char i)))
611
(defun ascii-ichar= (char1 char2)
612
"ASCII case-insensitive char="
613
(or (char= char1 char2)
614
(and (or (char<= #\A char1 #\Z)
615
(char<= #\A char2 #\Z))
616
(char= (char-downcase char1)
617
(char-downcase char2)))))
619
(defun ascii-istring= (string1 string2)
620
"ASCII case-insensitive string="
621
(every #'ascii-ichar= string1 string2))