Coverage report: /home/ellis/comp/core/lib/nlp/string.lisp
Kind | Covered | All | % |
expression | 0 | 372 | 0.0 |
branch | 0 | 46 | 0.0 |
Key
Not instrumented
Conditionalized out
Executed
Not executed
Both branches taken
One branch taken
Neither branch taken
1
;;; string.lisp --- String Metric Algorithms
3
;; mk-string-metrics—library of efficient implementations of various string
6
;; Copyright © 2014–2018 Mark Karpov
8
;; Permission is hereby granted, free of charge, to any person obtaining a
9
;; copy of this software and associated documentation files (the
10
;; "Software"), to deal in the Software without restriction, including
11
;; without limitation the rights to use, copy, modify, merge, publish,
12
;; distribute, sublicense, and/or sell copies of the Software, and to
13
;; permit persons to whom the Software is furnished to do so, subject to
14
;; the following conditions:
16
;; The above copyright notice and this permission notice shall be included
17
;; in all copies or substantial portions of the Software.
19
;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20
;; OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21
;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
22
;; NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
23
;; LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
24
;; OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
25
;; WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
28
(in-package :nlp/string)
30
(deftype array-index (&optional (length (1- array-dimension-limit)))
31
(list 'integer 0 length))
34
"Calculate Hamming distance between two given strings X and Y, they have
35
to be of the same length."
36
(declare (type (simple-array character) x y)
38
(optimize (safety 0) (speed 3) (space 3)))
40
(declare (type array-index result))
41
(dotimes (i (length x) result)
42
(declare (type array-index i))
43
(unless (char= (char x i)
47
(defun levenshtein (x y)
48
"Calculate Levenshtein distance between two given strings X and Y."
49
(declare (type (simple-array character) x y)
51
(optimize (safety 0) (speed 3) (space 3)))
52
(let* ((x-len (length x))
54
(v0 (make-array (1+ y-len) :element-type 'array-index))
55
(v1 (make-array (1+ y-len) :element-type 'array-index)))
56
(declare (type (simple-array array-index) v0 v1))
57
(dotimes (i (1+ y-len))
58
(declare (type array-index i))
60
(dotimes (i x-len (aref v0 y-len))
61
(declare (type array-index i))
62
(setf (aref v1 0) (1+ i))
64
(declare (type array-index j))
65
(setf (aref v1 (1+ j))
74
(defun damerau-levenshtein (x y)
75
"Calculate Damerau-Levenshtein distance between two given strings X and
77
(declare (type (simple-array character) x y)
79
(optimize (safety 0) (speed 3) (space 3)))
80
(let* ((x-len (length x))
82
(v0 (make-array (1+ y-len) :element-type 'array-index))
83
(v1 (make-array (1+ y-len) :element-type 'array-index))
84
(v* (make-array (1+ y-len) :element-type 'array-index)))
85
(declare (type (simple-array array-index) v0 v1 v*))
86
(dotimes (i (1+ y-len))
87
(declare (type array-index i))
89
(dotimes (i x-len (aref v0 y-len))
90
(declare (type array-index i))
91
(setf (aref v1 0) (1+ i))
93
(declare (type array-index j))
94
(let* ((x-i (char x i))
96
(cost (if (char= x-i y-j) 0 1)))
97
(declare (type array-index cost))
98
(setf (aref v1 (1+ j))
100
(1+ (aref v0 (1+ j)))
101
(+ (aref v0 j) cost)))
104
(let ((x-i-1 (char x (1- i)))
105
(y-j-1 (char y (1- j)))
106
(val (+ (aref v* (1- j)) cost)))
107
(declare (type array-index val))
108
(when (and (char= x-i y-j-1)
110
(< val (aref v1 (1+ j))))
111
(setf (aref v1 (1+ j)) val))))))
112
(rotatef v* v0 v1))))
114
(defun norm-levenshtein (x y)
115
"Return normalized Levenshtein distance between X and Y. Result is a real
116
number from 0 to 1, where 0 signifies no similarity between the strings,
117
while 1 means exact match."
118
(let ((r (levenshtein x y)))
125
(defun norm-damerau-levenshtein (x y)
126
"Return normalized Damerau-Levenshtein distance between X and Y. Result is
127
a real number from 0 to 1, where 0 signifies no similarity between the
128
strings, while 1 means exact match."
129
(let ((r (damerau-levenshtein x y)))
136
(defun string-to-set (str)
137
"Convert string STR into a set. This function is supposed to be inlined."
138
(declare (type (simple-array character) str)
140
(optimize (safety 0) (speed 3) (space 3)))
141
(let ((result (make-hash-table)))
142
(dotimes (i (length str))
143
(let ((ch (char str i)))
144
(if (gethash ch result)
145
(incf (the array-index (gethash ch result)))
146
(setf (gethash ch result) 1))))
149
(defun intersection-length (x y)
150
"Returns length of intersection of two strings X and Y. This function is
151
supposed to be inlined."
153
(declare (type array-index result)
154
(optimize (safety 0) (speed 3) (space 3)))
155
(maphash (lambda (key x-val)
156
(declare (type array-index x-val))
157
(let ((y-val (gethash key y)))
158
(declare (type (or array-index null) y-val))
160
(incf result (min x-val y-val)))))
164
(defun union-length (x y)
165
"Returns length of union of two strings X and Y. This function is supposed
167
(let ((temp (make-hash-table))
169
(declare (type array-index result)
170
(optimize (safety 0) (speed 3) (space 3)))
172
(maphash (lambda (key val)
173
(declare (type array-index val))
174
(let ((t-val (gethash key temp)))
175
(declare (type (or array-index null) t-val))
176
(setf (gethash key temp)
183
(maphash (lambda (key val)
184
(declare (ignore key)
185
(type array-index val))
191
"This function calculates overlap coefficient between two given strings X
192
and Y. Returned value is in range from 0 (no similarity) to 1 (exact
194
(declare (type (simple-array character) x y)
196
(optimize (safety 0) (speed 3) (space 3)))
197
(/ (the array-index (intersection-length (string-to-set x)
203
"Calculate Jaccard similarity coefficient for two strings X and
204
Y. Returned value is in range from 0 (no similarity) to 1 (exact match)."
205
(declare (type (simple-array character) x y)
206
(optimize (safety 0) (speed 3) (space 3)))
207
(let ((x (string-to-set x))
208
(y (string-to-set y)))
209
(if (and (zerop (hash-table-count x))
210
(zerop (hash-table-count y)))
212
(/ (the array-index (intersection-length x y))
213
(the array-index (union-length x y))))))
215
(defun fast-find (char str str-len &optional (start 0))
216
"Check if CHAR is in STR. This function is supposed to be inlined."
217
(declare (type character char)
218
(type (simple-array character) str)
219
(type array-index str-len start)
220
(optimize (safety 0) (speed 3) (space 3)))
221
(do ((i start (1+ i)))
223
(declare (type array-index i))
224
(when (char= char (char str i))
225
(return-from fast-find i))))
228
"Calculate Jaro distance between two strings X and Y. Returned value is in
229
range from 0 (no similarity) to 1 (exact match)."
230
(declare (type (simple-array character) x y)
232
(optimize (safety 0) (speed 1) (space 3)))
233
(let* ((x-len (length x))
235
(d (if (and (>= x-len 2)
237
(- (floor (max x-len y-len) 2) 1)
242
(declare (type array-index d m p pj))
244
(declare (type array-index i))
245
(let ((ch (char x i)))
246
(do ((j (fast-find ch y y-len 0)
247
(fast-find ch y y-len (1+ j)))
250
(declare (type (or array-index null) j))
251
(when (and j (<= (the array-index (abs (- i j)))
253
(when (and (plusp pj)
266
(defun prefix-length (x y)
267
"Calculate length of common prefix for strings X and Y."
268
(declare (type (simple-array character) x y)
270
(optimize (safety 0) (speed 3) (space 3)))
271
(let ((x-len (length x))
274
(declare (type array-index result))
280
(return-from prefix-length result)))
283
(defun jaro-winkler (x y)
284
"Calculate Jaro-Winkler distance between two strings X and Y. Returned
285
value is in range from 0 (no similarity) to 1 (exact match)."
286
(let ((jd (jaro x y))
287
(l (prefix-length x y)))
288
(+ jd (* l 1/10 (- 1 jd)))))