Coverage report: /home/ellis/comp/core/lib/nlp/fuzzy.lisp
Kind | Covered | All | % |
expression | 0 | 200 | 0.0 |
branch | 0 | 20 | 0.0 |
Key
Not instrumented
Conditionalized out
Executed
Not executed
Both branches taken
One branch taken
Neither branch taken
1
;;; fuzzy.lisp --- Fuzzy Matchers
3
;; Extracted from Nyxt.
4
;; Initial authors: Ambrevar, Vindarel.
7
(in-package :nlp/fuzzy)
9
(defmethod object-display (obj)
10
"Like `print-object', but for fuzzy-match."
11
(format nil "~A" obj))
13
(defun substring-norm (substrings string &key (substring-length 2))
14
"Return the norm of SUBSTRINGS with regard to STRING.
15
The norm is closer to 1 if
16
- substrings start near the beginning of STRING;
17
- substrings length are closer to the length of STRING.
19
Only substrings of SUBSTRING-LENGTH characters or more are considered."
20
;; TODO: Remove duplicates in SUBSTRINGS? Repeats could mean we insist more on it.
21
(let ((position-factor 1.0)
23
(long-substrings (remove-if (lambda (s) (> substring-length (length s)))
28
(let ((position (search s string)))
34
;; We use the sqrt to slow down the
35
;; decrease rate, we want the a
36
;; position of 10-15 still be >0.1.
37
(sqrt (1+ position))))
39
(/ (min (length s) (length string))
41
(+ position-factor length-factor)))))
43
(length long-substrings))
46
(defun to-unicode (input)
47
"Convert INPUT to (simple-array character) type."
48
(if (typep input 'base-string)
49
(coerce input `(simple-array character (,(length input))))
52
(defun score-suggestion (input suggestion)
53
"Return a SUGGESTION's score for INPUT.
54
A higher score means the suggestion comes first."
55
;; The Jaccard metric seems to provide much better results than, say,
56
;; Damerau-Levensthein but it's much slower.
57
;; TODO: Check out fzf for a possibly good scoring algorithm.
58
(+ (* 1.0 (nlp/string:norm-damerau-levenshtein suggestion input))
59
(* 1.0 (substring-norm (std:ssplit " " input) suggestion))))
61
(defvar score-threshold 0.0 ; TODO: Learn good value and enable low-score filtering below.
62
"The threshold under which suggestions are eleminated.")
64
(defun sort-suggestions (input suggestion-pairs)
65
"Sort SUGGESTION-PAIRS, the pair closest to INPUT in the levenshtein distance comes first.
66
SUGGESTION-PAIRS is a list of (display-value real-value). See `fuzzy-match' for
68
;; WARNING: mk-string-metrics works on low-level arrays and might not get
69
;; the text encoding right. We need to make sure the suggestions and the
70
;; input are of the same encoding.
71
(setf input (to-unicode input))
72
(dolist (suggestion-pair suggestion-pairs)
73
(setf (first suggestion-pair) (to-unicode (first suggestion-pair))))
74
(flet ((score-suggestion (pair)
75
(cons (score-suggestion input (first pair)) pair))
76
;; (low-score (triplet)
77
;; (< (first triplet) score-threshold))
78
(sort-suggestion (triplet1 triplet2)
81
(triplet-to-pair (triplet)
83
(mapcar #'triplet-to-pair
84
(sort ;; (remove-if #'low-score)
85
(mapcar #'score-suggestion suggestion-pairs)
88
(defun find-exactly-matching-substrings (input suggestions &key (substring-length 2))
89
"Return the list of input substrings that match at least one suggestion.
90
The substrings must be SUBSTRING-LENGTH characters long or more."
91
(let ((input-strings (delete-if (lambda (s) (< (length s) substring-length))
92
(std:ssplit " " input :omit-nulls t))))
95
(loop for suggestion in suggestions
98
(not (search i suggestion)))
102
(defun keep-exact-matches-in-suggestions (input suggestion-pairs)
103
"Destructively filter out non-exact matches from suggestions.
104
If any input substring matches exactly (but not necessarily a whole word),
105
then all suggestions that are not exactly matched by at least one substring are removed."
106
(let* ((exactly-matching-substrings (find-exactly-matching-substrings
108
(mapcar #'first suggestion-pairs))))
109
(if exactly-matching-substrings
110
(setf suggestion-pairs
111
(delete-if (lambda (suggestion-pair)
112
(not (loop for i in exactly-matching-substrings
113
always (search i (first suggestion-pair)))))
117
; TODO: Make score functions customizable, e.g. for global history.
118
(defun fuzzy-match (input suggestions &key suggestions-display (score-suggestion 'score-suggestion))
119
"From the user input and a list of suggestions, return a filtered list of
120
suggestions that have all the input words in them, and sort this list to have the
121
'most relevant' first.
122
The match is case-sensitive if INPUT contains at least one uppercase character.
123
SUGGESTIONS-DISPLAY can be used to pass the pre-computed display strings of the
124
suggestions; otherwise `object-display' is used."
125
;; To sort by the display value, we store all the suggestions in a
126
;; (display-value real-value) list or pairs.
127
(let ((pairs (if suggestions-display
128
(mapcar #'list suggestions-display suggestions)
129
(mapcar (lambda (c) (list (object-display c) c)) suggestions))))
130
(if (not (sequence:emptyp input))
131
(let* ((input (substitute " " " " input))
132
(pairs (if (not (some 'lower-case-p input))
133
(mapcar (lambda (p) (list (string-downcase (first p)) (second p))) pairs)
135
(pairs (keep-exact-matches-in-suggestions input pairs))
136
(pairs (sort-suggestions input pairs)))
138
(pairs (mapcar (lambda (c)
140
(funcall score-suggestion (to-unicode input) (first c))))
142
;; Don't display more than 100 elements to avoid flooding stdout.
143
(if (< (length pairs) limit)
145
(nconc (subseq pairs 0 limit) (list "..."))))
146
(mapcar #'second pairs))
149
(defun file-suggestion-function (input files)
150
"Fuzzy-match this list of files."
151
(fuzzy-match input files))