Coverage report: /home/ellis/comp/core/lib/nlp/stem/porter.lisp
Kind | Covered | All | % |
expression | 268 | 753 | 35.6 |
branch | 48 | 204 | 23.5 |
Key
Not instrumented
Conditionalized out
Executed
Not executed
Both branches taken
One branch taken
Neither branch taken
1
;;; stem/porter.lisp --- Porter Stemming Algorithm
3
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4
;; The software is completely free for any purpose, unless notes at
5
;; the head of the program text indicates otherwise (which is
6
;; rare). In any case, the notes about licensing are never more
7
;; restrictive than the BSD License.
9
;; In every case where the software is not written by me (Martin
10
;; Porter), this licensing arrangement has been endorsed by the
11
;; contributor, and it is therefore unnecessary to ask the contributor
12
;; again to confirm it.
14
;; The Porter Stemming Algorithm, somewhat mechanically hand translated to Common Lisp by
15
;; Steven M. Haflich smh@franz.com Feb 2002. Most of the inline comments refer to the
16
;; original C code. At the time of this translation the code passes the associated Porter
17
;; test files. See the function test at the end of this file.
19
;; This port is intended to be portable ANSI Common Lisp. However, it has only been
20
;; compiled and tested with Allegro Common Lisp. This code is offered in the hope it will
21
;; be useful, but with no warranty of correctness, suitability, usability, or anything
22
;; else. The C implementation from which this code was derived was not reentrant, relying
23
;; on global variables. This implementation corrects that. It is intended that a word to
24
;; be stemmed will be in a string with fill-pointer, as this is a natural result when
25
;; parsing user input, web scraping, whatever. If not, a string with fill-pointer is
26
;; created, but this is an efficiency hit and is here intended only for lightweight use or
27
;; testing. Using some resource mechanism on these strings would be a useful improvement,
28
;; whether here or in the calling code.
30
;; This is the Porter stemming algorithm, coded up in ANSI C by the
31
;; author. It may be be regarded as cononical, in that it follows the
32
;; algorithm presented in
34
;; Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
37
;; only differing from it at the points maked --DEPARTURE-- below.
39
;; See also http://www.tartarus.org/~martin/PorterStemmer
41
;; The algorithm as described in the paper could be exactly replicated
42
;; by adjusting the points of DEPARTURE, but this is barely necessary,
43
;; because (a) the points of DEPARTURE are definitely improvements, and
44
;; (b) no encoding of the Porter stemmer I have seen is anything like
45
;; as exact as this version, even with the points of DEPARTURE!
47
;; You can compile it on Unix with 'gcc -O3 -o stem stem.c' after which
48
;; 'stem' takes a list of inputs and sends the stemmed equivalent to
51
;; The algorithm as encoded here is particularly fast.
55
;; The main part of the stemming algorithm starts here. b is a buffer
56
;; holding a word to be stemmed. The letters are in b[k0], b[k0+1] ...
57
;; ending at b[k]. In fact k0 = 0 in this demo program. k is readjusted
58
;; downwards as the stemming progresses. Zero termination is not in fact
59
;; used in the algorithm.
61
;; Note that only lower case sequences are stemmed. Forcing to lower case
62
;; should be done before stem(...) is called.
64
;; cons(i) is TRUE <=> b[i] is a consonant.
66
;;; Common Lisp port Version 1.01
69
;;; Common Lisp port Version history
71
;;; 1.0 -- smh@franz.com Feb 2002
74
;;; 1.01 -- smh@franz.com 25 Apr 2004
75
;;; step4 signalled error for "ion" "ions". Thanks to Jeff Heard
76
;;; for detecting this and suggesting the fix.
79
(in-package :nlp/stem/porter)
81
(defun consonantp (str i)
82
(let ((char (char str i)))
83
(cond ((member char '(#\a #\e #\i #\o #\u)) nil)
85
(if (= i 0) t (not (consonantp str (1- i)))))
88
;; m() measures the number of consonant sequences between k0 and j. if c is
89
;; a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
95
;; <c>vcvcvc<v> gives 3
102
(when (>= i lim) (return-from m n))
103
(if (not (consonantp str i)) (return nil))
108
(if (>= i lim) (return-from m n))
109
(if (consonantp str i) (return nil))
114
(if (>= i lim) (return-from m n))
115
(if (not (consonantp str i)) (return nil))
119
;; vowelinstem() is TRUE <=> k0,...j contains a vowel
121
(defun vowelinstem (str)
122
(loop for i from 0 below (fill-pointer str)
123
unless (consonantp str i) return t))
125
;; doublec(j) is TRUE <=> j,(j-1) contain a double consonant.
127
(defun doublec (str i)
129
((not (eql (char str i) (char str (1- i)))) nil)
130
(t (consonantp str i))))
132
;; cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant
133
;; and also if the second c is not w,x or y. this is used when trying to
134
;; restore an e at the end of a short word. e.g.
136
;; cav(e), lov(e), hop(e), crim(e), but
142
(not (consonantp str lim))
143
(consonantp str (1- lim))
144
(not (consonantp str (- lim 2))))
145
(return-from cvc nil))
146
(if (member (char str lim) '(#\w #\x #\y)) (return-from cvc nil))
149
;; ends(s) is TRUE <=> k0,...k ends with the string s.
151
(defun ends (str ending)
152
(declare (string str) (simple-string ending))
153
(let ((len1 (length str)) (len2 (length ending)))
155
for pa downfrom (1- len1) to 0
156
and pb downfrom (1- len2) to 0
157
unless (eql (char str pa) (char ending pb))
159
finally (return (when (< pb 0)
160
(decf (fill-pointer str) len2)
163
;; setto(s) sets (j+1),...k to the characters in the string s, readjusting k.
165
(defun setto (str suffix)
166
(declare (string str) (simple-string suffix))
167
(loop for char across suffix
168
do (vector-push-extend char str)))
170
;; r(s) is used further down.
173
(if (> (m str (fill-pointer str)) 0)
175
(setf (fill-pointer str) sfp)))
177
;; step1ab() gets rid of plurals and -ed or -ing. e.g.
179
;; caresses -> caress
187
;; disabled -> disable
198
(when (eql (char str (1- (fill-pointer str))) #\s)
199
(cond ((ends str "sses") (incf (fill-pointer str) 2))
200
((ends str "ies") (setto str "i"))
201
((not (eql (char str (- (fill-pointer str) 2)) #\s)) (decf (fill-pointer str)))))
202
(cond ((ends str "eed") (if (> (m str (fill-pointer str)) 0)
203
(incf (fill-pointer str) 2)
204
(incf (fill-pointer str) 3)))
205
((let ((sfp (fill-pointer str)))
206
(if (or (ends str "ed")
208
(if (vowelinstem str)
210
(progn (setf (fill-pointer str) sfp)
212
(cond ((ends str "at") (setto str "ate"))
213
((ends str "bl") (setto str "ble"))
214
((ends str "iz") (setto str "ize"))
215
((doublec str (1- (fill-pointer str)))
216
(unless (member (char str (1- (fill-pointer str))) '(#\l #\s #\z))
217
(decf (fill-pointer str))))
218
(t (if (and (= (m str (fill-pointer str)) 1)
219
(cvc str (fill-pointer str)))
223
;; step1c() turns terminal y to i when there is another vowel in the stem.
226
(let ((saved-fill-pointer (fill-pointer str)))
227
(when (and (ends str "y")
229
(setf (char str (fill-pointer str)) #\i))
230
(setf (fill-pointer str) saved-fill-pointer))
233
;; step2() maps double suffices to single ones. so -ization ( = -ize plus
234
;; -ation) maps to -ize etc. note that the string before the suffix must give
238
(let ((sfp (fill-pointer str)))
241
(case (char str (- (length str) 2))
242
(#\a (when (ends str "ational") (r str "ate" sfp) (return))
243
(when (ends str "tional") (r str "tion" sfp) (return)))
244
(#\c (when (ends str "enci") (r str "ence" sfp) (return))
245
(when (ends str "anci") (r str "ance" sfp) (return)))
246
(#\e (when (ends str "izer") (r str "ize" sfp) (return)))
247
(#\l (when (ends str "bli") (r str "ble" sfp) (return))
249
;; To match the published algorithm, replace prev line with
250
;; ((when (ends str "abli") (r str "able" sfp) (return))
251
(when (ends str "alli") (r str "al" sfp) (return))
252
(when (ends str "entli") (r str "ent" sfp) (return))
253
(when (ends str "eli") (r str "e" sfp) (return))
254
(when (ends str "ousli") (r str "ous" sfp) (return)))
255
(#\o (when (ends str "ization") (r str "ize" sfp) (return))
256
(when (ends str "ation") (r str "ate" sfp) (return))
257
(when (ends str "ator") (r str "ate" sfp) (return)))
258
(#\s (when (ends str "alism") (r str "al" sfp) (return))
259
(when (ends str "iveness") (r str "ive" sfp) (return))
260
(when (ends str "fulness") (r str "ful" sfp) (return))
261
(when (ends str "ousness") (r str "ous" sfp) (return)))
262
(#\t (when (ends str "aliti") (r str "al" sfp) (return))
263
(when (ends str "iviti") (r str "ive" sfp) (return))
264
(when (ends str "biliti") (r str "ble" sfp) (return)))
266
;; To match the published algorithm, delete next line.
267
(#\g (when (ends str "logi") (r str "log" sfp) (return)))))))
270
;; step3() deals with -ic-, -full, -ness etc. similar strategy to step2.
273
(let ((sfp (fill-pointer str)))
275
(case (char str (1- (length str)))
276
(#\e (when (ends str "icate") (r str "ic" sfp) (return))
277
(when (ends str "ative") (r str "" sfp) (return)) ; huh?
278
(when (ends str "alize") (r str "al" sfp) (return)))
279
(#\i (when (ends str "iciti") (r str "ic" sfp) (return)))
280
(#\l (when (ends str "ical") (r str "ic" sfp) (return))
281
(when (ends str "ful") (r str "" sfp) (return))) ; huh?
282
(#\s (when (ends str "ness") (r str "" sfp) (return))) ; huh?
286
;; step4() takes off -ant, -ence etc., in context <c>vcvc<v>.
289
(let ((sfp (fill-pointer str)))
290
(when (> sfp 2) ; Unnecessary?
292
(case (char str (- sfp 2))
293
(#\a (if (ends str "al") (return)))
294
(#\c (if (ends str "ance") (return))
295
(if (ends str "ence") (return)))
296
(#\e (if (ends str "er") (return)))
297
(#\i (if (ends str "ic") (return)))
298
(#\l (if (ends str "able") (return))
299
(if (ends str "ible") (return)))
300
(#\n (if (ends str "ant") (return))
301
(if (ends str "ement") (return))
302
(if (ends str "ment") (return))
303
(if (ends str "ent") (return)))
304
(#\o (if (ends str "ion")
305
(let ((len (length str)))
307
(let ((c (char str (1- len))))
308
(or (eql c #\s) (eql c #\t))))
310
(setf (fill-pointer str) sfp))))
311
(if (ends str "ou") (return))) ; takes care of -ous
312
(#\s (if (ends str "ism") (return)))
313
(#\t (if (ends str "ate") (return))
314
(if (ends str "iti") (return)))
315
(#\u (if (ends str "ous") (return)))
316
(#\v (if (ends str "ive") (return)))
317
(#\z (if (ends str "ize") (return))))
318
(return-from step4 str))
319
(unless (> (m str (fill-pointer str)) 1)
320
(setf (fill-pointer str) sfp)))
323
;; step5() removes a final -e if m() > 1, and changes -ll to -l if m() > 1.
326
(let ((len (fill-pointer str)))
327
(if (eql (char str (1- len)) #\e)
328
(let ((a (m str len)))
331
(not (cvc str (1- len)))))
332
(decf (fill-pointer str))))))
333
(let ((len (fill-pointer str)))
334
(if (and (eql (char str (1- len)) #\l)
335
(doublec str (1- len))
337
(decf (fill-pointer str))))
340
;; In stem(p,i,j), p is a char pointer, and the string to be stemmed is from p[i] to p[j]
341
;; inclusive. Typically i is zero and j is the offset to the last character of a string,
342
;; (p[j+1] == '\0'). The stemmer adjusts the characters p[i] ... p[j] and returns the new
343
;; end-point of the string, k. Stemming never increases word length, so i <= k <= j. To
344
;; turn the stemmer into a module, declare 'stem' as extern, and delete the remainder of
348
(let ((len (length str)))
349
;; With this line, strings of length 1 or 2 don't go through the
350
;; stemming process, although no mention is made of this in the
351
;; published algorithm. Remove the line to match the published
353
(if (<= len 2) (return-from stem str)) ; /*-DEPARTURE-*/
354
(if (typep str 'simple-string) ; Primarily for testing.
356
(make-array len :element-type 'character
357
:fill-pointer len :initial-contents str)))
358
(step1ab str) (step1c str) (step2 str) (step3 str) (step4 str) (step5 str)
362
(trace step1ab step1c step2 step3 step4 step5)
365
(defun test () ; Run against the distributed test files.
366
(with-open-file (f1 "voc.txt")
367
(with-open-file (f2 "output.txt")
368
(loop as w1 = (read-line f1 nil nil)
370
as w2 = (read-line f2 nil nil)
373
count t into successes
374
else count t into failures
375
and do (format t "(stem ~s) => ~s wanted ~s~%" w1 w3 w2)
376
finally (progn (format t "sucesses ~d failures ~d~%" successes failures)
377
(return failures))))))