Coverage report: /home/ellis/comp/core/lib/nlp/tokenize.lisp
Kind | Covered | All | % |
expression | 30 | 36 | 83.3 |
branch | 0 | 0 | nil |
Key
Not instrumented
Conditionalized out
Executed
Not executed
Both branches taken
One branch taken
Neither branch taken
1
(in-package :nlp/tokenize)
3
(defun word-tokenize (string &key (remove-stop-words t) (stem nil) (down-case t) (alphabetic t))
4
"Split a string into a list of words."
5
(let* ((tokens (split " " (collapse-whitespaces string)))
6
(tokens (if remove-stop-words
7
(delete-if (lambda (x) (gethash (string-downcase x) (stop-words-lookup *language-data*))) tokens)
10
(mapcar #'stem tokens)
13
(mapcar #'string-downcase tokens)
15
(tokens (if alphabetic
16
(delete-if-not (lambda (x) (cl-ppcre:scan "^[A-Za-z]*$" x)) tokens)
20
;; SENTENCE-TOKENIZE -- break STRING into sentence-like pieces.
;;
;; Steps, innermost first:
;;   1. cl-ppcre:split cuts STRING at every single `.`, `!` or `?`
;;      (the "[.!?]" character class); the matched punctuation is not
;;      kept in the resulting pieces.
;;   2. std:trim is mapped over the pieces (presumably strips surrounding
;;      whitespace -- confirm against the definition of std:trim).
;;   3. REMOVE with :test #'equal drops every piece that is the empty
;;      string, e.g. those produced by consecutive terminators.
;;
;; Returns a list of strings.
;;
;; NOTE(review): because the split is on the raw characters, periods
;; inside abbreviations or decimal numbers also act as boundaries.
;; NOTE(review): the bare numbers on their own lines below (21, 22) are
;; line-number gutter text from the coverage report, not part of the form.
(defun sentence-tokenize (string)
21
"Split a string into a list of sentences."
22
(remove "" (mapcar #'std:trim (cl-ppcre:split "[.!?]" string)) :test #'equal))