Coverage report: /home/ellis/comp/core/lib/nlp/tokenize.lisp

Kind	Covered	All	%
expression	30	36	83.3
branch	0	0	nil

Key

Not instrumented

Conditionalized out

Executed

Not executed

Both branches taken

One branch taken

Neither branch taken

1

(in-package :nlp/tokenize)

2

3

(defun word-tokenize (string &key (remove-stop-words t) (stem nil) (down-case t) (alphabetic t))
  "Split STRING into a list of word tokens.

STRING is passed through COLLAPSE-WHITESPACES and split on single spaces.
The following optional steps are then applied, in this order:

  REMOVE-STOP-WORDS  when true (the default), drop every token whose
                     down-cased form has a true entry in the table
                     returned by (stop-words-lookup *language-data*).
  STEM               when true, replace every token with the result of
                     calling the function STEM on it.
  DOWN-CASE          when true (the default), down-case every token.
  ALPHABETIC         when true (the default), keep only non-empty tokens
                     consisting solely of the ASCII letters A-Z / a-z.

Returns the resulting list of strings."
  ;; REMOVE-IF / REMOVE-IF-NOT are used instead of their destructive
  ;; counterparts so the list handed back by SPLIT is never modified in place.
  (let* ((tokens (split " " (collapse-whitespaces string)))
         (tokens (if remove-stop-words
                     (remove-if (lambda (x)
                                  ;; Stop words are looked up case-insensitively.
                                  (gethash (string-downcase x)
                                           (stop-words-lookup *language-data*)))
                                tokens)
                     tokens))
         (tokens (if stem
                     ;; #'stem is the function named STEM, not the keyword
                     ;; argument of the same name (separate namespaces).
                     (mapcar #'stem tokens)
                     tokens))
         (tokens (if down-case
                     (mapcar #'string-downcase tokens)
                     tokens))
         (tokens (if alphabetic
                     ;; `+' rather than `*': the empty string is not an
                     ;; alphabetic word and must not pass this filter.
                     (remove-if-not (lambda (x) (cl-ppcre:scan "^[A-Za-z]+$" x))
                                    tokens)
                     tokens)))
    tokens))

19

20

(defun sentence-tokenize (string)
  "Break STRING at every `.', `!' or `?' and return the pieces as a list.

Each piece is passed through STD:TRIM, and pieces that are then equal to
the empty string are left out of the result."
  (let* ((pieces (cl-ppcre:split "[.!?]" string))
         (trimmed (mapcar #'std:trim pieces)))
    (remove-if (lambda (piece) (equal "" piece)) trimmed)))