Coverage report: /home/ellis/comp/core/lib/nlp/dbscan.lisp
Kind | Covered | All | % |
expression | 43 | 102 | 42.2 |
branch | 0 | 8 | 0.0 |
Key
Not instrumented
Conditionalized out
Executed
Not executed
Both branches taken
One branch taken
Neither branch taken
1
;;; dbscan --- Density-based spacial clustering of applications with noise (DBSCAN) algorithm
4
(in-package :nlp/dbscan)
6
(defclass document-cluster (document-vertex)
7
((cluster :accessor cluster :initform :noise)
8
(neighbors :accessor neighbors))
9
(:documentation "The document cluster class represents a document
10
that is part of a graph which will be clustered. It extends the
11
documenet-vertex class and adds support for a cluster tag and a list
12
of neighbors. These slots are useful for clustering algorithms."))
14
(defmethod clusters ((collection document-collection))
15
"Return a list of clusters. Each hash key represents a cluster, and
16
the hash value is the list of elements in that cluster.
18
Please note: this function is not responsible for computing the
19
clusters, only for returning the list of pre-tagged documents in
21
(let ((result (make-hash-table)))
22
(loop for document in (documents collection)
23
do (push document (gethash (cluster document) result (list))))
26
(defun get-cluster (cluster-label points)
27
"Return all matching points for a given cluster label."
28
(remove-if-not (lambda (i) (eq (cluster i) cluster-label)) points))
30
(defmethod distance ((vector-1 t) (vector-2 t))
31
"Return the Euclidean distance between two vectors."
32
(sqrt (loop for i across vector-1
34
sum (expt (- i j) 2))))
36
(defmethod distance ((document-a document-cluster) (document-b document-cluster))
37
(distance (vector-data document-a) (vector-data document-b)))
39
(defmethod generate-document-distance-vectors ((collection document-collection))
40
"Set the edge weights for all document neighbors (graph is fully connected)."
41
(with-accessors ((documents documents)) collection
42
(loop for document-a in documents
43
do (loop for document-b in documents
44
do (setf (gethash document-b (edges document-a))
45
(distance document-a document-b))))))
47
(defmethod dbscan ((collection document-collection) &key (minimum-points 3)
49
"Minimum points refers to the minimum amount of points that must
50
exist in the neighborhood of a point for it to be considered a
51
core-point in a cluster. Epsilon refers to the distance between
52
two points for them to be considered neighbors."
53
(labels ((range-query (document)
54
"Return all points that have a distance less than epsilon."
55
(loop for vertex being the hash-keys of (edges document)
56
when (and (<= (gethash vertex (edges document)) epsilon)
57
(not (eq vertex document)))
60
"Is a point a core-point?"
61
(<= minimum-points (length (range-query point))))
62
(cluster-match-p (point cluster)
63
"Check if a core point belongs to a cluster."
64
(intersection cluster (range-query point))))
65
;;; identify core points
66
(let* ((core-points (remove-if-not #'core-point-p (documents collection)))
67
(non-core-points (set-difference (documents collection) core-points)))
68
;;; assign labels to core points
69
(loop for point in core-points
70
with cluster-count = 0
71
do (loop named cluster-set
72
for i from 0 to cluster-count
73
;; point found cluster match, setf and break
74
when (cluster-match-p point (get-cluster i core-points))
75
do (setf (cluster point) i)
76
(return-from cluster-set)
77
;; point found no cluster-match, create new cluster
78
finally (setf (cluster point) (incf cluster-count))))
79
;;; assign labels to non-core points
80
(loop for point in non-core-points
81
for intersection = (intersection core-points (range-query point))
83
do (setf (cluster point) (cluster (first intersection)))))))