;;; for decomposing text (defun split (str) (let ((ignored (list #\? #\! #\. #\, #\; #\" #\Space #\Newline #\Tab #\Return))) (loop for word in (sb-unicode:words str) unless (member word ignored :test #'string=) collect word))) (defun stop-word? (word) (let ((stop-words (list "a" "about" "above" "after" "again" "against" "all" "am" "an" "and" "any" "are" "as" "at" "be" "because" "been" "before" "being" "below" "between" "both" "but" "by" "can" "did" "do" "does" "doing" "don" "down" "during" "each" "few" "for" "from" "further" "had" "has" "have" "having" "he" "her" "here" "hers" "herself" "him" "himself" "his" "how" "i" "if" "in" "into" "is" "it" "its" "itself" "just" "me" "more" "most" "my" "myself" "no" "nor" "not" "now" "of" "off" "on" "once" "only" "or" "other" "our" "ours" "ourselves" "out" "over" "own" "s" "same" "she" "should" "so" "some" "such" "t" "than" "that" "the" "their" "theirs" "them" "themselves" "then" "there" "these" "they" "this" "those" "through" "to" "too" "under" "until" "up" "very" "was" "we" "were" "what" "when" "where" "which" "while" "who" "whom" "why" "will" "with" "you" "your" "yours" "yourself" "yourselves"))) (member word stop-words :test #'string=))) (defun stem-tokens (words) (loop for word in words collect (stem word))) (defun analyze (str) (let* ((lowercased (string-downcase str)) (individual-words (split lowercased)) (interesting-words (remove-if #'stop-word? individual-words)) (stemmed (stem-tokens interesting-words)) (analyzed (remove-duplicates stemmed :test #'string=))) analyzed)) ;;; index and querying (defparameter *INDEX* (make-hash-table :test #'equal)) (defstruct (document) id text) (defun add (documents) (loop for doc in documents do (loop for token in (analyze (document-text doc)) do (pushnew (document-id doc) (gethash token *INDEX* '()))))) (defun query (str) (let ((indices (loop for token in (analyze str) when (gethash token *INDEX*) collect it))) (reduce #'intersection (or indices (list NIL)))))