diff --git a/NEWS b/NEWS index bdb4169..74e4354 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,16 @@ Release 0.3 (not yet released) +* A new protocol enables clients to find dictionary entries that are similar to + a given string or corrections for a given misspelled word. + + The following new functions provide increasingly abstract functionality for + enumerating similar words and corrections: SPELL:MAP-SIMILAR, + SPELL:MAP-CORRECTIONS and SPELL:CORRECTIONS. + + For convenience, the function SPELL:ENGLISH-CORRECTIONS automatically uses + the English dictionary and considers the appropriate case variants of the + supplied string. + * Documentation is now available in the documentation directory. * The new function MAP-ENTRIES calls a supplied function for each entry in a diff --git a/README.org b/README.org index 2a56201..3046672 100644 --- a/README.org +++ b/README.org @@ -2,14 +2,14 @@ * Introduction - SPELL is a spellchecking library for Common Lisp. + SPELL is a spellchecking library for Common Lisp. It is made + available under the BSD license. - It is made available under the BSD license. - - Loading the ~spell~ system may initially take up to between, say, 3 - and 60 seconds (depending on the machine and CL implementation) as - an English dictionary is loaded and compiled into the resulting FASL - file. + Loading (possibly after compiling) the ~spell~ system may initially + take between something like 3 and 60 seconds, depending on the + machine and CL implementation, as an English dictionary is loaded, + optimized and compiled into a FASL file. Subsequent load operations + (without compiling) should finish within well below one second. For loading the ~spell~ system, use @@ -17,12 +17,16 @@ (ql:quickload "spell") #+end_src - Currently the only exported functions are ~spell:english-lookup~ - that accepts a string, and ~spell:english-check-paragraph~ that - checks a whole paragraph of text and returns a list of conses. Each - cons represents a single word in the paragraph which has failed - dictionary lookup, with the ~car~ and ~cdr~ being offsets in the - original string outlining the word. + This document gives only a very brief overview and highlights some + features. Proper documentation can be found in the + file:documentation directory. + +* Looking up Words + + The exported functions for looking up words are + ~spell:english-lookup~ which accepts a string, and + ~spell:english-check-paragraph~ which checks a whole paragraph of + text and returns a list of conses: #+begin_src lisp :exports both (spell:english-lookup "horse") @@ -30,8 +34,8 @@ #+RESULTS: #+begin_example - (# - #) + (# + #) #+end_example #+begin_src lisp :exports both @@ -44,6 +48,31 @@ ((22 . 25) (47 . 50) (51 . 56)) #+end_example + Each cons represents a single word in the paragraph which has failed + dictionary lookup, with the ~car~ and ~cdr~ being offsets in the + original string outlining the word. + +* Obtaining Corrections + + The SPELL library exports a few functions that obtain similar words + for a given word or corrections for a misspelled word. The most + convenient function is ~spell:english-corrections~ which returns a + list of corrections for a (possibly) misspelled English word; + + #+begin_src lisp :exports both :results value verbatim + (spell:english-corrections "lifp" :threshold 1) + #+end_src + + #+RESULTS: + #+begin_example + ("lift" "life" "lip" "lisp" "limp") + NIL + #+end_example + + For more ways to use this function as well as the information about + the lower-level functions for similar words and corrections, see the + main documentation. + * Backward Compatibility Notice The SPELL library provides the ~spell/simple~ ASDF system. The @@ -61,5 +90,5 @@ #+end_src # Local Variables: -# eval: (load-library 'ob-lisp) +# eval: (load-library "ob-lisp") # End: diff --git a/code/compact-trie.lisp b/code/compact-trie.lisp index 4b54442..b00f09c 100644 --- a/code/compact-trie.lisp +++ b/code/compact-trie.lisp @@ -274,7 +274,7 @@ ;; Now that children and compact entries have been handled, if NODE ;; is also a leaf, call the next method which is the one specialized ;; to `leaf-mixin' to handle the non-compact entries. - (when (typep node 'leaf-mixin) + (when (leafp node) (call-next-method))) (defmethod compact-node-slots append ((node raw-interior-mixin) (depth integer)) @@ -323,6 +323,9 @@ (defclass compact-interior-node (compact-interior-mixin compact-node) ()) +(defmethod leafp ((node compact-interior-node)) + nil) + (defmethod node-lookup ((function function) (string string) (suffix (eql 0)) diff --git a/code/english.lisp b/code/english.lisp index 811108f..76a0c76 100644 --- a/code/english.lisp +++ b/code/english.lisp @@ -32,27 +32,37 @@ (defparameter *english-dictionary* #.(load-english-dictionary)) +;;; Query functions + +(declaim (inline map-english-case-variants)) +(defun map-english-case-variants (function word) + (let ((function (a:ensure-function function))) + (funcall function word) + (when (plusp (length word)) + (let* ((initial (aref word 0)) + (downcased (char-downcase initial))) + (unless (char= initial downcased) + ;; We change, for example, "Anti-Semitic" at the beginning + ;; of a sentence to "anti-Semitic" which is in the + ;; dictionary. + (let ((decapitalized (copy-seq word))) + (setf (aref decapitalized 0) downcased) + (funcall function decapitalized)) + ;; We change, for example, "PARAMETER" (which is typical for + ;; some commenting styles) to "parameter" which is in the + ;; dictionary. + (when (every #'upper-case-p word) + (funcall function (string-downcase word)))))))) +(declaim (notinline map-english-case-variants)) + (defun english-lookup (word) - (when (and word (string/= word "")) - (let ((dictionary *english-dictionary*)) - (flet ((try (variant) - (a:when-let ((result (lookup variant dictionary))) - (return-from english-lookup result)))) - (try word) - (let* ((initial (aref word 0)) - (downcased (char-downcase initial))) - (unless (char= initial downcased) - ;; We change, for example, "Anti-Semitic" at the beginning - ;; of a sentence to "anti-Semitic" which is in the - ;; dictionary. - (let ((decapitalized (copy-seq word))) - (setf (aref decapitalized 0) downcased) - (try decapitalized)) - ;; We change, for example, "PARAMETER" (which is typical - ;; for some commenting styles) to "parameter" which is in - ;; the dictionary. - (when (every #'upper-case-p word) - (try (string-downcase word))))))))) + (let ((dictionary *english-dictionary*)) + (flet ((try (variant) + (a:when-let ((result (lookup variant dictionary))) + (return-from english-lookup result)))) + (declare (dynamic-extent #'try)) + (locally (declare (inline map-english-case-variants)) + (map-english-case-variants #'try word))))) (declaim (inline english-text-char-p find-start find-end)) (defun english-text-char-p (character) @@ -85,3 +95,11 @@ do (setf position word-end) unless (english-lookup (subseq string word-start word-end)) collect (cons word-start word-end))) + +(defun english-corrections (string &key (threshold 2) + (variants 'map-english-case-variants) + (group-by :spelling) + (count nil)) + (corrections string *english-dictionary* threshold :variants variants + :group-by group-by + :count count)) diff --git a/code/package.lisp b/code/package.lisp index 0704c15..e3c388d 100644 --- a/code/package.lisp +++ b/code/package.lisp @@ -65,9 +65,13 @@ #:entry-count #:map-entries #:lookup + #:map-similar + #:map-corrections + #:corrections #:insert #:load-dictionary) (:export #:english-lookup - #:english-check-paragraph)) + #:english-check-paragraph + #:english-corrections)) diff --git a/code/protocol.lisp b/code/protocol.lisp index 6db1f29..561e551 100644 --- a/code/protocol.lisp +++ b/code/protocol.lisp @@ -8,18 +8,34 @@ (defgeneric lookup (string dictionary)) +(defgeneric map-similar (function string dictionary threshold &key group-by)) + +(defgeneric map-corrections (function string dictionary threshold + &key variants group-by)) + +(defgeneric corrections (string dictionary threshold + &key variants group-by count)) + (defgeneric insert (object string dictionary)) (defgeneric load-dictionary (source &key into)) ;;; Trie node protocols +(defgeneric leafp (node)) + +(defgeneric interiorp (node)) + ;;; Lookup protocol (defgeneric node-lookup (function string suffix node)) (defgeneric map-node-entries (function node characters)) +;;; Similar protocol + +(defgeneric node-map-similar (function string suffix node threshold characters)) + ;;; Insert protocol (defgeneric node-insert (object string suffix node)) diff --git a/code/raw-trie.lisp b/code/raw-trie.lisp index 1073ce1..4750e07 100644 --- a/code/raw-trie.lisp +++ b/code/raw-trie.lisp @@ -151,6 +151,9 @@ (defclass raw-interior-node (raw-interior-mixin interior-node raw-node) ()) +(defmethod leafp ((node raw-interior-node)) + nil) + #-minimal-raw-trie (defmethod node-lookup ((function function) (string string) (suffix (eql 0)) (node raw-interior-node)) diff --git a/code/similar.lisp b/code/similar.lisp new file mode 100644 index 0000000..62e908a --- /dev/null +++ b/code/similar.lisp @@ -0,0 +1,277 @@ +;;;; This file contains the implementation of the family of functions +;;;; that find dictionary entries which are similar to a given string. + +(cl:in-package #:spell) + +;;; The generic function `node-map-similar' accepts a query string, a +;;; remaining suffix length within that string and a threshold for the +;;; number of remaining edit operations. `node-map-similar' traverses +;;; the trie, visiting nodes that can be reached by applying a number +;;; of insert, delete and change edits to the query string that is +;;; below the threshold. Recursive calls to `node-map-similar' +;;; advance the suffix index and possibly decrease the threshold until +;;; either a leaf is reached, the suffix is empty or the threshold +;;; drops below 0. The first two cases may correspond to having +;;; located entries that are "similar enough" to the query string with +;;; respect to the initial threshold. +;;; +;;; The basic traversal that is performed by `node-map-similar' is +;;; similar to that performed by `map-node-entries'. The only (big) +;;; difference is the inclusion of similar children and entries. + +;;; This method handles interior nodes by considering each child and +;;; its key which is either a character or a string. In any case, if +;;; possible, the traversal continues in the respective child with +;;; SUFFIX, THRESHOLD and CHARACTERS adjusted to account for the +;;; matching (possible via "editing" STRING) of the key to a +;;; sub-sequence of STRING. +(defmethod node-map-similar ((function function) + (string string) + (suffix integer) + (node interior-mixin) + (threshold integer) + (characters t)) + (declare (type simple-string string) + (type a:array-index suffix threshold) + (type list characters)) + ;; First look at the keys of all children and determine which + ;; children can be traversed within the remaining THRESHOLD. + (let ((string-length (length string))) + (labels ((visit-child (child suffix threshold characters) + (funcall function child suffix threshold characters) + nil) ; return predictable value(s) + (consider-child/char (key child suffix threshold) + ;; The key is a single character try the three edit + ;; operations if THRESHOLD permits. + (declare (type a:array-index suffix threshold)) + (when (plusp threshold) + ;; "insert" edit + (let ((characters (list* key characters))) + (visit-child child suffix (1- threshold) characters)) + ;; "delete" edit + (when (plusp suffix) + (consider-child/char key child (1- suffix) (1- threshold)))) + ;; Match or "change" edit + (cond ((= suffix 0)) + ((let ((offset (- string-length suffix))) + (char= key (aref string offset))) + (let ((characters (list* key characters))) + (visit-child child (1- suffix) threshold characters))) + ((plusp threshold) + (let ((characters (list* key characters))) + (visit-child child (1- suffix) (1- threshold) characters))))) + (consider-child/string (key child suffix threshold) + (let ((key-length (length key))) + ;; The key is a string. Try the three edit operations + ;; for each character where THRESHOLD permits. + (declare (type simple-string key)) + (labels + ((rec (index suffix threshold characters) + (declare (type a:array-index suffix threshold)) + (cond ((= index key-length) + (visit-child child suffix threshold characters)) + ((let ((offset (- string-length suffix))) + (and (plusp suffix) + (char= (aref key index) + (aref string offset)))) + (let ((characters (list* (aref key index) + characters))) + (rec (1+ index) (1- suffix) threshold characters))) + ((plusp threshold) + ;; "insert" character into query string + (let ((characters (list* (aref key index) + characters))) + (rec (1+ index) suffix (1- threshold) characters)) + (when (plusp suffix) + ;; "delete" character from query string + (rec index (1- suffix) (1- threshold) characters) + ;; match or "change" edit + (let ((characters (list* (aref key index) + characters))) + (rec (1+ index) (1- suffix) (1- threshold) characters))))))) + (declare (dynamic-extent #'rec)) + (rec 0 suffix threshold characters)))) + (consider-child (key child) + (etypecase key + (character + (consider-child/char key child suffix threshold)) + (string + (consider-child/string key child suffix threshold))) + ;; `map-children' can write back modifications, so + ;; avoid returning anything that would be written back. + nil)) + (declare (dynamic-extent #'visit-child + #'consider-child/char + #'consider-child/string + #'consider-child)) + (map-children #'consider-child node (%children node))))) + +;;;; Dictionary-level methods + +(defmethod %map-similar ((function function) + (string string) + (dictionary dictionary) + (threshold integer)) + (check-type threshold a:array-index) + ;; We keep discovered but not yet visited nodes in worklists that + ;; group the queued nodes by "remaining threshold" which is roughly + ;; (- THRESHOLD ) that the node has + ;; accrued so far. Since we want to visit nodes with small + ;; edit-distances first, we dequeue from the bucket with the highest + ;; remaining threshold. + (let ((leaf-worklist (make-array (1+ threshold) :initial-element '())) + (node-worklist (make-array (1+ threshold) :initial-element '()))) + (labels ((enqueue (node suffix remaining-threshold characters) + (declare (type a:array-index suffix)) + ;; Push NODE into the buckets for REMAINING-THRESHOLD + ;; in NODE-WORKLIST and LEAF-WORKLIST as appropriate. + (let ((base (cons node characters))) ; shared list tail + ;; If NODE is a leaf and the entire STRING has been + ;; matched (modulo edits and possibly dropping a + ;; suffix), enqueue NODE for `visit-leaf'. + (when (and (<= suffix remaining-threshold) (leafp node)) + (push base (aref leaf-worklist (- remaining-threshold suffix)))) + ;; If NODE has children, queue NODE for recursive + ;; traversal. + (when (interiorp node) + (push (cons suffix base) + (aref node-worklist remaining-threshold))))) + (visit-leaf (element remaining-threshold) + (let ((node (car element)) + (characters (cdr element))) + (declare (type list characters)) + ;; STRING has been matched (modulo edits), report all + ;; entries in NODE (which may be a proper node or + ;; some compact representation) to FUNCTION. + (flet ((visit-entry (entry) + (let ((spelling (nreverse (coerce characters 'string))) + (distance (- threshold remaining-threshold))) + (funcall function spelling entry distance)))) + (declare (dynamic-extent #'visit-entry)) + (typecase node + ((or integer cons) + (visit-entry node)) + (vector + (map nil #'visit-entry node)) + (t + (map-leaf-entries #'visit-entry node (%entries node))))))) + (visit-node (element remaining-threshold) + (destructuring-bind (suffix node . characters) element + ;; Enqueue children of NODE that have a chance to + ;; match within REMAINING-THRESHOLD into the + ;; appropriate bucket of NODE-WORKLIST. + (node-map-similar + #'enqueue string suffix node remaining-threshold characters)))) + (declare (dynamic-extent #'enqueue #'visit-leaf #'visit-node)) + ;; Seed NODE-WORKLIST (and possibly LEAF-WORKLIST) with the root + ;; node. + (enqueue (contents dictionary) (length string) threshold '()) + ;; Repeatedly grab entries with the smallest edit-distance (so + ;; far) from either worklist. + (loop :while (loop :for i :from threshold :downto 0 + :do (a:when-let ((cluster (aref leaf-worklist i))) + (setf (aref leaf-worklist i) (rest cluster)) + (visit-leaf (first cluster) i) + (return t)) + (a:when-let ((cluster (aref node-worklist i))) + (setf (aref node-worklist i) (rest cluster)) + (visit-node (first cluster) i) + (return t)) + :finally (return nil))))) + nil) + +(defmethod map-similar ((function t) + (string string) + (dictionary dictionary) + (threshold integer) + &key (group-by :entry)) + ;; We call `%map-similar' for the basic traversal and add two + ;; aspects: 1) depending on GROUP-BY, expand compact entries to + ;; `word' instances 2) detect duplicate either spellings or + ;; spelling-word pairs and do not report duplicates to FUNCTION. + (check-type threshold a:array-index) + (check-type group-by (member :spelling :entry)) + (let ((function (a:ensure-function function)) + (string (coerce string 'simple-string)) + (seen (make-hash-table :test #'equal))) + (cl:case group-by + (:spelling + (flet ((result (spelling entry distance) + (declare (type a:array-index distance) + (ignore entry)) + (let ((existing (gethash spelling seen))) + (declare (type (or null a:array-index) existing)) + (when (null existing) + (setf (gethash spelling seen) distance) + (funcall function spelling distance))))) + (declare (dynamic-extent #'result)) + (%map-similar #'result string dictionary threshold))) + (:entry + (flet ((result (spelling entry distance) + (declare (type a:array-index distance)) + (let* ((key (cons spelling entry)) + (existing (gethash key seen))) + (declare (type (or null a:array-index) existing)) + (when (null existing) + (setf (gethash key seen) distance) + (let ((word (expand-entry entry spelling))) + (funcall function spelling word distance)))))) + (declare (dynamic-extent #'result)) + (%map-similar #'result string dictionary threshold)))))) + +(defmethod map-corrections ((function function) + (string string) + (dictionary dictionary) + (threshold integer) + &key (variants (lambda (continuation string) + (funcall continuation string))) + (group-by :entry)) + (check-type threshold a:array-index) + (check-type group-by (member :spelling :entry)) + (let ((variants (a:ensure-function variants))) + (labels ((report/spelling (spelling distance) + (funcall function spelling distance)) + (report/entry (spelling node distance) + (funcall function spelling node distance)) + (try (variant) + (let ((collector (cl:case group-by + (:spelling #'report/spelling) + (:entry #'report/entry)))) + (map-similar collector variant dictionary threshold + :group-by group-by)))) + (declare (dynamic-extent #'report/spelling #'report/entry #'try)) + ;; TODO: this is wrong since results will not be reported in + ;; order of increasing edit-distance but primarily in order of + ;; variants tried. + (funcall variants #'try string)))) + +(defmethod corrections ((string string) + (dictionary dictionary) + (threshold integer) + &key (group-by :entry) + (variants nil variants-supplied-p) + count) + (check-type count (or null a:array-index)) + (let ((results '()) + (correct? nil) + (remaining count)) + ;; The results arrive in increasing-edit-distance order. We + ;; expect a small-ish number of results so we use a list. + (block nil + (macrolet ((handler ((&rest parameters) value-form) + `(lambda (,@parameters distance) + (when (zerop distance) + (setf correct? t)) + (cond ((null remaining)) + ((eql remaining 0) (return)) + (t (decf remaining))) + (push ,value-form results)))) + (apply #'map-corrections + (ecase group-by + (:spelling (handler (spelling) spelling)) + (:entry (handler (spelling word) (cons word spelling)))) + string dictionary threshold :group-by group-by + (when variants-supplied-p (list :variants variants))))) + ;; Reverse so that the results with the smallest edit-distance are + ;; at the beginning of the result list. + (values (nreverse results) correct?))) diff --git a/code/trie.lisp b/code/trie.lisp index cb1ba86..c55447a 100644 --- a/code/trie.lisp +++ b/code/trie.lisp @@ -1,5 +1,13 @@ (cl:in-package #:spell) +;;; Trie node protocol default behavior + +(defmethod leafp ((node t)) + t) + +(defmethod interiorp ((node t)) + nil) + ;;; `node' class (defclass node (utilities.print-items:print-items-mixin) ()) @@ -37,6 +45,9 @@ object (%children object)) `((:children "~D ~:*child~[ren~;~:;ren~]" ,child-count)))) +(defmethod interiorp ((node interior-mixin)) + t) + ;;; Concrete node classes (defclass interior-node (interior-mixin node) ()) ; TODO: are these useful? diff --git a/data/changes.sexp b/data/changes.sexp index 10b8cca..b25b049 100644 --- a/data/changes.sexp +++ b/data/changes.sexp @@ -1,5 +1,20 @@ (:changes (:release "0.3" nil + (:item + (:paragraph + "A" "new" "protocol" "enables" "clients" "to" "find" "dictionary" "entries" + "that" "are" "similar" "to" "a" "given" "string" "or" "corrections" "for" + "a" "given" "misspelled" "word" ".") + (:paragraph + "The" "following" "new" "functions" "provide" "increasingly" "abstract" + "functionality" "for" "enumerating" "similar" "words" "and" "corrections:" + (:symbol "spell:map-similar") "," (:symbol "spell:map-corrections") "and" + (:symbol "spell:corrections") ".") + (:paragraph + "For" "convenience" "," "the" "function" + (:symbol "spell:english-corrections") "automatically" "uses" "the" "English" + "dictionary" "and" "considers" "the" "appropriate" "case" "variants" "of" + "the" "supplied" "string" ".")) (:item (:paragraph "Documentation" "is" "now" "available" "in" "the" (:tt "documentation") diff --git a/documentation/chapter-external-protocols.texi b/documentation/chapter-external-protocols.texi index a815cba..22292ff 100644 --- a/documentation/chapter-external-protocols.texi +++ b/documentation/chapter-external-protocols.texi @@ -70,6 +70,76 @@ das Pferd." @end lisp @end deffn +@deffuna{english-corrections,@toppackage{}} string @pkey{} threshold @ + variants group-by count + +Return entries from the English dictionary that are at most +@var{threshold} removed from @var{string} in terms of edit-distance. +Return two values: a list of entries and a Boolean which indicates +whether @var{string} is spelled correctly according to the English +dictionary. By default, the elements of the list that is the first +return value are unique strings that correspond to the spellings of +matching dictionary entries. The caller can use the @var{group-by} +parameter (see below) to make the function return pairs of +@ref{Class @toppackage{}|word,word} instances and spellings instead. + +If supplied, @var{threshold} has to be a non-negative integer that +controls the maximum edit-distance of returned corrections from +@var{string}. Note that the value @t{0} for @var{threshold} is not very +useful since @ref{Function @toppackage{}|english-lookup,english-lookup} +is a simpler way to perform that operation. + +If supplied, @var{variants} has to be a function that accepts as its +arguments a string and another function which it must call once for each +case variant of the string that should be considered. The default value +is a more complete and sophisticated version of the following: +@lisp +(lambda (continuation string) + (funcall continuation string) + (when @samp{certain-conditions} + (funcall continuation (string-decapitalized string))) + (when @samp{certain-conditions}) + (funcall continuation (string-downcase string))) +@end lisp + +If supplied, @var{group-by} has to be either @t{:spelling} or +@t{:entry}. It controls whether word objects are included in elements +of the returned list. If @var{group-by} is @t{:entry}, elements are of +the form @t{(@var{word} . @var{spelling})}. If @var{group-by} is +@t{:spelling}, elements are strings and each string is one spelling. +The default is @t{:spelling}. Note that the value of @var{group-by} +influences the number returned elements since a spelling that +corresponds to multiple entries will be reported once for @t{:spelling} +but multiple times for @t{:entry}. + +If supplied, @var{count} controls the maximum number of entries that are +returned as the first value. + +Examples: +@lisp +(spell:english-corrections "lisp" :threshold 1) +@result{} ("lisp" "lisps" "list" "lip" "limp" "wisp") T + +(spell:english-corrections "lifp" :threshold 1) +@result{} ("lift" "life" "lip" "lisp" "limp") NIL + +(spell:english-corrections + "abc" :threshold 1 + :variants (lambda (continuation string) + (funcall continuation (string-upcase string)))) +@result{} ("ABC" "ABCs") T + +(spell:english-corrections + "abc" :threshold 1 + :variants (lambda (continuation string) + (funcall continuation (string-upcase string))) + :group-by :entry) +@result{} ((# . "ABC") + (# . "ABCs")) + T +@end lisp +@end deffn + @node Word Protocols @section Word Protocols @@ -139,6 +209,220 @@ Example: @end lisp @end deffn +@defgena{map-similar,@toppackage{}} function string dictionary threshold @pkey{} @ + group-by + +Call @var{function} for each entry in @var{dictionary} for which the +edit-distance to @var{string} is below @var{threshold}. The entries are +reported in order of increasing edit-distance from @var{string}. + +The lambda-list of @var{function} has to be compatible with either +@t{(spelling word distance)} if @var{group-by} is @t{:entry} or +@t{(spelling distance)} if @var{group-by} is @t{:spelling}. @var{word} +is the matching @ref{Class @toppackage{}|word,word} object, +@var{spelling} is a string that is the spelling of @var{word} (which is +not obtainable from @var{word} itself) and @var{distance} is the +edit-distance between @var{spelling} and @var{string}. + +@var{string} is the query string and can be of any subtype of +@t{cl:string}. At the moment, the case of string is left untouched and +only dictionary entries that match the case are reported. + +@var{dictionary} is a dictionary. + +@var{threshold} is a non-negative integer which controls how similar to +@var{string} in terms of edit-distance the reported entries have to be. + +@var{group-by} controls whether word objects are included in calls of +@var{function}. If @var{group-by} is @t{:entry}, @var{function} is +called with three arguments: spelling, word and distance. If +@var{group-by} is @t{:spelling}, @var{function} is called with two +arguments, spelling and distance. The default is @t{:entry}. Note that +the value of @var{group-by}r influences the number of times +@var{function} is called since a spelling that corresponds to multiple +entries will be reported once for @t{:spelling} but multiple times for +@t{:entry}. + +Example: +@lisp +(spell:map-similar + (lambda (spelling word distance) + (format t "~D ~10S ~A~%" distance spelling (class-name (class-of word)))) + "lisp" spell::*english-dictionary* 1) +@print{} 0 "lisp" EXPLICIT-BASE-NOUN +@print{} 0 "lisp" EXPLICIT-BASE-VERB +@print{} 1 "limp" EXPLICIT-BASE-NOUN +@print{} 1 "limp" EXPLICIT-BASE-ADJECTIVE +@print{} 1 "limp" EXPLICIT-BASE-VERB +@print{} 1 "lip" EXPLICIT-BASE-NOUN +@print{} 1 "lip" EXPLICIT-BASE-VERB +@print{} 1 "lisps" EXPLICIT-BASE-NOUN +@print{} 1 "lisps" EXPLICIT-BASE-VERB +@print{} 1 "list" EXPLICIT-BASE-NOUN +@print{} 1 "list" EXPLICIT-BASE-VERB +@print{} 1 "wisp" EXPLICIT-BASE-NOUN +@print{} 1 "wisp" EXPLICIT-BASE-VERB +@result{} NIL +@end lisp + +With @t{:group-by :spelling} +@lisp +(spell:map-similar + (lambda (spelling distance) + (format t "~D ~S~%" distance spelling)) + "lisp" spell::*english-dictionary* 1 :group-by :spelling) +@print{} 0 "lisp" +@print{} 1 "lisps" +@print{} 1 "list" +@print{} 1 "lip" +@print{} 1 "limp" +@print{} 1 "wisp" +@result{} NIL +@end lisp +@end deffn + +@defgena{map-corrections,@toppackage{}} function string dictionary threshold @ + @pkey{} variants group-by + +Call @var{function} for each entry in @var{dictionary} for which the +edit-distance to a @emph{variant} of @var{string} is below +@var{threshold}. The entries are reported in order of increasing +edit-distance from variants of @var{string}. + +The lambda-list of @var{function} has to be compatible with either +@t{(spelling word distance)} if @var{group-by} is @t{:entry} or +@t{(spelling distance)} if @var{group-by} is @t{:spelling}. @var{word} +is the matching @ref{Class @toppackage{}|word,word} object, @var{spelling} is a +string that is the spelling of @var{word} (which is not obtainable from +@var{word} itself) and @var{distance} is the edit-distance between +@var{spelling} and one variant of @var{string}. + +@var{string} is the query string and can be of any subtype of +@t{cl:string}. + +@var{dictionary} is a dictionary. + +@var{threshold} is a non-negative integer which controls how similar to +a variant of @var{string} in terms of edit-distance the reported entries +have to be. + +@var{variants} controls which variants in terms of capitalization and +case of @var{string} should be considered. If supplied, the value of +@var{variants} has to be a function the lambda-list of which has to +compatible with @t{(continuation string)} where @var{string} is the +@var{string} mentioned above and @var{continuation} is a function that +should be called once for each case-modified variant of @var{string}. +Note that capitalization and case information should ideally be +contained in dictionary entries. The implementation of that improvement +would make this parameter unnecessary. + +@var{group-by} controls whether word objects are included in calls of +@var{function}. If @var{group-by} is @t{:entry}, @var{function} is +called with three arguments, spelling, word and distance. If +@var{group-by} is @t{:spelling}, @var{function} is called with two +arguments, spelling and distance. The default is @t{:entry}. Note that +the value of @var{group-by} influences the number of times +@var{function} is called since a spelling that corresponds to multiple +entries will be reported once for @t{:spelling} but multiple times for +@t{:entry}. + +Examples: +@lisp +(spell:map-corrections + (lambda (spelling word distance) + (format t "~D ~10S ~A~%" distance spelling (class-name (class-of word)))) + "lisp" spell::*english-dictionary* 1) +@print{} 0 "lisp" EXPLICIT-BASE-NOUN +@print{} 0 "lisp" EXPLICIT-BASE-VERB +@print{} 1 "lisps" EXPLICIT-BASE-NOUN +@print{} 1 "lisps" EXPLICIT-BASE-VERB +@print{} 1 "list" EXPLICIT-BASE-NOUN +@print{} 1 "list" EXPLICIT-BASE-VERB +@print{} 1 "lip" EXPLICIT-BASE-NOUN +@print{} 1 "lip" EXPLICIT-BASE-VERB +@print{} 1 "limp" EXPLICIT-BASE-NOUN +@print{} 1 "limp" EXPLICIT-BASE-ADJECTIVE +@print{} 1 "limp" EXPLICIT-BASE-VERB +@print{} 1 "wisp" EXPLICIT-BASE-NOUN +@print{} 1 "wisp" EXPLICIT-BASE-VERB +@result{} NIL + +(spell:map-corrections + (lambda (spelling word distance) + (format t "~D ~10S ~A~%" distance spelling (class-name (class-of word)))) + "lifp" spell::*english-dictionary* 1) +@print{} 1 "lift" EXPLICIT-BASE-NOUN +@print{} 1 "lift" EXPLICIT-BASE-VERB +@print{} 1 "life" EXPLICIT-BASE-NOUN +@print{} 1 "lip" EXPLICIT-BASE-NOUN +@print{} 1 "lip" EXPLICIT-BASE-VERB +@print{} 1 "lisp" EXPLICIT-BASE-NOUN +@print{} 1 "lisp" EXPLICIT-BASE-VERB +@print{} 1 "limp" EXPLICIT-BASE-NOUN +@print{} 1 "limp" EXPLICIT-BASE-ADJECTIVE +@print{} 1 "limp" EXPLICIT-BASE-VERB +@result{} NIL +@end lisp +@end deffn + +@defgena{corrections,@toppackage{}} string dictionary threshold @ + @pkey{} variants group-by count + +Return corrections from @var{dictionary} that are within @var{threshold} +of the misspelled word @var{string}. Return two values: a list of +corrections and a Boolean which indicates whether @var{string} is +spelled correctly according to @var{dictionary}. + +The list of entries which is returned as the first value is ordered +according to increasing edit-distance between @var{string} and the +respective entry. Depending on @var{group-by} (see below), entries in +the list are either strings that are spellings of entries in +@var{dictionary} or pairs of the form @t{(@var{word} . @var{spelling})}. +If the second return value is true, which indicates that @var{string} is +spelled correctly, the first return value includes one or more entries +which correspond to @var{string}. + +@var{dictionary} is a dictionary. + +@var{threshold} is a non-negative integer which controls how similar to +a variant of @var{string} in terms of edit-distance the reported entries +have to be. + +@var{variants} controls which variants in terms of capitalization and +case of @var{string} should be considered. If supplied, the value of +@var{variants} has to be a function the lambda-list of which has to +compatible with @t{(continuation string)} where @var{string} is the +@var{string} mentioned above and @var{continuation} is a function that +should be called once for each case-modified variant of @var{string}. +Note that capitalization and case information should ideally be +contained in dictionary entries. The implementation of that improvement +would make this parameter unnecessary. + +@var{group-by} controls whether word objects are included in elements of +the returned list. If @var{group-by} is @t{:entry}, elements are of the +form @t{(@var{word} . @var{spelling})}. If @var{group-by} is +@t{:spelling}, elements are strings and each string is one spelling. +The default is @t{:entry}. Note that the value of @var{group-by} +influences the number returned elements since a spelling that +corresponds to multiple entries will be reported once for @t{:spelling} +but multiple times for @t{:entry}. + +If supplied, @var{count} limits the maximum number of entries that are +returned as the first value. + +Examples: +@lisp +(spell:corrections "lisp" spell::*english-dictionary* 0 :group-by :spelling) +@result{} ("lisp") T + +(spell:corrections "lifp" spell::*english-dictionary* 1 :group-by :spelling) +@result{} ("lift" "life" "lip" "lisp" "limp") NIL + +(spell:corrections "lisp" spell::*english-dictionary* 2 :group-by :spelling :count 3) +@result{} ("lisp" "lisps" "list") T +@end lisp +@end deffn + @defgena{insert,@toppackage{}} word string dictionary Insert @var{word} into @var{dictionary} as an entry for @var{string}. diff --git a/spell.asd b/spell.asd index db95259..d824cf9 100644 --- a/spell.asd +++ b/spell.asd @@ -28,7 +28,9 @@ (:file "compact-trie") (:file "shared-trie") ;; Dictionary - (:file "dictionary"))) + (:file "dictionary") + ;; Similar + (:file "similar"))) (:module "english-dictionary-data" :pathname "data" @@ -57,6 +59,7 @@ :serial t :components ((:file "package") (:file "utilities") + (:file "similar") (:file "dictionary") (:file "english")))) diff --git a/test/english.lisp b/test/english.lisp index 1a49d95..58d647e 100644 --- a/test/english.lisp +++ b/test/english.lisp @@ -3,6 +3,8 @@ (fiveam:def-suite* :spell.english :in :spell) +;;; Tests for `english-lookup' + (test english.non-existing-word "Test dictionary lookup with strings that are not existing words." (is (null (spell:english-lookup "no-such-word"))) @@ -91,3 +93,75 @@ dictionary." (is-true (spell:lookup word1 dictionary)) (is-true (spell:lookup word2 dictionary))))) dictionary))) + +;;; Tests for `english-corrections' + +(test english-corrections/empty + "Smoke test for `english-corrections' with the empty word." + (is (equal (values '() nil) (spell:english-corrections "" :threshold 0))) + (multiple-value-bind (corrections found?) + (spell:english-corrections "" :threshold 1) + (mapc (lambda (suggestion) (is (= 1 (length suggestion)))) corrections) + (is-false found?))) + +(test english-corrections/variants + "Smoke test for `english-corrections' with just spelling and variants." + ;; Threshold 0 for invalid and valid word. + (flet ((one-variant (string) + (is (equal (values '() nil) + (spell:english-corrections string :threshold 0))))) + (one-variant "circumstanzes") + (one-variant "Circumstanzes") + (one-variant "CIRCUMSTANZES")) + (flet ((one-variant (string) + (is (equal (values '("circumstances") t) + (spell:english-corrections string :threshold 0))))) + (one-variant "circumstances") + (one-variant "Circumstances") + (one-variant "CIRCUMSTANCES")) + ;; Threshold 1 for invalid and valid word. + (flet ((one-variant (string) + (multiple-value-bind (corrections found?) + (spell:english-corrections string :threshold 1) + (is (set-equal/string '("circumstances") corrections)) + (is-false found?)))) + (one-variant "circumstanzes") + (one-variant "Circumstanzes") + (one-variant "CIRCUMSTANZES")) + (flet ((one-variant (string) + (multiple-value-bind (corrections found?) + (spell:english-corrections string :threshold 1) + (is (set-equal/string '("circumstances" "circumstance" + "circumstances'" "circumstance's") + corrections)) + (is-true found?)))) + (one-variant "circumstances") + (one-variant "Circumstances") + (one-variant "CIRCUMSTANCES"))) + +(test english-corrections/word + "Smoke test for `english-corrections' with spelling and word." + (let* ((results (spell:english-corrections "circumstanzes" :threshold 1 + :group-by :entry)) + (result (first results))) + (is (= 1 (length results))) + (destructuring-bind (word . spelling) result + (is (string= "circumstances" spelling)) + (is-true (typep word 'spell::noun)) + (is (string= "circumstance" (spell:base word))) + (is (eql :plural (spell:number word))) + (is (eql nil (spell:case word))) + (is (eql nil (spell:gender word)))))) + +(test english-corrections/threshold-2 + "Smoke test for `english-corrections' with threshold 2." + (is (set-equal/string + #1='("circumstances" "circumstance" "circumstances'" "circumstance's") + (spell:english-corrections "circumstanzes" :threshold 2))) + ;; Requesting fewer suggestions must result in a subset of the + ;; requested size. + (let* ((results (spell:english-corrections "circumstanzes" :threshold 2 + :count 2)) + (intersection (intersection #1# results :test #'string=))) + (is (= 2 (length results))) + (is (set-equal/string results intersection)))) diff --git a/test/similar.lisp b/test/similar.lisp new file mode 100644 index 0000000..dbb241c --- /dev/null +++ b/test/similar.lisp @@ -0,0 +1,52 @@ +(cl:in-package #:spell.test) + +(fiveam:in-suite :spell) + +(test map-similar.entry + "Smoke test for the `map-similar' function with \"entry results\"." + (flet ((expected-entry (spelling class distance) + (let ((word (find-if (a:of-type class) (spell:english-lookup spelling)))) + (list spelling word distance))) + (collect (string threshold &rest args) + (let ((results '())) + (flet ((collect (spelling entry distance) + (push (list spelling entry distance) results))) + (apply #'spell:map-similar + #'collect string spell::*english-dictionary* threshold + args)) + results))) + (is (set-equal/entry (list (expected-entry "pitchfork" 'spell::verb 1) + (expected-entry "pitchfork" 'spell::noun 1)) + (collect "pitchfort" 1))) + (is (set-equal/entry (list (expected-entry "pitchfork" 'spell::verb 0) + (expected-entry "pitchforks" 'spell::verb 1) + (expected-entry "pitchfork" 'spell::noun 0) + (expected-entry "pitchforks" 'spell::noun 1)) + (collect "pitchfork" 1))) + ;; The following query string is specifically chosen to trigger a + ;; somewhat uncommon code path in `map-similar'. + (is (set-equal/entry (list (expected-entry "willful" 'spell::adjective 0) + (expected-entry "wilful" 'spell::adjective 1)) + (collect "willful" 1))))) + +(test map-similar.spelling + "Smoke test for the `map-similar' function with \"spelling results\"." + (flet ((collect (string threshold &rest args) + (let ((results '())) + (flet ((collect (spelling distance) + (push (list spelling distance) results))) + (apply #'spell:map-similar + #'collect string spell::*english-dictionary* threshold + :group-by :spelling args)) + results))) + (is (set-equal/equal '(("pitchfork" 1)) + (collect "pitchfort" 1))) + (is (set-equal/equal '(("pitchfork" 0) ("pitchforks" 1)) + (collect "pitchfork" 1))) + ;; The following query string is specifically chosen to trigger a + ;; somewhat uncommon code path in `%map-similar'. + (is (set-equal/equal '(("banded" 0) ("bandied" 1) ("banted" 1) + ("banned" 1) ("banked" 1) ("banged" 1) + ("barded" 1) ("branded" 1) ("bonded" 1) + ("sanded" 1) ("landed" 1) ("handed" 1)) + (collect "banded" 1))))) diff --git a/test/utilities.lisp b/test/utilities.lisp index fdbd88d..65dd6b7 100644 --- a/test/utilities.lisp +++ b/test/utilities.lisp @@ -28,3 +28,23 @@ :for function = (find-symbol (symbol-name key) (find-package '#:spell)) :always (equal (funcall function result) value)))) + +(defun set-equal/equal (a b) + (a:set-equal a b :test #'equal)) + +(defun set-equal/string (a b) + (a:set-equal a b :test #'string=)) + +(defun entry-equal (entry1 entry2) + (destructuring-bind (spelling1 word1 distance1) entry1 + (destructuring-bind (spelling2 word2 distance2) entry2 + (and (string= spelling1 spelling2) + (eq (class-of word1) (class-of word2)) + (loop :for slot :in (spell::bitfield-slots (class-of word1)) + :for name = (c2mop:slot-definition-name slot) + :always (eql (slot-value word1 name) + (slot-value word2 name))) + (eql distance1 distance2))))) + +(defun set-equal/entry (set1 set2) + (a:set-equal set1 set2 :test #'entry-equal))