diff --git a/.gitignore b/.gitignore deleted file mode 100644 index aae886d..0000000 --- a/.gitignore +++ /dev/null @@ -1 +0,0 @@ -/recipes/ diff --git a/recipe-search.scm b/recipe-search.scm deleted file mode 100644 index c19935a..0000000 --- a/recipe-search.scm +++ /dev/null @@ -1,110 +0,0 @@ -(define-module (recipe-search) - #:use-module (ice-9 ftw) - #:use-module (ice-9 textual-ports) - #:use-module (ice-9 vlist) - #:use-module (srfi srfi-1) ; lists - #:use-module (srfi srfi-9) ; records - #:use-module (srfi srfi-26) ; cut/cute - #:use-module (srfi srfi-69) ; hash tables - #:use-module (srfi srfi-71) ; extended let - #:use-module (text-utils)) - -(define (get-tokens s) - (map (compose porter-stem remove-diacritics string-downcase) - (string-tokenize s char-set:letter+digit))) - -;; The term index is a map from tokens to a list of document ids -;; containing that token. This is used for retrieving documents contaning -;; the search term. - -(define term-index (make-hash-table)) - -(define (add-to-term-index! doc-id terms) - (for-each (lambda (t) - (hash-table-set! term-index t (cons doc-id (hash-table-ref/default term-index t '())))) - (delete-duplicates terms))) - -(define (retrieve-doc-ids search-terms) - (delete-duplicates (append-map (cut hash-table-ref/default term-index <> '()) - (remove stop-word? search-terms)))) - -(define (term-frequencies terms) - (let loop ((terms (sort terms stringvhash accum) - (let ((this more (span (cute string=? <> (first terms)) terms))) - (loop more (acons (car this) (length this) accum)))))) - -(define-record-type - (make-document id length term-frequencies) - document? - (id document-id) - (length document-length) - (term-frequencies document-term-frequencies)) - -;; Document index is a map from document id to a record. -(define document-index (make-hash-table)) - -;; Track number of recipes indexed -(define num-recipes 0) - -;; Track total number of tokens (used to compute average doc length) -(define num-tokens 0) - -(define (add-to-document-index! doc-id terms) - (hash-table-set! document-index doc-id (make-document doc-id (length terms) (term-frequencies terms)))) - -(define (index-recipe path) - (let ((doc-id (basename path)) - (tokens (get-tokens (call-with-input-file path get-string-all)))) - (add-to-document-index! doc-id tokens) - (add-to-term-index! doc-id tokens) - (set! num-recipes (1+ num-recipes)) - (set! num-tokens (+ num-tokens (length tokens))))) - -(define (index-recipes dir) - (for-each (lambda (filename) - (index-recipe (string-append dir "/" filename))) - (scandir dir (cut string-suffix? ".txt" <>)))) - -;; Compute BM25 score for the given document and search terms -;; -;; This is based on the description of the algorithm in https://emschwartz.me/understanding-the-bm25-full-text-search-algorithm/ -;; - -;; k is a tuning parameter that controls how quickly the returns to term repetition diminish -(define k 1.2) - -;; b is a tuning parameter that controls how much we nomalize by document length -(define b 0.75) - -(define (bm25-score doc-id search-terms) - (let* ((doc (hash-table-ref document-index doc-id)) - (N num-recipes) - (n (lambda (q) (length (hash-table-ref/default term-index q '())))) - (f (lambda (q) (or (and=> (vhash-assoc q (document-term-frequencies doc)) cdr) 0))) - (|D| (document-length doc)) - (avgdl (/ num-tokens N))) - (fold + 0.0 (map (lambda (q) - (* (log (+ 1.0 (/ (+ N (- (n q)) 0.5) - (+ (n q) 0.5)))) - (/ (* (f q) (+ k 1.0)) - (* (+ (f q) k) (+ 1.0 (- b) (/ (* b |D|) avgdl)))))) - search-terms)))) - -(define* (find-recipes search-str #:key (num-results 5)) - (let* ((search-terms (delete-duplicates (get-tokens search-str))) - (doc-ids (retrieve-doc-ids search-terms))) - (when (zero? (length doc-ids)) - (error "No results found. Please try a different search term")) - (take - (sort - (map (lambda (doc-id) - (cons doc-id (bm25-score doc-id search-terms))) - doc-ids) - (lambda (x y) (> (cdr x) (cdr y)))) - (min num-results (length doc-ids))))) - -(define (main args) - (index-recipes "./recipes") - (find-recipes "broccoli and stilton soup")) diff --git a/text-utils.scm b/text-utils.scm deleted file mode 100644 index 60e4632..0000000 --- a/text-utils.scm +++ /dev/null @@ -1,339 +0,0 @@ -(define-module (text-utils) - #:use-module (srfi srfi-1) ; lists - #:use-module (srfi srfi-9) ; records - #:use-module (srfi srfi-26) ; cut/cute - #:use-module (srfi srfi-69) ; hash tables - #:use-module (rnrs unicode) - #:export (remove-diacritics - porter-stem - stop-word?)) - -(define (remove-diacritics str) - ;; Helper function for checking if a character is a combining mark - ;; (rnrs unicode) provides `char-general-category` which returns a symbol - ;; representing the character's general category. Combining marks generally - ;; fall under the 'Mn' (Mark, Nonspacing) category. - (define (char-combining-mark? char) - (eq? (char-general-category char) 'Mn)) - (let* ((nfd-str (string-normalize-nfd str)) ; Step 1: Normalize to NFD - (chars (string->list nfd-str))) ; Convert to a list of characters - (list->string - (filter (lambda (char) - (not (char-combining-mark? char))) ; Step 2: Filter out combining marks - chars)))) - -;; Porter stemming is a popular stemming algorithm used for English. -;; -;; For a complete description, see: -;; -;; Algorithm published in Porter, 1980, "An algorithm for suffix stripping," -;; _Program_, Vol. 14, no. 3, pp 130-137. -;; -;; This implementation by Peter Lane, https://codeberg.org/peterlane/r7rs-libs/src/branch/trunk/robin/text.sld -;; -;; An error is raised if the word does not consist entirely of ASCII characters. -;; This is because the Porter stemmer is designed for stemming English words. -(define (porter-stem initial-word) - ; - ; Data structure holds the word and information on where we are in - ; processing it. - ; - ; b: array of character strings representing the word we are stemming - ; j: an alternate 'end point' in the array, used, e.g., after searching - ; to mark where a suffix begins - ; k: the end of the array - (define-record-type - (new-data b j k) - data? - (b b-get) - (j j-get j-set!) - (k k-get k-set!)) - ; - ; checks if there is a double consonant at position i, - ; with i being the second consonant's index - (define (double-consonant? data i) - (and (> i 0) - (char=? (list-ref (b-get data) i) - (list-ref (b-get data) (- i 1))) - (is-consonant? data i))) - ; - ; returns true if the suffix is found at the end of the current - ; set of bytes, with k holding the last index in bytes. - ; sets j to the end point, as if the suffix is removed. - (define (ends-with? data suffix) - (let ((suffix-array (string->list suffix)) - (suffix-length (string-length suffix))) - (if (and (<= suffix-length (k-get data)) - (equal? (take-right (take (b-get data) (+ 1 (k-get data))) suffix-length) - suffix-array)) - (begin - (j-set! data (- (k-get data) suffix-length)) - #t) - #f))) - ; - ; Checks if there is a consonant at position i - ; -- note special handling of 'y', which is treated - ; as a consonant at start of word or after a vowel. - (define (is-consonant? data i) - (cond ((member (list-ref (b-get data) i) - '(#\a #\e #\i #\o #\u)) - #f) - ((char=? (list-ref (b-get data) i) - #\y) - (or (= i 0) - (not (is-consonant? data (- i 1))))) - (else - #t))) - ; - ; cvc is true if i-2, i-1, i has form consonant, vowel, consonant - ; (and last must not be w, x, or y) - (define (is-cvc? data i) - (cond ((or (< i 2) - (not (is-consonant? data i)) - (is-consonant? data (- i 1)) - (not (is-consonant? data (- i 2)))) - #f) - ((let ((ch (list-ref (b-get data) i))) - (or (char=? ch #\w) - (char=? ch #\x) - (char=? ch #\y))) - #f) - (else - #t))) - ; - ; Given a set of letter=>[(suffix, replacement)] pairs and a letter to match, - ; applies replacements for the given letter - (define (make-replacements data letter replacements) - (for-each (lambda (rule) - (when (char=? letter (car rule)) - (for-each (lambda (pairing) - (when (ends-with? data (car pairing)) - (set-to data (cadr pairing) #t))) - (cdr rule)))) - replacements)) - ; - ; counts the number of vowel-consonant sequences between 0 and j - (define (seq-count data) - (let ((count 0) - (i 0)) - ; find index of first vowel, by skipping consonants - (let loop () - (when (and (<= i (j-get data)) - (is-consonant? data i)) - (set! i (+ 1 i)) - (loop))) - ; repeat CV search to get full count until all of range is seen - (let loop () - (when (<= i (j-get data)) - ; find index of next consonant, by skipping non-consonants - (let loop-consonant () - (when (and (<= i (j-get data)) - (not (is-consonant? data i))) - (set! i (+ 1 i)) - (loop-consonant))) - ; VC pair found with more to come, so increment count - (when (<= i (j-get data)) - (set! count (+ 1 count))) - ; find index of next vowel, by skipping consonants - (let loop-vowel () - (when (and (<= i (j-get data)) - (is-consonant? data i)) - (set! i (+ 1 i)) - (loop-vowel))) - (loop))) - ; - count)) - ; - ; sets positions j+1 to k to the suffix, - ; adjusting k to j+length(suffix) (the new end of the word) - ; -- supports a flag for only allowing change if seq_count > 0 - ; this prevents words becoming too small when suffix changed - (define set-to - (case-lambda - ((data suffix) - (set-to data suffix #f)) - ((data suffix restrict) - (when (or (not restrict) - (> (seq-count data) 0)) - (for-each (lambda (index value) - (list-set! (b-get data) (+ 1 (j-get data) index) value)) - (iota (string-length suffix)) - (string->list suffix)) - (k-set! data (+ (j-get data) - (string-length suffix))))))) - ; - ; looks for a vowel in range 0 to j in bytes - (define (vowel-in-stem? data) - (any (lambda (i) (not (is-consonant? data i))) - (iota (+ 1 (j-get data))))) - ; - (define STEP-2-RULES (list (cons #\a '(("ational" "ate") ("tional" "tion"))) - (cons #\c '(("enci" "ence") ("anci" "ance"))) - (cons #\e '(("izer" "ize"))) - (cons #\g '(("logi" "log"))) - (cons #\l '(("bli" "ble") ("alli" "al") ("entli" "ent") ("eli" "e") ("ousli" "ous"))) - (cons #\o '(("ization" "ize") ("ation" "ate") ("ator" "ate"))) - (cons #\s '(("alism" "al") ("iveness" "ive") ("fulness" "ful") ("ousness" "ous"))) - (cons #\t '(("aliti" "al") ("iviti" "ive") ("biliti" "ble"))))) - ; - (define STEP-3-RULES (list (cons #\e '(("icate" "ic") ("ative" "") ("alize" "al"))) - (cons #\i '(("iciti" "ic"))) - (cons #\l '(("ical" "ic") ("ful" ""))) - (cons #\s '(("ness" ""))))) - ; - ; removes plurals and -ed -ing endings - (define (step-1ab data) - (when (char=? (list-ref (b-get data) (k-get data)) #\s) - (cond ((ends-with? data "sses") - (k-set! data (- (k-get data) 2))) - ((ends-with? data "ies") - (set-to data "i")) - ((not (char=? (list-ref (b-get data) (- (k-get data) 1)) - #\s)) - (k-set! data (- (k-get data) 1))))) - (if (ends-with? data "eed") - (when (> (seq-count data) 0) - (k-set! data (- (k-get data) 1))) - (when (and (or (ends-with? data "ed") - (ends-with? data "ing")) - (vowel-in-stem? data)) - (k-set! data (j-get data)) - (cond ((ends-with? data "at") - (set-to data "ate")) - ((ends-with? data "bl") - (set-to data "ble")) - ((ends-with? data "iz") - (set-to data "ize")) - ((double-consonant? data (k-get data)) - (k-set! data (- (k-get data) 1)) - (let ((ch (list-ref (b-get data) (k-get data)))) - (when (or (char=? #\l ch) - (char=? #\s ch) - (char=? #\z ch)) - (k-set! data (+ (k-get data) 1))))) - ((and (= 1 (seq-count data)) - (is-cvc? data (k-get data))) - (set-to data "e")))))) - ; - ; turns terminal 'y' to 'i' when there is another vowel in stem - (define (step-1c data) - (when (and (ends-with? data "y") - (vowel-in-stem? data)) - (list-set! (b-get data) (k-get data) #\i))) - ; - ; double suffixes are mapped to single (shorter) ones - (define (step-2 data) - (make-replacements data - (list-ref (b-get data) (- (k-get data) 1)) - STEP-2-RULES)) - ; - ; deals with -ic -full -ness - (define (step-3 data) - (make-replacements data - (list-ref (b-get data) (k-get data)) - STEP-3-RULES)) - ; - ; removes -ant -ence when seq-count is 2 - (define (step-4 data) - (call-with-current-continuation - (lambda (return) - (let ((letter (list-ref (b-get data) (- (k-get data) 1)))) ; penultimate letter - (cond ((char=? letter #\a) - (when (not (ends-with? data "al")) - (return))) - ((char=? letter #\c) - (when (and (not (ends-with? data "ance")) - (not (ends-with? data "ence"))) - (return))) - ((char=? letter #\e) - (when (not (ends-with? data "er")) - (return))) - ((char=? letter #\i) - (when (not (ends-with? data "ic")) - (return))) - ((char=? letter #\l) - (when (and (not (ends-with? data "able")) - (not (ends-with? data "ible"))) - (return))) - ((char=? letter #\n) - (when (and (not (ends-with? data "ant")) - (not (ends-with? data "ement")) - (not (ends-with? data "ment")) - (not (ends-with? data "ent"))) - (return))) - ((char=? letter #\o) - (if (and (ends-with? data "ion") - (or (char=? (list-ref (b-get data) (j-get data)) #\s) - (char=? (list-ref (b-get data) (j-get data)) #\t))) - '() ; break - (when (not (ends-with? data "ou")) - (return)))) - ((char=? letter #\s) - (when (not (ends-with? data "ism")) - (return))) - ((char=? letter #\t) - (when (and (not (ends-with? data "ate")) - (not (ends-with? data "iti"))) - (return))) - ((char=? letter #\u) - (when (not (ends-with? data "ous")) - (return))) - ((char=? letter #\v) - (when (not (ends-with? data "ive")) - (return))) - ((char=? letter #\z) - (when (not (ends-with? data "ize")) - (return))) - (else - (return))) - (when (> (seq-count data) 1) - (k-set! data (j-get data))))))) - ; - ; if seq-count > 1, removes a final -e and changes -ll to -l - (define (step-5 data) - (j-set! data (k-get data)) - ; remove final -e - (when (char=? (list-ref (b-get data) (k-get data)) - #\e) - (let ((count (seq-count data))) - (when (or (> count 1) - (and (= count 1) - (not (is-cvc? data (- (k-get data) 1))))) - (k-set! data (- (k-get data) 1))))) - ; -ll to -l - (when (and (char=? (list-ref (b-get data) (k-get data)) - #\l) - (double-consonant? data (k-get data)) - (> (seq-count data) 1)) - (k-set! data (- (k-get data) 1)))) - ; - ; - main part of function - ; - (let ((word (string-downcase initial-word))) ; TODO - check ASCII - (if (< (string-length word) 3) ; do not process short words - word - (let ((data (new-data (string->list word) 0 (- (string-length word) 1)))) - (step-1ab data) - (when (> (k-get data) 0) - (step-1c data) - (step-2 data) - (step-3 data) - (step-4 data) - (step-5 data)) - (list->string (take (b-get data) (+ 1 (k-get data)))))))) - -(define stop-word? - (let ((h (make-hash-table))) - (for-each (cut hash-table-set! h <> #t) - (list "i" "me" "my" "myself" "we" "our" "ours" "ourselves" "you" "your" "yours" "yourself" "yourselves" - "he" "him" "his" "himself" "she" "her" "hers" "herself" "it" "its" "itself" "they" "them" "their" - "theirs" "themselves" "what" "which" "who" "whom" "this" "that" "these" "those" "am" "is" "are" - "was" "were" "be" "been" "being" "have" "has" "had" "having" "do" "does" "did" "doing" "a" "an" - "the" "and" "but" "if" "or" "because" "as" "until" "while" "of" "at" "by" "for" "with" "about" - "against" "between" "into" "through" "during" "before" "after" "above" "below" "to" "from" "up" - "down" "in" "out" "on" "off" "over" "under" "again" "further" "then" "once" "here" "there" - "when" "where" "why" "how" "all" "any" "both" "each" "few" "more" "most" "other" "some" "such" - "no" "nor" "not" "only" "own" "same" "so" "than" "too" "very" "s" "t" "d" "can" "will" "just" - "don" "should" "now")) - (lambda (w) - (hash-table-ref/default h w #f))))