From 68c165faced621378ce88957f3bf15f2769aecde Mon Sep 17 00:00:00 2001
From: Ray Miller
Date: Tue, 1 Jul 2025 18:01:08 +0100
Subject: [PATCH] First pass at a search solution.

---
 recipe-search.scm |  82 +++++++++++
 text-utils.scm    | 339 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 421 insertions(+)
 create mode 100644 recipe-search.scm
 create mode 100644 text-utils.scm

diff --git a/recipe-search.scm b/recipe-search.scm
new file mode 100644
index 0000000..acf905a
--- /dev/null
+++ b/recipe-search.scm
@@ -0,0 +1,82 @@
+(define-module (recipe-search)
+  #:use-module (ice-9 ftw)
+  #:use-module (ice-9 textual-ports)
+  #:use-module (srfi srfi-1)  ; lists
+  #:use-module (srfi srfi-26) ; cut/cute
+  #:use-module (srfi srfi-69) ; hash tables
+  #:use-module (text-utils))
+
+(define (get-tokens s)
+  (map porter-stem
+       (remove stop-word?
+               (map (compose remove-diacritics string-downcase)
+                    (string-tokenize s char-set:letter)))))
+
+(define (digrams xs)
+  (if (< (length xs) 2)
+      '()
+      (map list xs (cdr xs))))
+
+(define (trigrams xs)
+  (if (< (length xs) 3)
+      '()
+      (map list xs (cdr xs) (cddr xs))))
+
+(define stem-index (make-hash-table))
+(define digram-index (make-hash-table))
+(define trigram-index (make-hash-table))
+
+(define (add-to-index! ix term val)
+  (let ((curr-val (hash-table-ref/default ix term '())))
+    (hash-table-set! ix term (cons val curr-val))))
+
+(define (frequencies xs)
+  (let ((result (make-hash-table)))
+    (for-each (lambda (x) (hash-table-set! result x (1+ (hash-table-ref/default result x 0)))) xs)
+    (hash-table->alist result)))
+
+(define (sort-by-frequency xs)
+  (map car
+       (sort (frequencies xs)
+             (lambda (a b) (> (cdr a) (cdr b))))))
+
+(define (search-index ix xs)
+  (sort-by-frequency (append-map (cut hash-table-ref/default ix <> '()) (delete-duplicates xs))))
+
+(define (search-stem-index tokens)
+  (search-index stem-index tokens))
+
+(define (search-digram-index tokens)
+  (search-index digram-index (digrams tokens)))
+
+(define (search-trigram-index tokens)
+  (search-index trigram-index (trigrams tokens)))
+
+(define (load-recipe path)
+  (let ((name (basename path))
+        (tokens (get-tokens (call-with-input-file path get-string-all))))
+    (for-each (cut add-to-index! stem-index <> name) (delete-duplicates tokens))
+    (for-each (cut add-to-index! digram-index <> name) (delete-duplicates (digrams tokens)))
+    (for-each (cut add-to-index! trigram-index <> name) (delete-duplicates (trigrams tokens)))))
+
+(define (load-recipes dir)
+  (for-each (lambda (filename)
+              (load-recipe (string-append dir "/" filename)))
+            (scandir dir (cut string-suffix? ".txt" <>))))
+
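+;; Illustrative sketch (comments only, not exercised by the module): the
+;; n-gram helpers above turn a token list into overlapping pairs/triples,
+;; e.g.
+;;
+;;   (digrams  '("broccoli" "stilton" "soup"))
+;;   => (("broccoli" "stilton") ("stilton" "soup"))
+;;   (trigrams '("broccoli" "stilton" "soup"))
+;;   => (("broccoli" "stilton" "soup"))
+;;
+;; find-recipes below queries the most specific index first (trigrams) and
+;; only falls back to digrams, and then to single stems, when too few
+;; matches are found.
+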
+(define* (find-recipes search-term #:key (num-results 5))
+  (let ((tokens (get-tokens search-term)))
+    (when (null? tokens)
+      (error "Please try a more specific search term"))
+    (let ((result (search-trigram-index tokens)))
+      (if (>= (length result) num-results)
+          (take result num-results)
+          (let ((result (delete-duplicates (append result (search-digram-index tokens)))))
+            (if (>= (length result) num-results)
+                (take result num-results)
+                (let ((result (delete-duplicates (append result (search-stem-index tokens)))))
+                  (take result (min (length result) num-results)))))))))
+
+(define (main args)
+  (load-recipes "./recipes")
+  (for-each (lambda (recipe) (display recipe) (newline))
+            (find-recipes "broccoli and stilton soup")))
diff --git a/text-utils.scm b/text-utils.scm
new file mode 100644
index 0000000..60e4632
--- /dev/null
+++ b/text-utils.scm
@@ -0,0 +1,339 @@
+(define-module (text-utils)
+  #:use-module (srfi srfi-1)  ; lists
+  #:use-module (srfi srfi-9)  ; records
+  #:use-module (srfi srfi-26) ; cut/cute
+  #:use-module (srfi srfi-69) ; hash tables
+  #:use-module (rnrs unicode)
+  #:export (remove-diacritics
+            porter-stem
+            stop-word?))
+
+(define (remove-diacritics str)
+  ;; Helper function for checking if a character is a combining mark.
+  ;; (rnrs unicode) provides `char-general-category` which returns a symbol
+  ;; representing the character's general category. Combining marks generally
+  ;; fall under the 'Mn' (Mark, Nonspacing) category.
+  (define (char-combining-mark? char)
+    (eq? (char-general-category char) 'Mn))
+  (let* ((nfd-str (string-normalize-nfd str)) ; Step 1: Normalize to NFD
+         (chars (string->list nfd-str)))      ; Convert to a list of characters
+    (list->string
+     (filter (lambda (char)
+               (not (char-combining-mark? char))) ; Step 2: Filter out combining marks
+             chars))))
+
+;; Porter stemming is a popular stemming algorithm used for English.
+;;
+;; For a complete description, see:
+;;
+;; Porter, 1980, "An algorithm for suffix stripping,"
+;; _Program_, Vol. 14, no. 3, pp 130-137.
+;;
+;; This implementation by Peter Lane, https://codeberg.org/peterlane/r7rs-libs/src/branch/trunk/robin/text.sld
+;;
+;; The stemmer is designed for English words, so the input is expected to be
+;; plain ASCII; this is not currently checked (see the TODO in the main body
+;; below).
+(define (porter-stem initial-word)
+  ;
+  ; Data structure holds the word and information on where we are in
+  ; processing it.
+  ;
+  ; b: list of the characters of the word we are stemming
+  ; j: an alternate 'end point' in the list, used, e.g., after searching
+  ;    to mark where a suffix begins
+  ; k: the end of the list
+  (define-record-type <data>
+    (new-data b j k)
+    data?
+    (b b-get)
+    (j j-get j-set!)
+    (k k-get k-set!))
+  ;
+  ; checks if there is a double consonant at position i,
+  ; with i being the second consonant's index
+  (define (double-consonant? data i)
+    (and (> i 0)
+         (char=? (list-ref (b-get data) i)
+                 (list-ref (b-get data) (- i 1)))
+         (is-consonant? data i)))
+  ;
+  ; returns true if the suffix is found at the end of the current
+  ; list of characters, with k holding the last index.
+  ; sets j to the end point, as if the suffix is removed.
+  (define (ends-with? data suffix)
+    (let ((suffix-array (string->list suffix))
+          (suffix-length (string-length suffix)))
+      (if (and (<= suffix-length (k-get data))
+               (equal? (take-right (take (b-get data) (+ 1 (k-get data))) suffix-length)
+                       suffix-array))
+          (begin
+            (j-set! data (- (k-get data) suffix-length))
+            #t)
+          #f)))
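+  ;
+  ; Illustrative example (comment only): with the word "hopping" in `data`,
+  ; (ends-with? data "ing") returns #t and, as a side effect, points j at
+  ; the second "p", so a subsequent call to set-to (defined below) replaces
+  ; the "ing" suffix in place.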
+  ;
+  ; Checks if there is a consonant at position i
+  ; -- note special handling of 'y', which is treated
+  ;    as a consonant at start of word or after a vowel.
+  (define (is-consonant? data i)
+    (cond ((member (list-ref (b-get data) i)
+                   '(#\a #\e #\i #\o #\u))
+           #f)
+          ((char=? (list-ref (b-get data) i)
+                   #\y)
+           (or (= i 0)
+               (not (is-consonant? data (- i 1)))))
+          (else
+           #t)))
+  ;
+  ; cvc is true if i-2, i-1, i has form consonant, vowel, consonant
+  ; (and last must not be w, x, or y)
+  (define (is-cvc? data i)
+    (cond ((or (< i 2)
+               (not (is-consonant? data i))
+               (is-consonant? data (- i 1))
+               (not (is-consonant? data (- i 2))))
+           #f)
+          ((let ((ch (list-ref (b-get data) i)))
+             (or (char=? ch #\w)
+                 (char=? ch #\x)
+                 (char=? ch #\y)))
+           #f)
+          (else
+           #t)))
+  ;
+  ; Given a set of letter=>[(suffix, replacement)] pairs and a letter to match,
+  ; applies replacements for the given letter
+  (define (make-replacements data letter replacements)
+    (for-each (lambda (rule)
+                (when (char=? letter (car rule))
+                  (for-each (lambda (pairing)
+                              (when (ends-with? data (car pairing))
+                                (set-to data (cadr pairing) #t)))
+                            (cdr rule))))
+              replacements))
+  ;
+  ; counts the number of vowel-consonant sequences between 0 and j
+  (define (seq-count data)
+    (let ((count 0)
+          (i 0))
+      ; find index of first vowel, by skipping consonants
+      (let loop ()
+        (when (and (<= i (j-get data))
+                   (is-consonant? data i))
+          (set! i (+ 1 i))
+          (loop)))
+      ; repeat CV search to get full count until all of range is seen
+      (let loop ()
+        (when (<= i (j-get data))
+          ; find index of next consonant, by skipping non-consonants
+          (let loop-consonant ()
+            (when (and (<= i (j-get data))
+                       (not (is-consonant? data i)))
+              (set! i (+ 1 i))
+              (loop-consonant)))
+          ; VC pair found with more to come, so increment count
+          (when (<= i (j-get data))
+            (set! count (+ 1 count)))
+          ; find index of next vowel, by skipping consonants
+          (let loop-vowel ()
+            (when (and (<= i (j-get data))
+                       (is-consonant? data i))
+              (set! i (+ 1 i))
+              (loop-vowel)))
+          (loop)))
+      ;
+      count))
+  ;
+  ; sets positions j+1 to k to the suffix,
+  ; adjusting k to j+length(suffix) (the new end of the word)
+  ; -- supports a flag for only allowing change if seq_count > 0
+  ;    this prevents words becoming too small when suffix changed
+  (define set-to
+    (case-lambda
+      ((data suffix)
+       (set-to data suffix #f))
+      ((data suffix restrict)
+       (when (or (not restrict)
+                 (> (seq-count data) 0))
+         (for-each (lambda (index value)
+                     (list-set! (b-get data) (+ 1 (j-get data) index) value))
+                   (iota (string-length suffix))
+                   (string->list suffix))
+         (k-set! data (+ (j-get data)
+                         (string-length suffix)))))))
+  ;
+  ; looks for a vowel in range 0 to j
+  (define (vowel-in-stem? data)
+    (any (lambda (i) (not (is-consonant? data i)))
+         (iota (+ 1 (j-get data)))))
+  ;
+  (define STEP-2-RULES (list (cons #\a '(("ational" "ate") ("tional" "tion")))
+                             (cons #\c '(("enci" "ence") ("anci" "ance")))
+                             (cons #\e '(("izer" "ize")))
+                             (cons #\g '(("logi" "log")))
+                             (cons #\l '(("bli" "ble") ("alli" "al") ("entli" "ent") ("eli" "e") ("ousli" "ous")))
+                             (cons #\o '(("ization" "ize") ("ation" "ate") ("ator" "ate")))
+                             (cons #\s '(("alism" "al") ("iveness" "ive") ("fulness" "ful") ("ousness" "ous")))
+                             (cons #\t '(("aliti" "al") ("iviti" "ive") ("biliti" "ble")))))
+  ;
+  (define STEP-3-RULES (list (cons #\e '(("icate" "ic") ("ative" "") ("alize" "al")))
+                             (cons #\i '(("iciti" "ic")))
+                             (cons #\l '(("ical" "ic") ("ful" "")))
+                             (cons #\s '(("ness" "")))))
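+  ;
+  ; Worked examples (comments only), taken from Porter's 1980 paper, of what
+  ; the steps below are expected to do:
+  ;   step 1a:  "caresses"   -> "caress",  "ponies" -> "poni"
+  ;   step 1b:  "hopping"    -> "hop",     "agreed" -> "agree"
+  ;   step 2:   "relational" -> "relate"
+  ;   step 4:   "adjustable" -> "adjust"  (only when enough of a stem remains)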
data "ies") + (set-to data "i")) + ((not (char=? (list-ref (b-get data) (- (k-get data) 1)) + #\s)) + (k-set! data (- (k-get data) 1))))) + (if (ends-with? data "eed") + (when (> (seq-count data) 0) + (k-set! data (- (k-get data) 1))) + (when (and (or (ends-with? data "ed") + (ends-with? data "ing")) + (vowel-in-stem? data)) + (k-set! data (j-get data)) + (cond ((ends-with? data "at") + (set-to data "ate")) + ((ends-with? data "bl") + (set-to data "ble")) + ((ends-with? data "iz") + (set-to data "ize")) + ((double-consonant? data (k-get data)) + (k-set! data (- (k-get data) 1)) + (let ((ch (list-ref (b-get data) (k-get data)))) + (when (or (char=? #\l ch) + (char=? #\s ch) + (char=? #\z ch)) + (k-set! data (+ (k-get data) 1))))) + ((and (= 1 (seq-count data)) + (is-cvc? data (k-get data))) + (set-to data "e")))))) + ; + ; turns terminal 'y' to 'i' when there is another vowel in stem + (define (step-1c data) + (when (and (ends-with? data "y") + (vowel-in-stem? data)) + (list-set! (b-get data) (k-get data) #\i))) + ; + ; double suffixes are mapped to single (shorter) ones + (define (step-2 data) + (make-replacements data + (list-ref (b-get data) (- (k-get data) 1)) + STEP-2-RULES)) + ; + ; deals with -ic -full -ness + (define (step-3 data) + (make-replacements data + (list-ref (b-get data) (k-get data)) + STEP-3-RULES)) + ; + ; removes -ant -ence when seq-count is 2 + (define (step-4 data) + (call-with-current-continuation + (lambda (return) + (let ((letter (list-ref (b-get data) (- (k-get data) 1)))) ; penultimate letter + (cond ((char=? letter #\a) + (when (not (ends-with? data "al")) + (return))) + ((char=? letter #\c) + (when (and (not (ends-with? data "ance")) + (not (ends-with? data "ence"))) + (return))) + ((char=? letter #\e) + (when (not (ends-with? data "er")) + (return))) + ((char=? letter #\i) + (when (not (ends-with? data "ic")) + (return))) + ((char=? letter #\l) + (when (and (not (ends-with? data "able")) + (not (ends-with? data "ible"))) + (return))) + ((char=? letter #\n) + (when (and (not (ends-with? data "ant")) + (not (ends-with? data "ement")) + (not (ends-with? data "ment")) + (not (ends-with? data "ent"))) + (return))) + ((char=? letter #\o) + (if (and (ends-with? data "ion") + (or (char=? (list-ref (b-get data) (j-get data)) #\s) + (char=? (list-ref (b-get data) (j-get data)) #\t))) + '() ; break + (when (not (ends-with? data "ou")) + (return)))) + ((char=? letter #\s) + (when (not (ends-with? data "ism")) + (return))) + ((char=? letter #\t) + (when (and (not (ends-with? data "ate")) + (not (ends-with? data "iti"))) + (return))) + ((char=? letter #\u) + (when (not (ends-with? data "ous")) + (return))) + ((char=? letter #\v) + (when (not (ends-with? data "ive")) + (return))) + ((char=? letter #\z) + (when (not (ends-with? data "ize")) + (return))) + (else + (return))) + (when (> (seq-count data) 1) + (k-set! data (j-get data))))))) + ; + ; if seq-count > 1, removes a final -e and changes -ll to -l + (define (step-5 data) + (j-set! data (k-get data)) + ; remove final -e + (when (char=? (list-ref (b-get data) (k-get data)) + #\e) + (let ((count (seq-count data))) + (when (or (> count 1) + (and (= count 1) + (not (is-cvc? data (- (k-get data) 1))))) + (k-set! data (- (k-get data) 1))))) + ; -ll to -l + (when (and (char=? (list-ref (b-get data) (k-get data)) + #\l) + (double-consonant? data (k-get data)) + (> (seq-count data) 1)) + (k-set! 
+  ;
+  ; - main part of function
+  ;
+  (let ((word (string-downcase initial-word))) ; TODO - check ASCII
+    (if (< (string-length word) 3) ; do not process short words
+        word
+        (let ((data (new-data (string->list word) 0 (- (string-length word) 1))))
+          (step-1ab data)
+          (when (> (k-get data) 0)
+            (step-1c data)
+            (step-2 data)
+            (step-3 data)
+            (step-4 data)
+            (step-5 data))
+          (list->string (take (b-get data) (+ 1 (k-get data))))))))
+
+(define stop-word?
+  (let ((h (make-hash-table)))
+    (for-each (cut hash-table-set! h <> #t)
+              (list "i" "me" "my" "myself" "we" "our" "ours" "ourselves" "you" "your" "yours" "yourself" "yourselves"
+                    "he" "him" "his" "himself" "she" "her" "hers" "herself" "it" "its" "itself" "they" "them" "their"
+                    "theirs" "themselves" "what" "which" "who" "whom" "this" "that" "these" "those" "am" "is" "are"
+                    "was" "were" "be" "been" "being" "have" "has" "had" "having" "do" "does" "did" "doing" "a" "an"
+                    "the" "and" "but" "if" "or" "because" "as" "until" "while" "of" "at" "by" "for" "with" "about"
+                    "against" "between" "into" "through" "during" "before" "after" "above" "below" "to" "from" "up"
+                    "down" "in" "out" "on" "off" "over" "under" "again" "further" "then" "once" "here" "there"
+                    "when" "where" "why" "how" "all" "any" "both" "each" "few" "more" "most" "other" "some" "such"
+                    "no" "nor" "not" "only" "own" "same" "so" "than" "too" "very" "s" "t" "d" "can" "will" "just"
+                    "don" "should" "now"))
+    (lambda (w)
+      (hash-table-ref/default h w #f))))
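+
+;; Usage sketch (illustrative only; these expressions are not evaluated here):
+;;   (remove-diacritics "crème brûlée") ; => "creme brulee"
+;;   (stop-word? "the")                 ; => #t
+;;   (porter-stem "caresses")           ; => "caress"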