Compare commits

...
Sign in to create a new pull request.

2 commits

Author SHA1 Message Date
Ray Miller
b8d3ae0f5f Ignore recipes folder 2025-07-03 13:58:54 +01:00
Ray Miller
68c165face First pass at a search solution. 2025-07-01 18:01:08 +01:00
3 changed files with 422 additions and 0 deletions

1
.gitignore vendored Normal file
View file

@ -0,0 +1 @@
/recipes/

82
recipe-search.scm Normal file
View file

@ -0,0 +1,82 @@
(define-module (recipe-search)
#:use-module (ice-9 ftw)
#:use-module (ice-9 textual-ports)
#:use-module (srfi srfi-1) ; lists
#:use-module (srfi srfi-26) ; cut/cute
#:use-module (srfi srfi-69) ; hash tables
#:use-module (text-utils))
(define (get-tokens s)
(map porter-stem
(remove stop-word?
(map (compose remove-diacritics string-downcase)
(string-tokenize s char-set:letter)))))
(define (digrams xs)
(if (< (length xs) 2)
'()
(map list xs (cdr xs))))
(define (trigrams xs)
(if (< (length xs) 3)
'()
(map list xs (cdr xs) (cddr xs))))
(define stem-index (make-hash-table))
(define digram-index (make-hash-table))
(define trigram-index (make-hash-table))
(define (add-to-index! ix term val)
(let ((curr-val (hash-table-ref/default ix term '())))
(hash-table-set! ix term (cons val curr-val))))
(define (frequencies xs)
(let ((result (make-hash-table)))
(for-each (lambda (x) (hash-table-set! result x (1+ (hash-table-ref/default result x 0)))) xs)
(hash-table->alist result)))
(define (sort-by-frequency xs)
(map car
(sort (frequencies xs)
(lambda (a b) (> (cdr a) (cdr b))))))
(define (search-index ix xs)
(sort-by-frequency (append-map (cut hash-table-ref/default ix <> '()) (delete-duplicates xs))))
(define (search-stem-index tokens)
(search-index stem-index tokens))
(define (search-digram-index tokens)
(search-index digram-index (digrams tokens)))
(define (search-trigram-index tokens)
(search-index trigram-index (trigrams tokens)))
(define (load-recipe path)
(let ((name (basename path))
(tokens (get-tokens (call-with-input-file path get-string-all))))
(for-each (cut add-to-index! stem-index <> name) (delete-duplicates tokens))
(for-each (cut add-to-index! digram-index <> name) (delete-duplicates (digrams tokens)))
(for-each (cut add-to-index! trigram-index <> name) (delete-duplicates (trigrams tokens)))))
(define (load-recipes dir)
(for-each (lambda (filename)
(load-recipe (string-append dir "/" filename)))
(scandir dir (cut string-suffix? ".txt" <>))))
(define* (find-recipes search-term #:key (num-results 5))
(let ((tokens (get-tokens search-term)))
(when (zero? (length tokens))
(error "Please try a more specific search term"))
(let ((result (search-trigram-index tokens)))
(if (>= (length result) num-results)
(take result num-results)
(let ((result (delete-duplicates (append result (search-digram-index tokens)))))
(if (>= (length result) num-results)
(take result num-results)
(let ((result (delete-duplicates (append result (search-stem-index tokens)))))
(take result (min (length result) num-results)))))))))
(define (main args)
(load-recipes "./recipes")
(find-recipes "broccoli and stilton soup"))

339
text-utils.scm Normal file
View file

@ -0,0 +1,339 @@
(define-module (text-utils)
#:use-module (srfi srfi-1) ; lists
#:use-module (srfi srfi-9) ; records
#:use-module (srfi srfi-26) ; cut/cute
#:use-module (srfi srfi-69) ; hash tables
#:use-module (rnrs unicode)
#:export (remove-diacritics
porter-stem
stop-word?))
(define (remove-diacritics str)
;; Helper function for checking if a character is a combining mark
;; (rnrs unicode) provides `char-general-category` which returns a symbol
;; representing the character's general category. Combining marks generally
;; fall under the 'Mn' (Mark, Nonspacing) category.
(define (char-combining-mark? char)
(eq? (char-general-category char) 'Mn))
(let* ((nfd-str (string-normalize-nfd str)) ; Step 1: Normalize to NFD
(chars (string->list nfd-str))) ; Convert to a list of characters
(list->string
(filter (lambda (char)
(not (char-combining-mark? char))) ; Step 2: Filter out combining marks
chars))))
;; Porter stemming is a popular stemming algorithm used for English.
;;
;; For a complete description, see: <https://tartarus.org/martin/PorterStemmer/>
;;
;; Algorithm published in Porter, 1980, "An algorithm for suffix stripping,"
;; _Program_, Vol. 14, no. 3, pp 130-137.
;;
;; This implementation by Peter Lane, https://codeberg.org/peterlane/r7rs-libs/src/branch/trunk/robin/text.sld
;;
;; An error is raised if the word does not consist entirely of ASCII characters.
;; This is because the Porter stemmer is designed for stemming English words.
(define (porter-stem initial-word)
;
; Data structure holds the word and information on where we are in
; processing it.
;
; b: array of character strings representing the word we are stemming
; j: an alternate 'end point' in the array, used, e.g., after searching
; to mark where a suffix begins
; k: the end of the array
(define-record-type <data>
(new-data b j k)
data?
(b b-get)
(j j-get j-set!)
(k k-get k-set!))
;
; checks if there is a double consonant at position i,
; with i being the second consonant's index
(define (double-consonant? data i)
(and (> i 0)
(char=? (list-ref (b-get data) i)
(list-ref (b-get data) (- i 1)))
(is-consonant? data i)))
;
; returns true if the suffix is found at the end of the current
; set of bytes, with k holding the last index in bytes.
; sets j to the end point, as if the suffix is removed.
(define (ends-with? data suffix)
(let ((suffix-array (string->list suffix))
(suffix-length (string-length suffix)))
(if (and (<= suffix-length (k-get data))
(equal? (take-right (take (b-get data) (+ 1 (k-get data))) suffix-length)
suffix-array))
(begin
(j-set! data (- (k-get data) suffix-length))
#t)
#f)))
;
; Checks if there is a consonant at position i
; -- note special handling of 'y', which is treated
; as a consonant at start of word or after a vowel.
(define (is-consonant? data i)
(cond ((member (list-ref (b-get data) i)
'(#\a #\e #\i #\o #\u))
#f)
((char=? (list-ref (b-get data) i)
#\y)
(or (= i 0)
(not (is-consonant? data (- i 1)))))
(else
#t)))
;
; cvc is true if i-2, i-1, i has form consonant, vowel, consonant
; (and last must not be w, x, or y)
(define (is-cvc? data i)
(cond ((or (< i 2)
(not (is-consonant? data i))
(is-consonant? data (- i 1))
(not (is-consonant? data (- i 2))))
#f)
((let ((ch (list-ref (b-get data) i)))
(or (char=? ch #\w)
(char=? ch #\x)
(char=? ch #\y)))
#f)
(else
#t)))
;
; Given a set of letter=>[(suffix, replacement)] pairs and a letter to match,
; applies replacements for the given letter
(define (make-replacements data letter replacements)
(for-each (lambda (rule)
(when (char=? letter (car rule))
(for-each (lambda (pairing)
(when (ends-with? data (car pairing))
(set-to data (cadr pairing) #t)))
(cdr rule))))
replacements))
;
; counts the number of vowel-consonant sequences between 0 and j
(define (seq-count data)
(let ((count 0)
(i 0))
; find index of first vowel, by skipping consonants
(let loop ()
(when (and (<= i (j-get data))
(is-consonant? data i))
(set! i (+ 1 i))
(loop)))
; repeat CV search to get full count until all of range is seen
(let loop ()
(when (<= i (j-get data))
; find index of next consonant, by skipping non-consonants
(let loop-consonant ()
(when (and (<= i (j-get data))
(not (is-consonant? data i)))
(set! i (+ 1 i))
(loop-consonant)))
; VC pair found with more to come, so increment count
(when (<= i (j-get data))
(set! count (+ 1 count)))
; find index of next vowel, by skipping consonants
(let loop-vowel ()
(when (and (<= i (j-get data))
(is-consonant? data i))
(set! i (+ 1 i))
(loop-vowel)))
(loop)))
;
count))
;
; sets positions j+1 to k to the suffix,
; adjusting k to j+length(suffix) (the new end of the word)
; -- supports a flag for only allowing change if seq_count > 0
; this prevents words becoming too small when suffix changed
(define set-to
(case-lambda
((data suffix)
(set-to data suffix #f))
((data suffix restrict)
(when (or (not restrict)
(> (seq-count data) 0))
(for-each (lambda (index value)
(list-set! (b-get data) (+ 1 (j-get data) index) value))
(iota (string-length suffix))
(string->list suffix))
(k-set! data (+ (j-get data)
(string-length suffix)))))))
;
; looks for a vowel in range 0 to j in bytes
(define (vowel-in-stem? data)
(any (lambda (i) (not (is-consonant? data i)))
(iota (+ 1 (j-get data)))))
;
(define STEP-2-RULES (list (cons #\a '(("ational" "ate") ("tional" "tion")))
(cons #\c '(("enci" "ence") ("anci" "ance")))
(cons #\e '(("izer" "ize")))
(cons #\g '(("logi" "log")))
(cons #\l '(("bli" "ble") ("alli" "al") ("entli" "ent") ("eli" "e") ("ousli" "ous")))
(cons #\o '(("ization" "ize") ("ation" "ate") ("ator" "ate")))
(cons #\s '(("alism" "al") ("iveness" "ive") ("fulness" "ful") ("ousness" "ous")))
(cons #\t '(("aliti" "al") ("iviti" "ive") ("biliti" "ble")))))
;
(define STEP-3-RULES (list (cons #\e '(("icate" "ic") ("ative" "") ("alize" "al")))
(cons #\i '(("iciti" "ic")))
(cons #\l '(("ical" "ic") ("ful" "")))
(cons #\s '(("ness" "")))))
;
; removes plurals and -ed -ing endings
(define (step-1ab data)
(when (char=? (list-ref (b-get data) (k-get data)) #\s)
(cond ((ends-with? data "sses")
(k-set! data (- (k-get data) 2)))
((ends-with? data "ies")
(set-to data "i"))
((not (char=? (list-ref (b-get data) (- (k-get data) 1))
#\s))
(k-set! data (- (k-get data) 1)))))
(if (ends-with? data "eed")
(when (> (seq-count data) 0)
(k-set! data (- (k-get data) 1)))
(when (and (or (ends-with? data "ed")
(ends-with? data "ing"))
(vowel-in-stem? data))
(k-set! data (j-get data))
(cond ((ends-with? data "at")
(set-to data "ate"))
((ends-with? data "bl")
(set-to data "ble"))
((ends-with? data "iz")
(set-to data "ize"))
((double-consonant? data (k-get data))
(k-set! data (- (k-get data) 1))
(let ((ch (list-ref (b-get data) (k-get data))))
(when (or (char=? #\l ch)
(char=? #\s ch)
(char=? #\z ch))
(k-set! data (+ (k-get data) 1)))))
((and (= 1 (seq-count data))
(is-cvc? data (k-get data)))
(set-to data "e"))))))
;
; turns terminal 'y' to 'i' when there is another vowel in stem
(define (step-1c data)
(when (and (ends-with? data "y")
(vowel-in-stem? data))
(list-set! (b-get data) (k-get data) #\i)))
;
; double suffixes are mapped to single (shorter) ones
(define (step-2 data)
(make-replacements data
(list-ref (b-get data) (- (k-get data) 1))
STEP-2-RULES))
;
; deals with -ic -full -ness
(define (step-3 data)
(make-replacements data
(list-ref (b-get data) (k-get data))
STEP-3-RULES))
;
; removes -ant -ence when seq-count is 2
(define (step-4 data)
(call-with-current-continuation
(lambda (return)
(let ((letter (list-ref (b-get data) (- (k-get data) 1)))) ; penultimate letter
(cond ((char=? letter #\a)
(when (not (ends-with? data "al"))
(return)))
((char=? letter #\c)
(when (and (not (ends-with? data "ance"))
(not (ends-with? data "ence")))
(return)))
((char=? letter #\e)
(when (not (ends-with? data "er"))
(return)))
((char=? letter #\i)
(when (not (ends-with? data "ic"))
(return)))
((char=? letter #\l)
(when (and (not (ends-with? data "able"))
(not (ends-with? data "ible")))
(return)))
((char=? letter #\n)
(when (and (not (ends-with? data "ant"))
(not (ends-with? data "ement"))
(not (ends-with? data "ment"))
(not (ends-with? data "ent")))
(return)))
((char=? letter #\o)
(if (and (ends-with? data "ion")
(or (char=? (list-ref (b-get data) (j-get data)) #\s)
(char=? (list-ref (b-get data) (j-get data)) #\t)))
'() ; break
(when (not (ends-with? data "ou"))
(return))))
((char=? letter #\s)
(when (not (ends-with? data "ism"))
(return)))
((char=? letter #\t)
(when (and (not (ends-with? data "ate"))
(not (ends-with? data "iti")))
(return)))
((char=? letter #\u)
(when (not (ends-with? data "ous"))
(return)))
((char=? letter #\v)
(when (not (ends-with? data "ive"))
(return)))
((char=? letter #\z)
(when (not (ends-with? data "ize"))
(return)))
(else
(return)))
(when (> (seq-count data) 1)
(k-set! data (j-get data)))))))
;
; if seq-count > 1, removes a final -e and changes -ll to -l
(define (step-5 data)
(j-set! data (k-get data))
; remove final -e
(when (char=? (list-ref (b-get data) (k-get data))
#\e)
(let ((count (seq-count data)))
(when (or (> count 1)
(and (= count 1)
(not (is-cvc? data (- (k-get data) 1)))))
(k-set! data (- (k-get data) 1)))))
; -ll to -l
(when (and (char=? (list-ref (b-get data) (k-get data))
#\l)
(double-consonant? data (k-get data))
(> (seq-count data) 1))
(k-set! data (- (k-get data) 1))))
;
; - main part of function
;
(let ((word (string-downcase initial-word))) ; TODO - check ASCII
(if (< (string-length word) 3) ; do not process short words
word
(let ((data (new-data (string->list word) 0 (- (string-length word) 1))))
(step-1ab data)
(when (> (k-get data) 0)
(step-1c data)
(step-2 data)
(step-3 data)
(step-4 data)
(step-5 data))
(list->string (take (b-get data) (+ 1 (k-get data))))))))
(define stop-word?
(let ((h (make-hash-table)))
(for-each (cut hash-table-set! h <> #t)
(list "i" "me" "my" "myself" "we" "our" "ours" "ourselves" "you" "your" "yours" "yourself" "yourselves"
"he" "him" "his" "himself" "she" "her" "hers" "herself" "it" "its" "itself" "they" "them" "their"
"theirs" "themselves" "what" "which" "who" "whom" "this" "that" "these" "those" "am" "is" "are"
"was" "were" "be" "been" "being" "have" "has" "had" "having" "do" "does" "did" "doing" "a" "an"
"the" "and" "but" "if" "or" "because" "as" "until" "while" "of" "at" "by" "for" "with" "about"
"against" "between" "into" "through" "during" "before" "after" "above" "below" "to" "from" "up"
"down" "in" "out" "on" "off" "over" "under" "again" "further" "then" "once" "here" "there"
"when" "where" "why" "how" "all" "any" "both" "each" "few" "more" "most" "other" "some" "such"
"no" "nor" "not" "only" "own" "same" "so" "than" "too" "very" "s" "t" "d" "can" "will" "just"
"don" "should" "now"))
(lambda (w)
(hash-table-ref/default h w #f))))