From 68c165faced621378ce88957f3bf15f2769aecde Mon Sep 17 00:00:00 2001
From: Ray Miller
Date: Tue, 1 Jul 2025 18:01:08 +0100
Subject: [PATCH] First pass at a search solution.

---
 recipe-search.scm |  82 +++++++++++
 text-utils.scm    | 339 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 421 insertions(+)
 create mode 100644 recipe-search.scm
 create mode 100644 text-utils.scm

diff --git a/recipe-search.scm b/recipe-search.scm
new file mode 100644
index 0000000..acf905a
--- /dev/null
+++ b/recipe-search.scm
@@ -0,0 +1,82 @@
+(define-module (recipe-search)
+  #:use-module (ice-9 ftw)
+  #:use-module (ice-9 textual-ports)
+  #:use-module (srfi srfi-1)  ; lists
+  #:use-module (srfi srfi-26) ; cut/cute
+  #:use-module (srfi srfi-69) ; hash tables
+  #:use-module (text-utils))
+
+(define (get-tokens s)
+  (map porter-stem
+       (remove stop-word?
+               (map (compose remove-diacritics string-downcase)
+                    (string-tokenize s char-set:letter)))))
+
+(define (digrams xs)
+  (if (< (length xs) 2)
+      '()
+      (map list xs (cdr xs))))
+
+(define (trigrams xs)
+  (if (< (length xs) 3)
+      '()
+      (map list xs (cdr xs) (cddr xs))))
+
+(define stem-index (make-hash-table))
+(define digram-index (make-hash-table))
+(define trigram-index (make-hash-table))
+
+(define (add-to-index! ix term val)
+  (let ((curr-val (hash-table-ref/default ix term '())))
+    (hash-table-set! ix term (cons val curr-val))))
+
+(define (frequencies xs)
+  (let ((result (make-hash-table)))
+    (for-each (lambda (x) (hash-table-set! result x (1+ (hash-table-ref/default result x 0)))) xs)
+    (hash-table->alist result)))
+
+(define (sort-by-frequency xs)
+  (map car
+       (sort (frequencies xs)
+             (lambda (a b) (> (cdr a) (cdr b))))))
+
+(define (search-index ix xs)
+  (sort-by-frequency (append-map (cut hash-table-ref/default ix <> '()) (delete-duplicates xs))))
+
+(define (search-stem-index tokens)
+  (search-index stem-index tokens))
+
+(define (search-digram-index tokens)
+  (search-index digram-index (digrams tokens)))
+
+(define (search-trigram-index tokens)
+  (search-index trigram-index (trigrams tokens)))
+
+(define (load-recipe path)
+  (let ((name (basename path))
+        (tokens (get-tokens (call-with-input-file path get-string-all))))
+    (for-each (cut add-to-index! stem-index <> name) (delete-duplicates tokens))
+    (for-each (cut add-to-index! digram-index <> name) (delete-duplicates (digrams tokens)))
+    (for-each (cut add-to-index! trigram-index <> name) (delete-duplicates (trigrams tokens)))))
+
+(define (load-recipes dir)
+  (for-each (lambda (filename)
+              (load-recipe (string-append dir "/" filename)))
+            (scandir dir (cut string-suffix? ".txt" <>))))
+
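+;; Illustrative sketch (comments only, not exercised by the module): the
+;; n-gram helpers above turn a token list into overlapping pairs/triples,
+;; e.g.
+;;
+;;   (digrams  '("broccoli" "stilton" "soup"))
+;;   => (("broccoli" "stilton") ("stilton" "soup"))
+;;   (trigrams '("broccoli" "stilton" "soup"))
+;;   => (("broccoli" "stilton" "soup"))
+;;
+;; find-recipes below queries the most specific index first (trigrams) and
+;; only falls back to digrams, and then to single stems, when too few
+;; matches are found.
+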
+(define* (find-recipes search-term #:key (num-results 5))
+  (let ((tokens (get-tokens search-term)))
+    (when (null? tokens)
+      (error "Please try a more specific search term"))
+    (let ((result (search-trigram-index tokens)))
+      (if (>= (length result) num-results)
+          (take result num-results)
+          (let ((result (delete-duplicates (append result (search-digram-index tokens)))))
+            (if (>= (length result) num-results)
+                (take result num-results)
+                (let ((result (delete-duplicates (append result (search-stem-index tokens)))))
+                  (take result (min (length result) num-results)))))))))
+
+(define (main args)
+  (load-recipes "./recipes")
+  (for-each (lambda (recipe) (display recipe) (newline))
+            (find-recipes "broccoli and stilton soup")))
diff --git a/text-utils.scm b/text-utils.scm
new file mode 100644
index 0000000..60e4632
--- /dev/null
+++ b/text-utils.scm
@@ -0,0 +1,339 @@
+(define-module (text-utils)
+  #:use-module (srfi srfi-1)  ; lists
+  #:use-module (srfi srfi-9)  ; records
+  #:use-module (srfi srfi-26) ; cut/cute
+  #:use-module (srfi srfi-69) ; hash tables
+  #:use-module (rnrs unicode)
+  #:export (remove-diacritics
+            porter-stem
+            stop-word?))
+
+(define (remove-diacritics str)
+  ;; Helper function for checking if a character is a combining mark.
+  ;; (rnrs unicode) provides `char-general-category` which returns a symbol
+  ;; representing the character's general category. Combining marks generally
+  ;; fall under the 'Mn' (Mark, Nonspacing) category.
+  (define (char-combining-mark? char)
+    (eq? (char-general-category char) 'Mn))
+  (let* ((nfd-str (string-normalize-nfd str)) ; Step 1: Normalize to NFD
+         (chars (string->list nfd-str)))      ; Convert to a list of characters
+    (list->string
+     (filter (lambda (char)
+               (not (char-combining-mark? char))) ; Step 2: Filter out combining marks
+             chars))))
+
+;; Porter stemming is a popular stemming algorithm used for English.
+;;
+;; For a complete description, see:
+;;
+;; Porter, 1980, "An algorithm for suffix stripping,"
+;; _Program_, Vol. 14, no. 3, pp 130-137.
+;;
+;; This implementation by Peter Lane, https://codeberg.org/peterlane/r7rs-libs/src/branch/trunk/robin/text.sld
+;;
+;; The stemmer is designed for English words, so the input is expected to be
+;; plain ASCII; this is not currently checked (see the TODO in the main body
+;; below).
+(define (porter-stem initial-word)
+  ;
+  ; Data structure holds the word and information on where we are in
+  ; processing it.
+  ;
+  ; b: list of the characters of the word we are stemming
+  ; j: an alternate 'end point' in the list, used, e.g., after searching
+  ;    to mark where a suffix begins
+  ; k: the end of the list
+  (define-record-type <data>
+    (new-data b j k)
+    data?
+    (b b-get)
+    (j j-get j-set!)
+    (k k-get k-set!))
+  ;
+  ; checks if there is a double consonant at position i,
+  ; with i being the second consonant's index
+  (define (double-consonant? data i)
+    (and (> i 0)
+         (char=? (list-ref (b-get data) i)
+                 (list-ref (b-get data) (- i 1)))
+         (is-consonant? data i)))
+  ;
+  ; returns true if the suffix is found at the end of the current
+  ; list of characters, with k holding the last index.
+  ; sets j to the end point, as if the suffix is removed.
+  (define (ends-with? data suffix)
+    (let ((suffix-array (string->list suffix))
+          (suffix-length (string-length suffix)))
+      (if (and (<= suffix-length (k-get data))
+               (equal? (take-right (take (b-get data) (+ 1 (k-get data))) suffix-length)
+                       suffix-array))
+          (begin
+            (j-set! data (- (k-get data) suffix-length))
+            #t)
+          #f)))
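+  ;
+  ; Illustrative example (comment only): with the word "hopping" in `data`,
+  ; (ends-with? data "ing") returns #t and, as a side effect, points j at
+  ; the second "p", so a subsequent call to set-to (defined below) replaces
+  ; the "ing" suffix in place.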
+  ;
+  ; Checks if there is a consonant at position i
+  ; -- note special handling of 'y', which is treated
+  ;    as a consonant at start of word or after a vowel.
+  (define (is-consonant? data i)
+    (cond ((member (list-ref (b-get data) i)
+                   '(#\a #\e #\i #\o #\u))
+           #f)
+          ((char=? (list-ref (b-get data) i)
+                   #\y)
+           (or (= i 0)
+               (not (is-consonant? data (- i 1)))))
+          (else
+           #t)))
+  ;
+  ; cvc is true if i-2, i-1, i has form consonant, vowel, consonant
+  ; (and last must not be w, x, or y)
+  (define (is-cvc? data i)
+    (cond ((or (< i 2)
+               (not (is-consonant? data i))
+               (is-consonant? data (- i 1))
+               (not (is-consonant? data (- i 2))))
+           #f)
+          ((let ((ch (list-ref (b-get data) i)))
+             (or (char=? ch #\w)
+                 (char=? ch #\x)
+                 (char=? ch #\y)))
+           #f)
+          (else
+           #t)))
+  ;
+  ; Given a set of letter=>[(suffix, replacement)] pairs and a letter to match,
+  ; applies replacements for the given letter
+  (define (make-replacements data letter replacements)
+    (for-each (lambda (rule)
+                (when (char=? letter (car rule))
+                  (for-each (lambda (pairing)
+                              (when (ends-with? data (car pairing))
+                                (set-to data (cadr pairing) #t)))
+                            (cdr rule))))
+              replacements))
+  ;
+  ; counts the number of vowel-consonant sequences between 0 and j
+  (define (seq-count data)
+    (let ((count 0)
+          (i 0))
+      ; find index of first vowel, by skipping consonants
+      (let loop ()
+        (when (and (<= i (j-get data))
+                   (is-consonant? data i))
+          (set! i (+ 1 i))
+          (loop)))
+      ; repeat CV search to get full count until all of range is seen
+      (let loop ()
+        (when (<= i (j-get data))
+          ; find index of next consonant, by skipping non-consonants
+          (let loop-consonant ()
+            (when (and (<= i (j-get data))
+                       (not (is-consonant? data i)))
+              (set! i (+ 1 i))
+              (loop-consonant)))
+          ; VC pair found with more to come, so increment count
+          (when (<= i (j-get data))
+            (set! count (+ 1 count)))
+          ; find index of next vowel, by skipping consonants
+          (let loop-vowel ()
+            (when (and (<= i (j-get data))
+                       (is-consonant? data i))
+              (set! i (+ 1 i))
+              (loop-vowel)))
+          (loop)))
+      ;
+      count))
+  ;
+  ; sets positions j+1 to k to the suffix,
+  ; adjusting k to j+length(suffix) (the new end of the word)
+  ; -- supports a flag for only allowing change if seq_count > 0
+  ;    this prevents words becoming too small when suffix changed
+  (define set-to
+    (case-lambda
+      ((data suffix)
+       (set-to data suffix #f))
+      ((data suffix restrict)
+       (when (or (not restrict)
+                 (> (seq-count data) 0))
+         (for-each (lambda (index value)
+                     (list-set! (b-get data) (+ 1 (j-get data) index) value))
+                   (iota (string-length suffix))
+                   (string->list suffix))
+         (k-set! data (+ (j-get data)
+                         (string-length suffix)))))))
+  ;
+  ; looks for a vowel in range 0 to j
+  (define (vowel-in-stem? data)
+    (any (lambda (i) (not (is-consonant? data i)))
+         (iota (+ 1 (j-get data)))))
+  ;
+  (define STEP-2-RULES (list (cons #\a '(("ational" "ate") ("tional" "tion")))
+                             (cons #\c '(("enci" "ence") ("anci" "ance")))
+                             (cons #\e '(("izer" "ize")))
+                             (cons #\g '(("logi" "log")))
+                             (cons #\l '(("bli" "ble") ("alli" "al") ("entli" "ent") ("eli" "e") ("ousli" "ous")))
+                             (cons #\o '(("ization" "ize") ("ation" "ate") ("ator" "ate")))
+                             (cons #\s '(("alism" "al") ("iveness" "ive") ("fulness" "ful") ("ousness" "ous")))
+                             (cons #\t '(("aliti" "al") ("iviti" "ive") ("biliti" "ble")))))
+  ;
+  (define STEP-3-RULES (list (cons #\e '(("icate" "ic") ("ative" "") ("alize" "al")))
+                             (cons #\i '(("iciti" "ic")))
+                             (cons #\l '(("ical" "ic") ("ful" "")))
+                             (cons #\s '(("ness" "")))))
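+  ;
+  ; Worked examples (comments only), taken from Porter's 1980 paper, of what
+  ; the steps below are expected to do:
+  ;   step 1a:  "caresses"   -> "caress",  "ponies" -> "poni"
+  ;   step 1b:  "hopping"    -> "hop",     "agreed" -> "agree"
+  ;   step 2:   "relational" -> "relate"
+  ;   step 4:   "adjustable" -> "adjust"  (only when enough of a stem remains)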
data "ies") + (set-to data "i")) + ((not (char=? (list-ref (b-get data) (- (k-get data) 1)) + #\s)) + (k-set! data (- (k-get data) 1))))) + (if (ends-with? data "eed") + (when (> (seq-count data) 0) + (k-set! data (- (k-get data) 1))) + (when (and (or (ends-with? data "ed") + (ends-with? data "ing")) + (vowel-in-stem? data)) + (k-set! data (j-get data)) + (cond ((ends-with? data "at") + (set-to data "ate")) + ((ends-with? data "bl") + (set-to data "ble")) + ((ends-with? data "iz") + (set-to data "ize")) + ((double-consonant? data (k-get data)) + (k-set! data (- (k-get data) 1)) + (let ((ch (list-ref (b-get data) (k-get data)))) + (when (or (char=? #\l ch) + (char=? #\s ch) + (char=? #\z ch)) + (k-set! data (+ (k-get data) 1))))) + ((and (= 1 (seq-count data)) + (is-cvc? data (k-get data))) + (set-to data "e")))))) + ; + ; turns terminal 'y' to 'i' when there is another vowel in stem + (define (step-1c data) + (when (and (ends-with? data "y") + (vowel-in-stem? data)) + (list-set! (b-get data) (k-get data) #\i))) + ; + ; double suffixes are mapped to single (shorter) ones + (define (step-2 data) + (make-replacements data + (list-ref (b-get data) (- (k-get data) 1)) + STEP-2-RULES)) + ; + ; deals with -ic -full -ness + (define (step-3 data) + (make-replacements data + (list-ref (b-get data) (k-get data)) + STEP-3-RULES)) + ; + ; removes -ant -ence when seq-count is 2 + (define (step-4 data) + (call-with-current-continuation + (lambda (return) + (let ((letter (list-ref (b-get data) (- (k-get data) 1)))) ; penultimate letter + (cond ((char=? letter #\a) + (when (not (ends-with? data "al")) + (return))) + ((char=? letter #\c) + (when (and (not (ends-with? data "ance")) + (not (ends-with? data "ence"))) + (return))) + ((char=? letter #\e) + (when (not (ends-with? data "er")) + (return))) + ((char=? letter #\i) + (when (not (ends-with? data "ic")) + (return))) + ((char=? letter #\l) + (when (and (not (ends-with? data "able")) + (not (ends-with? data "ible"))) + (return))) + ((char=? letter #\n) + (when (and (not (ends-with? data "ant")) + (not (ends-with? data "ement")) + (not (ends-with? data "ment")) + (not (ends-with? data "ent"))) + (return))) + ((char=? letter #\o) + (if (and (ends-with? data "ion") + (or (char=? (list-ref (b-get data) (j-get data)) #\s) + (char=? (list-ref (b-get data) (j-get data)) #\t))) + '() ; break + (when (not (ends-with? data "ou")) + (return)))) + ((char=? letter #\s) + (when (not (ends-with? data "ism")) + (return))) + ((char=? letter #\t) + (when (and (not (ends-with? data "ate")) + (not (ends-with? data "iti"))) + (return))) + ((char=? letter #\u) + (when (not (ends-with? data "ous")) + (return))) + ((char=? letter #\v) + (when (not (ends-with? data "ive")) + (return))) + ((char=? letter #\z) + (when (not (ends-with? data "ize")) + (return))) + (else + (return))) + (when (> (seq-count data) 1) + (k-set! data (j-get data))))))) + ; + ; if seq-count > 1, removes a final -e and changes -ll to -l + (define (step-5 data) + (j-set! data (k-get data)) + ; remove final -e + (when (char=? (list-ref (b-get data) (k-get data)) + #\e) + (let ((count (seq-count data))) + (when (or (> count 1) + (and (= count 1) + (not (is-cvc? data (- (k-get data) 1))))) + (k-set! data (- (k-get data) 1))))) + ; -ll to -l + (when (and (char=? (list-ref (b-get data) (k-get data)) + #\l) + (double-consonant? data (k-get data)) + (> (seq-count data) 1)) + (k-set! 
+  ;
+  ; - main part of function
+  ;
+  (let ((word (string-downcase initial-word))) ; TODO - check ASCII
+    (if (< (string-length word) 3) ; do not process short words
+        word
+        (let ((data (new-data (string->list word) 0 (- (string-length word) 1))))
+          (step-1ab data)
+          (when (> (k-get data) 0)
+            (step-1c data)
+            (step-2 data)
+            (step-3 data)
+            (step-4 data)
+            (step-5 data))
+          (list->string (take (b-get data) (+ 1 (k-get data))))))))
+
+(define stop-word?
+  (let ((h (make-hash-table)))
+    (for-each (cut hash-table-set! h <> #t)
+              (list "i" "me" "my" "myself" "we" "our" "ours" "ourselves" "you" "your" "yours" "yourself" "yourselves"
+                    "he" "him" "his" "himself" "she" "her" "hers" "herself" "it" "its" "itself" "they" "them" "their"
+                    "theirs" "themselves" "what" "which" "who" "whom" "this" "that" "these" "those" "am" "is" "are"
+                    "was" "were" "be" "been" "being" "have" "has" "had" "having" "do" "does" "did" "doing" "a" "an"
+                    "the" "and" "but" "if" "or" "because" "as" "until" "while" "of" "at" "by" "for" "with" "about"
+                    "against" "between" "into" "through" "during" "before" "after" "above" "below" "to" "from" "up"
+                    "down" "in" "out" "on" "off" "over" "under" "again" "further" "then" "once" "here" "there"
+                    "when" "where" "why" "how" "all" "any" "both" "each" "few" "more" "most" "other" "some" "such"
+                    "no" "nor" "not" "only" "own" "same" "so" "than" "too" "very" "s" "t" "d" "can" "will" "just"
+                    "don" "should" "now"))
+    (lambda (w)
+      (hash-table-ref/default h w #f))))
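+
+;; Usage sketch (illustrative only; these expressions are not evaluated here):
+;;   (remove-diacritics "crème brûlée") ; => "creme brulee"
+;;   (stop-word? "the")                 ; => #t
+;;   (porter-stem "caresses")           ; => "caress"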