(define-module (recipe-search) #:use-module (ice-9 ftw) #:use-module (ice-9 textual-ports) #:use-module (ice-9 vlist) #:use-module (srfi srfi-1) ; lists #:use-module (srfi srfi-9) ; records #:use-module (srfi srfi-26) ; cut/cute #:use-module (srfi srfi-69) ; hash tables #:use-module (srfi srfi-71) ; extended let #:use-module (text-utils)) (define (get-tokens s) (map (compose porter-stem remove-diacritics string-downcase) (string-tokenize s char-set:letter+digit))) ;; The term index is a map from tokens to a list of document ids ;; containing that token. This is used for retrieving documents contaning ;; the search term. (define term-index (make-hash-table)) (define (add-to-term-index! doc-id terms) (for-each (lambda (t) (hash-table-set! term-index t (cons doc-id (hash-table-ref/default term-index t '())))) (delete-duplicates terms))) (define (retrieve-doc-ids search-terms) (delete-duplicates (append-map (cut hash-table-ref/default term-index <> '()) (remove stop-word? search-terms)))) (define (term-frequencies terms) (let loop ((terms (sort terms stringvhash accum) (let ((this more (span (cute string=? <> (first terms)) terms))) (loop more (acons (car this) (length this) accum)))))) (define-record-type (make-document id length term-frequencies) document? (id document-id) (length document-length) (term-frequencies document-term-frequencies)) ;; Document index is a map from document id to a record. (define document-index (make-hash-table)) ;; Track number of recipes indexed (define num-recipes 0) ;; Track total number of tokens (used to compute average doc length) (define num-tokens 0) (define (add-to-document-index! doc-id terms) (hash-table-set! document-index doc-id (make-document doc-id (length terms) (term-frequencies terms)))) (define (index-recipe path) (let ((doc-id (basename path)) (tokens (get-tokens (call-with-input-file path get-string-all)))) (add-to-document-index! doc-id tokens) (add-to-term-index! doc-id tokens) (set! num-recipes (1+ num-recipes)) (set! num-tokens (+ num-tokens (length tokens))))) (define (index-recipes dir) (for-each (lambda (filename) (index-recipe (string-append dir "/" filename))) (scandir dir (cut string-suffix? ".txt" <>)))) ;; Compute BM25 score for the given document and search terms ;; ;; This is based on the description of the algorithm in https://emschwartz.me/understanding-the-bm25-full-text-search-algorithm/ ;; ;; k is a tuning parameter that controls how quickly the returns to term repetition diminish (define k 1.2) ;; b is a tuning parameter that controls how much we nomalize by document length (define b 0.75) (define (bm25-score doc-id search-terms) (let* ((doc (hash-table-ref document-index doc-id)) (N num-recipes) (n (lambda (q) (length (hash-table-ref/default term-index q '())))) (f (lambda (q) (or (and=> (vhash-assoc q (document-term-frequencies doc)) cdr) 0))) (|D| (document-length doc)) (avgdl (/ num-tokens N))) (fold + 0.0 (map (lambda (q) (* (log (+ 1.0 (/ (+ N (- (n q)) 0.5) (+ (n q) 0.5)))) (/ (* (f q) (+ k 1.0)) (* (+ (f q) k) (+ 1.0 (- b) (/ (* b |D|) avgdl)))))) search-terms)))) (define* (find-recipes search-str #:key (num-results 5)) (let* ((search-terms (delete-duplicates (get-tokens search-str))) (doc-ids (retrieve-doc-ids search-terms))) (when (zero? (length doc-ids)) (error "No results found. Please try a different search term")) (take (sort (map (lambda (doc-id) (cons doc-id (bm25-score doc-id search-terms))) doc-ids) (lambda (x y) (> (cdr x) (cdr y)))) (min num-results (length doc-ids))))) (define (main args) (index-recipes "./recipes") (find-recipes "broccoli and stilton soup"))