recipe-search-tech-test/recipe-search.scm
2025-07-03 15:44:00 +01:00

110 lines
4.1 KiB
Scheme

(define-module (recipe-search)
#:use-module (ice-9 ftw)
#:use-module (ice-9 textual-ports)
#:use-module (ice-9 vlist)
#:use-module (srfi srfi-1) ; lists
#:use-module (srfi srfi-9) ; records
#:use-module (srfi srfi-26) ; cut/cute
#:use-module (srfi srfi-69) ; hash tables
#:use-module (srfi srfi-71) ; extended let
#:use-module (text-utils))
(define (get-tokens s)
(map (compose porter-stem remove-diacritics string-downcase)
(string-tokenize s char-set:letter+digit)))
;; The term index is a map from tokens to a list of document ids
;; containing that token. This is used for retrieving documents contaning
;; the search term.
(define term-index (make-hash-table))
(define (add-to-term-index! doc-id terms)
(for-each (lambda (t)
(hash-table-set! term-index t (cons doc-id (hash-table-ref/default term-index t '()))))
(delete-duplicates terms)))
(define (retrieve-doc-ids search-terms)
(delete-duplicates (append-map (cut hash-table-ref/default term-index <> '())
(remove stop-word? search-terms))))
(define (term-frequencies terms)
(let loop ((terms (sort terms string<?)) (accum '()))
(if (null? terms)
(alist->vhash accum)
(let ((this more (span (cute string=? <> (first terms)) terms)))
(loop more (acons (car this) (length this) accum))))))
(define-record-type <document>
(make-document id length term-frequencies)
document?
(id document-id)
(length document-length)
(term-frequencies document-term-frequencies))
;; Document index is a map from document id to a <document> record.
(define document-index (make-hash-table))
;; Track number of recipes indexed
(define num-recipes 0)
;; Track total number of tokens (used to compute average doc length)
(define num-tokens 0)
(define (add-to-document-index! doc-id terms)
(hash-table-set! document-index doc-id (make-document doc-id (length terms) (term-frequencies terms))))
(define (index-recipe path)
(let ((doc-id (basename path))
(tokens (get-tokens (call-with-input-file path get-string-all))))
(add-to-document-index! doc-id tokens)
(add-to-term-index! doc-id tokens)
(set! num-recipes (1+ num-recipes))
(set! num-tokens (+ num-tokens (length tokens)))))
(define (index-recipes dir)
(for-each (lambda (filename)
(index-recipe (string-append dir "/" filename)))
(scandir dir (cut string-suffix? ".txt" <>))))
;; Compute BM25 score for the given document and search terms
;;
;; This is based on the description of the algorithm in https://emschwartz.me/understanding-the-bm25-full-text-search-algorithm/
;;
;; k is a tuning parameter that controls how quickly the returns to term repetition diminish
(define k 1.2)
;; b is a tuning parameter that controls how much we nomalize by document length
(define b 0.75)
(define (bm25-score doc-id search-terms)
(let* ((doc (hash-table-ref document-index doc-id))
(N num-recipes)
(n (lambda (q) (length (hash-table-ref/default term-index q '()))))
(f (lambda (q) (or (and=> (vhash-assoc q (document-term-frequencies doc)) cdr) 0)))
(|D| (document-length doc))
(avgdl (/ num-tokens N)))
(fold + 0.0 (map (lambda (q)
(* (log (+ 1.0 (/ (+ N (- (n q)) 0.5)
(+ (n q) 0.5))))
(/ (* (f q) (+ k 1.0))
(* (+ (f q) k) (+ 1.0 (- b) (/ (* b |D|) avgdl))))))
search-terms))))
(define* (find-recipes search-str #:key (num-results 5))
(let* ((search-terms (delete-duplicates (get-tokens search-str)))
(doc-ids (retrieve-doc-ids search-terms)))
(when (zero? (length doc-ids))
(error "No results found. Please try a different search term"))
(take
(sort
(map (lambda (doc-id)
(cons doc-id (bm25-score doc-id search-terms)))
doc-ids)
(lambda (x y) (> (cdr x) (cdr y))))
(min num-results (length doc-ids)))))
(define (main args)
(index-recipes "./recipes")
(find-recipes "broccoli and stilton soup"))