110 lines
4.1 KiB
Scheme
110 lines
4.1 KiB
Scheme
(define-module (recipe-search)
|
|
#:use-module (ice-9 ftw)
|
|
#:use-module (ice-9 textual-ports)
|
|
#:use-module (ice-9 vlist)
|
|
#:use-module (srfi srfi-1) ; lists
|
|
#:use-module (srfi srfi-9) ; records
|
|
#:use-module (srfi srfi-26) ; cut/cute
|
|
#:use-module (srfi srfi-69) ; hash tables
|
|
#:use-module (srfi srfi-71) ; extended let
|
|
#:use-module (text-utils))
|
|
|
|
(define (get-tokens s)
|
|
(map (compose porter-stem remove-diacritics string-downcase)
|
|
(string-tokenize s char-set:letter+digit)))
|
|
|
|
;; The term index is a map from tokens to a list of document ids
|
|
;; containing that token. This is used for retrieving documents contaning
|
|
;; the search term.
|
|
|
|
(define term-index (make-hash-table))
|
|
|
|
(define (add-to-term-index! doc-id terms)
|
|
(for-each (lambda (t)
|
|
(hash-table-set! term-index t (cons doc-id (hash-table-ref/default term-index t '()))))
|
|
(delete-duplicates terms)))
|
|
|
|
(define (retrieve-doc-ids search-terms)
|
|
(delete-duplicates (append-map (cut hash-table-ref/default term-index <> '())
|
|
(remove stop-word? search-terms))))
|
|
|
|
(define (term-frequencies terms)
|
|
(let loop ((terms (sort terms string<?)) (accum '()))
|
|
(if (null? terms)
|
|
(alist->vhash accum)
|
|
(let ((this more (span (cute string=? <> (first terms)) terms)))
|
|
(loop more (acons (car this) (length this) accum))))))
|
|
|
|
(define-record-type <document>
|
|
(make-document id length term-frequencies)
|
|
document?
|
|
(id document-id)
|
|
(length document-length)
|
|
(term-frequencies document-term-frequencies))
|
|
|
|
;; Document index is a map from document id to a <document> record.
|
|
(define document-index (make-hash-table))
|
|
|
|
;; Track number of recipes indexed
|
|
(define num-recipes 0)
|
|
|
|
;; Track total number of tokens (used to compute average doc length)
|
|
(define num-tokens 0)
|
|
|
|
(define (add-to-document-index! doc-id terms)
|
|
(hash-table-set! document-index doc-id (make-document doc-id (length terms) (term-frequencies terms))))
|
|
|
|
(define (index-recipe path)
|
|
(let ((doc-id (basename path))
|
|
(tokens (get-tokens (call-with-input-file path get-string-all))))
|
|
(add-to-document-index! doc-id tokens)
|
|
(add-to-term-index! doc-id tokens)
|
|
(set! num-recipes (1+ num-recipes))
|
|
(set! num-tokens (+ num-tokens (length tokens)))))
|
|
|
|
(define (index-recipes dir)
|
|
(for-each (lambda (filename)
|
|
(index-recipe (string-append dir "/" filename)))
|
|
(scandir dir (cut string-suffix? ".txt" <>))))
|
|
|
|
;; Compute BM25 score for the given document and search terms
|
|
;;
|
|
;; This is based on the description of the algorithm in https://emschwartz.me/understanding-the-bm25-full-text-search-algorithm/
|
|
;;
|
|
|
|
;; k is a tuning parameter that controls how quickly the returns to term repetition diminish
|
|
(define k 1.2)
|
|
|
|
;; b is a tuning parameter that controls how much we nomalize by document length
|
|
(define b 0.75)
|
|
|
|
(define (bm25-score doc-id search-terms)
|
|
(let* ((doc (hash-table-ref document-index doc-id))
|
|
(N num-recipes)
|
|
(n (lambda (q) (length (hash-table-ref/default term-index q '()))))
|
|
(f (lambda (q) (or (and=> (vhash-assoc q (document-term-frequencies doc)) cdr) 0)))
|
|
(|D| (document-length doc))
|
|
(avgdl (/ num-tokens N)))
|
|
(fold + 0.0 (map (lambda (q)
|
|
(* (log (+ 1.0 (/ (+ N (- (n q)) 0.5)
|
|
(+ (n q) 0.5))))
|
|
(/ (* (f q) (+ k 1.0))
|
|
(* (+ (f q) k) (+ 1.0 (- b) (/ (* b |D|) avgdl))))))
|
|
search-terms))))
|
|
|
|
(define* (find-recipes search-str #:key (num-results 5))
|
|
(let* ((search-terms (delete-duplicates (get-tokens search-str)))
|
|
(doc-ids (retrieve-doc-ids search-terms)))
|
|
(when (zero? (length doc-ids))
|
|
(error "No results found. Please try a different search term"))
|
|
(take
|
|
(sort
|
|
(map (lambda (doc-id)
|
|
(cons doc-id (bm25-score doc-id search-terms)))
|
|
doc-ids)
|
|
(lambda (x y) (> (cdr x) (cdr y))))
|
|
(min num-results (length doc-ids)))))
|
|
|
|
(define (main args)
|
|
(index-recipes "./recipes")
|
|
(find-recipes "broccoli and stilton soup"))
|