diff --git a/recipe-search.scm b/recipe-search.scm
index acf905a..c19935a 100644
--- a/recipe-search.scm
+++ b/recipe-search.scm
@@ -1,82 +1,110 @@
 (define-module (recipe-search)
   #:use-module (ice-9 ftw)
   #:use-module (ice-9 textual-ports)
+  #:use-module (ice-9 vlist)
   #:use-module (srfi srfi-1)  ; lists
+  #:use-module (srfi srfi-9)  ; records
   #:use-module (srfi srfi-26) ; cut/cute
   #:use-module (srfi srfi-69) ; hash tables
+  #:use-module (srfi srfi-71) ; extended let
   #:use-module (text-utils))
 
 (define (get-tokens s)
-  (map porter-stem
-       (remove stop-word?
-               (map (compose remove-diacritics string-downcase)
-                    (string-tokenize s char-set:letter)))))
+  (map (compose porter-stem remove-diacritics string-downcase)
+       (string-tokenize s char-set:letter+digit)))
 
-(define (digrams xs)
-  (if (< (length xs) 2)
-      '()
-      (map list xs (cdr xs))))
+;; The term index is a map from each token to the list of document ids
+;; containing that token. It is used to retrieve candidate documents for
+;; the search terms.
 
-(define (trigrams xs)
-  (if (< (length xs) 3)
-      '()
-      (map list xs (cdr xs) (cddr xs))))
+(define term-index (make-hash-table))
 
-(define stem-index (make-hash-table))
-(define digram-index (make-hash-table))
-(define trigram-index (make-hash-table))
+(define (add-to-term-index! doc-id terms)
+  (for-each (lambda (t)
+              (hash-table-set! term-index t (cons doc-id (hash-table-ref/default term-index t '()))))
+            (delete-duplicates terms)))
 
-(define (add-to-index! ix term val)
-  (let ((curr-val (hash-table-ref/default ix term '())))
-    (hash-table-set! ix term (cons val curr-val))))
+;; Return the ids of all documents containing at least one search term.
+(define (retrieve-doc-ids search-terms)
+  (delete-duplicates (append-map (cut hash-table-ref/default term-index <> '())
+                                 (remove stop-word? search-terms))))
 
-(define (frequencies xs)
-  (let ((result (make-hash-table)))
-    (for-each (lambda (x) (hash-table-set! result x (1+ (hash-table-ref/default result x 0)))) xs)
-    (hash-table->alist result)))
+;; Map a list of terms to a vhash of term -> number of occurrences.
+(define (term-frequencies terms)
+  (let loop ((terms (sort terms string<?))
+             (accum '()))
+    (if (null? terms)
+        (alist->vhash accum)
+        (let ((this more (span (cute string=? <> (first terms)) terms)))
+          (loop more (acons (car this) (length this) accum))))))
 
-(define (sort-by-frequency xs)
-  (map car
-       (sort (frequencies xs)
-             (lambda (a b) (> (cdr a) (cdr b))))))
+(define-record-type <document>
+  (make-document id length term-frequencies)
+  document?
+  (id document-id)
+  (length document-length)
+  (term-frequencies document-term-frequencies))
 
-(define (search-index ix xs)
-  (sort-by-frequency (append-map (cut hash-table-ref/default ix <> '()) (delete-duplicates xs))))
+;; The document index is a map from document id to a <document> record.
+(define document-index (make-hash-table))
 
-(define (search-stem-index tokens)
-  (search-index stem-index tokens))
+;; Track the number of recipes indexed
+(define num-recipes 0)
 
-(define (search-digram-index tokens)
-  (search-index digram-index (digrams tokens)))
+;; Track the total number of tokens (used to compute the average document length)
+(define num-tokens 0)
 
-(define (search-trigram-index tokens)
-  (search-index trigram-index (trigrams tokens)))
+(define (add-to-document-index! doc-id terms)
+  (hash-table-set! document-index doc-id (make-document doc-id (length terms) (term-frequencies terms))))
 
-(define (load-recipe path)
-  (let ((name (basename path))
+(define (index-recipe path)
+  (let ((doc-id (basename path))
        (tokens (get-tokens (call-with-input-file path get-string-all))))
-    (for-each (cut add-to-index! stem-index <> name) (delete-duplicates tokens))
-    (for-each (cut add-to-index! digram-index <> name) (delete-duplicates (digrams tokens)))
-    (for-each (cut add-to-index! trigram-index <> name) (delete-duplicates (trigrams tokens)))))
+    (add-to-document-index! doc-id tokens)
+    (add-to-term-index! doc-id tokens)
+    (set! num-recipes (1+ num-recipes))
+    (set! num-tokens (+ num-tokens (length tokens)))))
 
-(define (load-recipes dir)
+(define (index-recipes dir)
   (for-each (lambda (filename)
-              (load-recipe (string-append dir "/" filename)))
+              (index-recipe (string-append dir "/" filename)))
            (scandir dir (cut string-suffix? ".txt" <>))))
 
-(define* (find-recipes search-term #:key (num-results 5))
-  (let ((tokens (get-tokens search-term)))
-    (when (zero? (length tokens))
-      (error "Please try a more specific search term"))
-    (let ((result (search-trigram-index tokens)))
-      (if (>= (length result) num-results)
-          (take result num-results)
-          (let ((result (delete-duplicates (append result (search-digram-index tokens)))))
-            (if (>= (length result) num-results)
-                (take result num-results)
-                (let ((result (delete-duplicates (append result (search-stem-index tokens)))))
-                  (take result (min (length result) num-results)))))))))
+;; Compute the BM25 score for the given document and search terms.
+;;
+;; This is based on the description of the algorithm in
+;; https://emschwartz.me/understanding-the-bm25-full-text-search-algorithm/
+;;
+
+;; k is a tuning parameter that controls how quickly repeated occurrences of a
+;; term give diminishing returns.
+(define k 1.2)
+
+;; b is a tuning parameter that controls how much we normalize by document length.
+(define b 0.75)
+
+(define (bm25-score doc-id search-terms)
+  (let* ((doc (hash-table-ref document-index doc-id))
+         (N num-recipes)
+         (n (lambda (q) (length (hash-table-ref/default term-index q '()))))
+         (f (lambda (q) (or (and=> (vhash-assoc q (document-term-frequencies doc)) cdr) 0)))
+         (|D| (document-length doc))
+         (avgdl (/ num-tokens N)))
+    (fold + 0.0
+          (map (lambda (q)
+                 (* (log (+ 1.0 (/ (+ N (- (n q)) 0.5)
+                                   (+ (n q) 0.5))))
+                    (/ (* (f q) (+ k 1.0))
+                       (+ (f q) (* k (+ 1.0 (- b) (/ (* b |D|) avgdl)))))))
+               search-terms))))
+
+(define* (find-recipes search-str #:key (num-results 5))
+  (let* ((search-terms (delete-duplicates (get-tokens search-str)))
+         (doc-ids (retrieve-doc-ids search-terms)))
+    (when (null? doc-ids)
+      (error "No results found. Please try a different search term"))
+    (take
+     (sort
+      (map (lambda (doc-id)
+             (cons doc-id (bm25-score doc-id search-terms)))
+           doc-ids)
+      (lambda (x y) (> (cdr x) (cdr y))))
+     (min num-results (length doc-ids)))))
 
 (define (main args)
-  (load-recipes "./recipes")
+  (index-recipes "./recipes")
   (find-recipes "broccoli and stilton soup"))
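
Note for reviewers: `bm25-score` computes, per query term q, IDF(q) * f(q,D)*(k+1) / (f(q,D) + k*(1 - b + b*|D|/avgdl)), with IDF(q) = ln(1 + (N - n(q) + 0.5)/(n(q) + 0.5)), which is the form described in the linked article. Below is a rough usage sketch of the new entry points from a Guile REPL; it assumes `index-recipes` and `find-recipes` end up exported (the patch does not add an #:export clause) and that recipes live as .txt files under ./recipes. The result shape in the comments is inferred from `find-recipes` above, not taken from an actual run.

;; Hypothetical usage sketch (not part of the patch).
(use-modules (recipe-search))

;; Build the term and document indexes from ./recipes/*.txt.
(index-recipes "./recipes")

;; Returns up to 5 (doc-id . bm25-score) pairs, best match first.
(find-recipes "broccoli and stilton soup")

;; #:num-results caps how many ranked results come back.
(find-recipes "leek and potato" #:num-results 3)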