Implement BM25 algorithm for ranking results

This commit is contained in:
Ray Miller 2025-07-03 15:44:00 +01:00
parent b8d3ae0f5f
commit d99b3dac00

View file

@ -1,82 +1,110 @@
(define-module (recipe-search) (define-module (recipe-search)
#:use-module (ice-9 ftw) #:use-module (ice-9 ftw)
#:use-module (ice-9 textual-ports) #:use-module (ice-9 textual-ports)
#:use-module (ice-9 vlist)
#:use-module (srfi srfi-1) ; lists #:use-module (srfi srfi-1) ; lists
#:use-module (srfi srfi-9) ; records
#:use-module (srfi srfi-26) ; cut/cute #:use-module (srfi srfi-26) ; cut/cute
#:use-module (srfi srfi-69) ; hash tables #:use-module (srfi srfi-69) ; hash tables
#:use-module (srfi srfi-71) ; extended let
#:use-module (text-utils)) #:use-module (text-utils))
(define (get-tokens s) (define (get-tokens s)
(map porter-stem (map (compose porter-stem remove-diacritics string-downcase)
(remove stop-word? (string-tokenize s char-set:letter+digit)))
(map (compose remove-diacritics string-downcase)
(string-tokenize s char-set:letter)))))
(define (digrams xs) ;; The term index is a map from tokens to a list of document ids
(if (< (length xs) 2) ;; containing that token. This is used for retrieving documents contaning
'() ;; the search term.
(map list xs (cdr xs))))
(define (trigrams xs) (define term-index (make-hash-table))
(if (< (length xs) 3)
'()
(map list xs (cdr xs) (cddr xs))))
(define stem-index (make-hash-table)) (define (add-to-term-index! doc-id terms)
(define digram-index (make-hash-table)) (for-each (lambda (t)
(define trigram-index (make-hash-table)) (hash-table-set! term-index t (cons doc-id (hash-table-ref/default term-index t '()))))
(delete-duplicates terms)))
(define (add-to-index! ix term val) (define (retrieve-doc-ids search-terms)
(let ((curr-val (hash-table-ref/default ix term '()))) (delete-duplicates (append-map (cut hash-table-ref/default term-index <> '())
(hash-table-set! ix term (cons val curr-val)))) (remove stop-word? search-terms))))
(define (frequencies xs) (define (term-frequencies terms)
(let ((result (make-hash-table))) (let loop ((terms (sort terms string<?)) (accum '()))
(for-each (lambda (x) (hash-table-set! result x (1+ (hash-table-ref/default result x 0)))) xs) (if (null? terms)
(hash-table->alist result))) (alist->vhash accum)
(let ((this more (span (cute string=? <> (first terms)) terms)))
(loop more (acons (car this) (length this) accum))))))
(define (sort-by-frequency xs) (define-record-type <document>
(map car (make-document id length term-frequencies)
(sort (frequencies xs) document?
(lambda (a b) (> (cdr a) (cdr b)))))) (id document-id)
(length document-length)
(term-frequencies document-term-frequencies))
(define (search-index ix xs) ;; Document index is a map from document id to a <document> record.
(sort-by-frequency (append-map (cut hash-table-ref/default ix <> '()) (delete-duplicates xs)))) (define document-index (make-hash-table))
(define (search-stem-index tokens) ;; Track number of recipes indexed
(search-index stem-index tokens)) (define num-recipes 0)
(define (search-digram-index tokens) ;; Track total number of tokens (used to compute average doc length)
(search-index digram-index (digrams tokens))) (define num-tokens 0)
(define (search-trigram-index tokens) (define (add-to-document-index! doc-id terms)
(search-index trigram-index (trigrams tokens))) (hash-table-set! document-index doc-id (make-document doc-id (length terms) (term-frequencies terms))))
(define (load-recipe path) (define (index-recipe path)
(let ((name (basename path)) (let ((doc-id (basename path))
(tokens (get-tokens (call-with-input-file path get-string-all)))) (tokens (get-tokens (call-with-input-file path get-string-all))))
(for-each (cut add-to-index! stem-index <> name) (delete-duplicates tokens)) (add-to-document-index! doc-id tokens)
(for-each (cut add-to-index! digram-index <> name) (delete-duplicates (digrams tokens))) (add-to-term-index! doc-id tokens)
(for-each (cut add-to-index! trigram-index <> name) (delete-duplicates (trigrams tokens))))) (set! num-recipes (1+ num-recipes))
(set! num-tokens (+ num-tokens (length tokens)))))
(define (load-recipes dir) (define (index-recipes dir)
(for-each (lambda (filename) (for-each (lambda (filename)
(load-recipe (string-append dir "/" filename))) (index-recipe (string-append dir "/" filename)))
(scandir dir (cut string-suffix? ".txt" <>)))) (scandir dir (cut string-suffix? ".txt" <>))))
(define* (find-recipes search-term #:key (num-results 5)) ;; Compute BM25 score for the given document and search terms
(let ((tokens (get-tokens search-term))) ;;
(when (zero? (length tokens)) ;; This is based on the description of the algorithm in https://emschwartz.me/understanding-the-bm25-full-text-search-algorithm/
(error "Please try a more specific search term")) ;;
(let ((result (search-trigram-index tokens)))
(if (>= (length result) num-results) ;; k is a tuning parameter that controls how quickly the returns to term repetition diminish
(take result num-results) (define k 1.2)
(let ((result (delete-duplicates (append result (search-digram-index tokens)))))
(if (>= (length result) num-results) ;; b is a tuning parameter that controls how much we nomalize by document length
(take result num-results) (define b 0.75)
(let ((result (delete-duplicates (append result (search-stem-index tokens)))))
(take result (min (length result) num-results))))))))) (define (bm25-score doc-id search-terms)
(let* ((doc (hash-table-ref document-index doc-id))
(N num-recipes)
(n (lambda (q) (length (hash-table-ref/default term-index q '()))))
(f (lambda (q) (or (and=> (vhash-assoc q (document-term-frequencies doc)) cdr) 0)))
(|D| (document-length doc))
(avgdl (/ num-tokens N)))
(fold + 0.0 (map (lambda (q)
(* (log (+ 1.0 (/ (+ N (- (n q)) 0.5)
(+ (n q) 0.5))))
(/ (* (f q) (+ k 1.0))
(* (+ (f q) k) (+ 1.0 (- b) (/ (* b |D|) avgdl))))))
search-terms))))
(define* (find-recipes search-str #:key (num-results 5))
(let* ((search-terms (delete-duplicates (get-tokens search-str)))
(doc-ids (retrieve-doc-ids search-terms)))
(when (zero? (length doc-ids))
(error "No results found. Please try a different search term"))
(take
(sort
(map (lambda (doc-id)
(cons doc-id (bm25-score doc-id search-terms)))
doc-ids)
(lambda (x y) (> (cdr x) (cdr y))))
(min num-results (length doc-ids)))))
(define (main args) (define (main args)
(load-recipes "./recipes") (index-recipes "./recipes")
(find-recipes "broccoli and stilton soup")) (find-recipes "broccoli and stilton soup"))