Add core functionality

This commit is contained in:
Ray Miller 2024-08-15 16:45:32 +01:00
parent 6f56fae985
commit dd1078ea66

124
spam-filter/core.scm Normal file
View file

@ -0,0 +1,124 @@
(define-module (spam-filter core)
#:use-module (oop goops)
#:use-module (ice-9 match)
#:use-module ((srfi srfi-1) #:select (delete-duplicates fold any)))
(define (get-words doc)
(delete-duplicates
(map string-downcase
(filter (lambda (s) (< 2 (string-length s) 20))
(string-tokenize doc)))))
(define-class <classifier> ()
(get-features #:init-keyword #:get-features)
(feature-category-count #:init-thunk make-hash-table)
(category-count #:init-form '() #:accessor category-count)
(thresholds #:init-form '() #:accessor thresholds)
(weight #:init-value 1.0 #:init-keyword #:weight #:accessor weight)
(assumed-prob #:init-value 0.5 #:init-keyword #:assumed-prob #:accessor assumed-prob))
(define-method (get-features (c <classifier>) doc)
((slot-ref c 'get-features) doc))
(define-method (set-threshold! (c <classifier>) cat t)
(set! (thresholds c) (assoc-set! (thresholds c) cat t)))
(define-method (get-threshold (c <classifier>) cat)
(or (assoc-ref (thresholds c) cat) 1.0))
(define-method (incf (c <classifier>) feature cat)
"Increase the count of a feature/category pair"
(let* ((fcc (slot-ref c 'feature-category-count))
(cc (hash-ref fcc feature '()))
(n (or (assoc-ref cc cat) 0.0)))
(hash-set! fcc feature (assoc-set! cc cat (1+ n)))))
(define-method (incc (c <classifier>) cat)
"Increase the count of a category"
(let ((n (or (assq-ref (category-count c) cat) 0.0)))
(set! (category-count c) (assq-set! (category-count c) cat (1+ n)))))
(define-method (fcount (c <classifier>) feature cat)
"The number of times a feature has appeared in a category"
(let* ((fcc (slot-ref c 'feature-category-count))
(cc (hash-ref fcc feature '())))
(or (assoc-ref cc cat) 0.0)))
(define-method (catcount (c <classifier>) cat)
"The number of items in a category"
(or (assq-ref (category-count c) cat) 0.0))
(define-method (totalcount (c <classifier>))
(fold + 0.0 (map cdr (category-count c))))
(define-method (categories (c <classifier>))
(map car (category-count c)))
(define-method (train (c <classifier>) doc cat)
(for-each (lambda (feature) (incf c feature cat))
(get-features c doc))
(incc c cat))
(define-method (fprob (c <classifier>) f cat)
(if (zero? (catcount c cat))
0.0
(/ (fcount c f cat)
(catcount c cat))))
(define-method (weighted-prob (c <classifier>) f cat)
(let ((basic-prob (fprob c f cat))
(totals (fold (lambda (cat accum) (+ accum (fcount c f cat))) 0.0 (categories c))))
(/ (+ (* (weight c) (assumed-prob c))
(* totals basic-prob))
(+ (weight c) totals))))
(define-class <naive-bayes> (<classifier>))
(define-method (doc-prob (c <naive-bayes>) doc cat)
(fold (lambda (feature p) (* p (weighted-prob c feature cat)))
1.0
(get-features c doc)))
(define-method (prob (c <naive-bayes>) doc cat)
(let ((cat-prob (/ (catcount c cat) (totalcount c))))
(* cat-prob (doc-prob c doc cat))))
(define-method (classify (c <naive-bayes>) doc)
(define max 0.0)
(define probs '())
(define best #f)
(for-each (lambda (cat)
(let ((p (prob c doc cat)))
(set! probs (assq-set! probs cat p))
(when (> p max)
(set! max p)
(set! best cat))))
(categories c))
;; Make sure the probabilty exceeds threshold*next best
(display probs) (newline)
(if (any (match-lambda ((k . v)
(and (not (eq? k best))
(> (* (get-threshold c best) v) max))))
probs)
#f
best))
(define-method (sample-train (c <classifier>))
(train c "Nobody owns the water." 'ham)
(train c "the quick rabbit jumps fences" 'ham)
(train c "buy pharmaceuticals now" 'spam)
(train c "make quick money at the online casino" 'spam)
(train c "the quick brown fox jumps" 'ham))
(define-method (sample-train-n-times (c <classifier>) n)
(let loop ((n n))
(when (> n 0)
(sample-train c)
(loop (1- n)))))
(define (make-test-classifier)
(let ((c (make <naive-bayes> #:get-features get-words)))
(set-threshold! c 'spam 3.0)
(set-threshold! c 'ham 1.0)
(sample-train c)
c))