diff --git a/spam-filter/core.scm b/spam-filter/core.scm new file mode 100644 index 0000000..ee831d2 --- /dev/null +++ b/spam-filter/core.scm @@ -0,0 +1,124 @@ +(define-module (spam-filter core) + #:use-module (oop goops) + #:use-module (ice-9 match) + #:use-module ((srfi srfi-1) #:select (delete-duplicates fold any))) + +(define (get-words doc) + (delete-duplicates + (map string-downcase + (filter (lambda (s) (< 2 (string-length s) 20)) + (string-tokenize doc))))) + +(define-class () + (get-features #:init-keyword #:get-features) + (feature-category-count #:init-thunk make-hash-table) + (category-count #:init-form '() #:accessor category-count) + (thresholds #:init-form '() #:accessor thresholds) + (weight #:init-value 1.0 #:init-keyword #:weight #:accessor weight) + (assumed-prob #:init-value 0.5 #:init-keyword #:assumed-prob #:accessor assumed-prob)) + +(define-method (get-features (c ) doc) + ((slot-ref c 'get-features) doc)) + +(define-method (set-threshold! (c ) cat t) + (set! (thresholds c) (assoc-set! (thresholds c) cat t))) + +(define-method (get-threshold (c ) cat) + (or (assoc-ref (thresholds c) cat) 1.0)) + +(define-method (incf (c ) feature cat) + "Increase the count of a feature/category pair" + (let* ((fcc (slot-ref c 'feature-category-count)) + (cc (hash-ref fcc feature '())) + (n (or (assoc-ref cc cat) 0.0))) + (hash-set! fcc feature (assoc-set! cc cat (1+ n))))) + +(define-method (incc (c ) cat) + "Increase the count of a category" + (let ((n (or (assq-ref (category-count c) cat) 0.0))) + (set! (category-count c) (assq-set! (category-count c) cat (1+ n))))) + +(define-method (fcount (c ) feature cat) + "The number of times a feature has appeared in a category" + (let* ((fcc (slot-ref c 'feature-category-count)) + (cc (hash-ref fcc feature '()))) + (or (assoc-ref cc cat) 0.0))) + +(define-method (catcount (c ) cat) + "The number of items in a category" + (or (assq-ref (category-count c) cat) 0.0)) + +(define-method (totalcount (c )) + (fold + 0.0 (map cdr (category-count c)))) + +(define-method (categories (c )) + (map car (category-count c))) + +(define-method (train (c ) doc cat) + (for-each (lambda (feature) (incf c feature cat)) + (get-features c doc)) + (incc c cat)) + +(define-method (fprob (c ) f cat) + (if (zero? (catcount c cat)) + 0.0 + (/ (fcount c f cat) + (catcount c cat)))) + +(define-method (weighted-prob (c ) f cat) + (let ((basic-prob (fprob c f cat)) + (totals (fold (lambda (cat accum) (+ accum (fcount c f cat))) 0.0 (categories c)))) + (/ (+ (* (weight c) (assumed-prob c)) + (* totals basic-prob)) + (+ (weight c) totals)))) + +(define-class ()) + +(define-method (doc-prob (c ) doc cat) + (fold (lambda (feature p) (* p (weighted-prob c feature cat))) + 1.0 + (get-features c doc))) + +(define-method (prob (c ) doc cat) + (let ((cat-prob (/ (catcount c cat) (totalcount c)))) + (* cat-prob (doc-prob c doc cat)))) + +(define-method (classify (c ) doc) + (define max 0.0) + (define probs '()) + (define best #f) + (for-each (lambda (cat) + (let ((p (prob c doc cat))) + (set! probs (assq-set! probs cat p)) + (when (> p max) + (set! max p) + (set! best cat)))) + (categories c)) + ;; Make sure the probabilty exceeds threshold*next best + (display probs) (newline) + (if (any (match-lambda ((k . v) + (and (not (eq? k best)) + (> (* (get-threshold c best) v) max)))) + probs) + #f + best)) + +(define-method (sample-train (c )) + (train c "Nobody owns the water." 'ham) + (train c "the quick rabbit jumps fences" 'ham) + (train c "buy pharmaceuticals now" 'spam) + (train c "make quick money at the online casino" 'spam) + (train c "the quick brown fox jumps" 'ham)) + +(define-method (sample-train-n-times (c ) n) + (let loop ((n n)) + (when (> n 0) + (sample-train c) + (loop (1- n))))) + +(define (make-test-classifier) + (let ((c (make #:get-features get-words))) + (set-threshold! c 'spam 3.0) + (set-threshold! c 'ham 1.0) + (sample-train c) + c))