From 55b2804f0155e90790aa24c23865f60519bf7148 Mon Sep 17 00:00:00 2001 From: Ray Miller Date: Thu, 12 Sep 2024 15:51:12 +0100 Subject: [PATCH] Implement fisher classifier. --- spam-filter/core.scm | 46 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/spam-filter/core.scm b/spam-filter/core.scm index ee831d2..4f672c4 100644 --- a/spam-filter/core.scm +++ b/spam-filter/core.scm @@ -65,8 +65,8 @@ (/ (fcount c f cat) (catcount c cat)))) -(define-method (weighted-prob (c ) f cat) - (let ((basic-prob (fprob c f cat)) +(define-method (weighted-prob (c ) f cat prf) + (let ((basic-prob (prf c f cat)) (totals (fold (lambda (cat accum) (+ accum (fcount c f cat))) 0.0 (categories c)))) (/ (+ (* (weight c) (assumed-prob c)) (* totals basic-prob)) @@ -75,7 +75,7 @@ (define-class ()) (define-method (doc-prob (c ) doc cat) - (fold (lambda (feature p) (* p (weighted-prob c feature cat))) + (fold (lambda (feature p) (* p (weighted-prob c feature cat fprob))) 1.0 (get-features c doc))) @@ -116,9 +116,47 @@ (sample-train c) (loop (1- n))))) -(define (make-test-classifier) +(define (make-test-naive-bayes-classifier) (let ((c (make #:get-features get-words))) (set-threshold! c 'spam 3.0) (set-threshold! c 'ham 1.0) (sample-train c) c)) + +(define-class ()) + +(define-method (cprob (c ) f cat) + (let ((clf (fprob c f cat))) + (if (zero? clf) + 0 + (let ((freqsum (fold (lambda (cat accum) + (+ accum (fprob c f cat))) + 0.0 + (categories c)))) + (/ clf freqsum))))) + +(define (inv-chi chi df) + (let* ((m (/ chi 2.0)) + (sum (exp (- m))) + (term sum)) + (for-each (lambda (i) + (set! term (* term (/ m i))) + (set! sum (+ sum term))) + (iota (floor (/ df 2.0)) 1)) + (min sum 1.0))) + +(define-method (fisher-prob (c ) doc cat) + (define features (get-features c doc)) + (define p (fold (lambda (f accum) + (* accum (weighted-prob c f cat cprob))) + 1.0 + features)) + (define fscore (* -2.0 (log p))) + (inv-chi fscore (* 2 (length features)))) + +(define (make-test-fisher-classifier) + (let ((c (make #:get-features get-words))) + (set-threshold! c 'spam 3.0) + (set-threshold! c 'ham 1.0) + (sample-train c) + c))