Add core functionality
This commit is contained in:
parent
6f56fae985
commit
dd1078ea66
1 changed files with 124 additions and 0 deletions
124
spam-filter/core.scm
Normal file
124
spam-filter/core.scm
Normal file
|
@ -0,0 +1,124 @@
|
|||
(define-module (spam-filter core)
|
||||
#:use-module (oop goops)
|
||||
#:use-module (ice-9 match)
|
||||
#:use-module ((srfi srfi-1) #:select (delete-duplicates fold any)))
|
||||
|
||||
(define (get-words doc)
|
||||
(delete-duplicates
|
||||
(map string-downcase
|
||||
(filter (lambda (s) (< 2 (string-length s) 20))
|
||||
(string-tokenize doc)))))
|
||||
|
||||
(define-class <classifier> ()
|
||||
(get-features #:init-keyword #:get-features)
|
||||
(feature-category-count #:init-thunk make-hash-table)
|
||||
(category-count #:init-form '() #:accessor category-count)
|
||||
(thresholds #:init-form '() #:accessor thresholds)
|
||||
(weight #:init-value 1.0 #:init-keyword #:weight #:accessor weight)
|
||||
(assumed-prob #:init-value 0.5 #:init-keyword #:assumed-prob #:accessor assumed-prob))
|
||||
|
||||
(define-method (get-features (c <classifier>) doc)
|
||||
((slot-ref c 'get-features) doc))
|
||||
|
||||
(define-method (set-threshold! (c <classifier>) cat t)
|
||||
(set! (thresholds c) (assoc-set! (thresholds c) cat t)))
|
||||
|
||||
(define-method (get-threshold (c <classifier>) cat)
|
||||
(or (assoc-ref (thresholds c) cat) 1.0))
|
||||
|
||||
(define-method (incf (c <classifier>) feature cat)
|
||||
"Increase the count of a feature/category pair"
|
||||
(let* ((fcc (slot-ref c 'feature-category-count))
|
||||
(cc (hash-ref fcc feature '()))
|
||||
(n (or (assoc-ref cc cat) 0.0)))
|
||||
(hash-set! fcc feature (assoc-set! cc cat (1+ n)))))
|
||||
|
||||
(define-method (incc (c <classifier>) cat)
|
||||
"Increase the count of a category"
|
||||
(let ((n (or (assq-ref (category-count c) cat) 0.0)))
|
||||
(set! (category-count c) (assq-set! (category-count c) cat (1+ n)))))
|
||||
|
||||
(define-method (fcount (c <classifier>) feature cat)
|
||||
"The number of times a feature has appeared in a category"
|
||||
(let* ((fcc (slot-ref c 'feature-category-count))
|
||||
(cc (hash-ref fcc feature '())))
|
||||
(or (assoc-ref cc cat) 0.0)))
|
||||
|
||||
(define-method (catcount (c <classifier>) cat)
|
||||
"The number of items in a category"
|
||||
(or (assq-ref (category-count c) cat) 0.0))
|
||||
|
||||
(define-method (totalcount (c <classifier>))
|
||||
(fold + 0.0 (map cdr (category-count c))))
|
||||
|
||||
(define-method (categories (c <classifier>))
|
||||
(map car (category-count c)))
|
||||
|
||||
(define-method (train (c <classifier>) doc cat)
|
||||
(for-each (lambda (feature) (incf c feature cat))
|
||||
(get-features c doc))
|
||||
(incc c cat))
|
||||
|
||||
(define-method (fprob (c <classifier>) f cat)
|
||||
(if (zero? (catcount c cat))
|
||||
0.0
|
||||
(/ (fcount c f cat)
|
||||
(catcount c cat))))
|
||||
|
||||
(define-method (weighted-prob (c <classifier>) f cat)
|
||||
(let ((basic-prob (fprob c f cat))
|
||||
(totals (fold (lambda (cat accum) (+ accum (fcount c f cat))) 0.0 (categories c))))
|
||||
(/ (+ (* (weight c) (assumed-prob c))
|
||||
(* totals basic-prob))
|
||||
(+ (weight c) totals))))
|
||||
|
||||
(define-class <naive-bayes> (<classifier>))
|
||||
|
||||
(define-method (doc-prob (c <naive-bayes>) doc cat)
|
||||
(fold (lambda (feature p) (* p (weighted-prob c feature cat)))
|
||||
1.0
|
||||
(get-features c doc)))
|
||||
|
||||
(define-method (prob (c <naive-bayes>) doc cat)
|
||||
(let ((cat-prob (/ (catcount c cat) (totalcount c))))
|
||||
(* cat-prob (doc-prob c doc cat))))
|
||||
|
||||
(define-method (classify (c <naive-bayes>) doc)
|
||||
(define max 0.0)
|
||||
(define probs '())
|
||||
(define best #f)
|
||||
(for-each (lambda (cat)
|
||||
(let ((p (prob c doc cat)))
|
||||
(set! probs (assq-set! probs cat p))
|
||||
(when (> p max)
|
||||
(set! max p)
|
||||
(set! best cat))))
|
||||
(categories c))
|
||||
;; Make sure the probabilty exceeds threshold*next best
|
||||
(display probs) (newline)
|
||||
(if (any (match-lambda ((k . v)
|
||||
(and (not (eq? k best))
|
||||
(> (* (get-threshold c best) v) max))))
|
||||
probs)
|
||||
#f
|
||||
best))
|
||||
|
||||
(define-method (sample-train (c <classifier>))
|
||||
(train c "Nobody owns the water." 'ham)
|
||||
(train c "the quick rabbit jumps fences" 'ham)
|
||||
(train c "buy pharmaceuticals now" 'spam)
|
||||
(train c "make quick money at the online casino" 'spam)
|
||||
(train c "the quick brown fox jumps" 'ham))
|
||||
|
||||
(define-method (sample-train-n-times (c <classifier>) n)
|
||||
(let loop ((n n))
|
||||
(when (> n 0)
|
||||
(sample-train c)
|
||||
(loop (1- n)))))
|
||||
|
||||
(define (make-test-classifier)
|
||||
(let ((c (make <naive-bayes> #:get-features get-words)))
|
||||
(set-threshold! c 'spam 3.0)
|
||||
(set-threshold! c 'ham 1.0)
|
||||
(sample-train c)
|
||||
c))
|
Loading…
Reference in a new issue