From 55b2804f0155e90790aa24c23865f60519bf7148 Mon Sep 17 00:00:00 2001 From: Ray Miller Date: Thu, 12 Sep 2024 15:51:12 +0100 Subject: [PATCH 1/2] Implement fisher classifier. --- spam-filter/core.scm | 46 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/spam-filter/core.scm b/spam-filter/core.scm index ee831d2..4f672c4 100644 --- a/spam-filter/core.scm +++ b/spam-filter/core.scm @@ -65,8 +65,8 @@ (/ (fcount c f cat) (catcount c cat)))) -(define-method (weighted-prob (c ) f cat) - (let ((basic-prob (fprob c f cat)) +(define-method (weighted-prob (c ) f cat prf) + (let ((basic-prob (prf c f cat)) (totals (fold (lambda (cat accum) (+ accum (fcount c f cat))) 0.0 (categories c)))) (/ (+ (* (weight c) (assumed-prob c)) (* totals basic-prob)) @@ -75,7 +75,7 @@ (define-class ()) (define-method (doc-prob (c ) doc cat) - (fold (lambda (feature p) (* p (weighted-prob c feature cat))) + (fold (lambda (feature p) (* p (weighted-prob c feature cat fprob))) 1.0 (get-features c doc))) @@ -116,9 +116,47 @@ (sample-train c) (loop (1- n))))) -(define (make-test-classifier) +(define (make-test-naive-bayes-classifier) (let ((c (make #:get-features get-words))) (set-threshold! c 'spam 3.0) (set-threshold! c 'ham 1.0) (sample-train c) c)) + +(define-class ()) + +(define-method (cprob (c ) f cat) + (let ((clf (fprob c f cat))) + (if (zero? clf) + 0 + (let ((freqsum (fold (lambda (cat accum) + (+ accum (fprob c f cat))) + 0.0 + (categories c)))) + (/ clf freqsum))))) + +(define (inv-chi chi df) + (let* ((m (/ chi 2.0)) + (sum (exp (- m))) + (term sum)) + (for-each (lambda (i) + (set! term (* term (/ m i))) + (set! sum (+ sum term))) + (iota (floor (/ df 2.0)) 1)) + (min sum 1.0))) + +(define-method (fisher-prob (c ) doc cat) + (define features (get-features c doc)) + (define p (fold (lambda (f accum) + (* accum (weighted-prob c f cat cprob))) + 1.0 + features)) + (define fscore (* -2.0 (log p))) + (inv-chi fscore (* 2 (length features)))) + +(define (make-test-fisher-classifier) + (let ((c (make #:get-features get-words))) + (set-threshold! c 'spam 3.0) + (set-threshold! c 'ham 1.0) + (sample-train c) + c)) From 3c5cb254aa055ffe2040ecd67df52a1b0079c2f4 Mon Sep 17 00:00:00 2001 From: Ray Miller Date: Thu, 12 Sep 2024 17:38:00 +0100 Subject: [PATCH 2/2] Simplify package build. --- HACKING | 54 ----------------------------------------- guix.scm | 16 +++--------- hall.scm | 35 -------------------------- spam-filter/hconfig.scm | 35 -------------------------- 4 files changed, 4 insertions(+), 136 deletions(-) delete mode 100644 HACKING delete mode 100644 hall.scm delete mode 100644 spam-filter/hconfig.scm diff --git a/HACKING b/HACKING deleted file mode 100644 index 4470c0d..0000000 --- a/HACKING +++ /dev/null @@ -1,54 +0,0 @@ -# -*- mode: org; coding: utf-8; -*- - -#+TITLE: Hacking spam-filter - -* Contributing - -By far the easiest way to hack on spam-filter is to develop using Guix: - -#+BEGIN_SRC bash - # Obtain the source code - cd /path/to/source-code - guix shell -Df guix.scm - # In the new shell, run: - hall build --execute && autoreconf -vif && ./configure && make check -#+END_SRC - -You may also want to set your directory as an authorized directory for -`guix shell' so it works without arguments. To do that, simply run - -#+BEGIN_SRC bash - echo $(pwd) >> $HOME/.config/guix/shell-authorized-directories -#+END_SRC - -You can now hack this project's files to your heart's content, whilst -testing them from your `guix shell' shell. - -To try out any scripts in the project you can now use - -#+BEGIN_SRC bash - ./pre-inst-env scripts/${script-name} -#+END_SRC - -If you'd like to tidy the project again, but retain the ability to test the -project from the commandline, simply run: - -#+BEGIN_SRC bash - ./hall clean --skip "scripts/${script-name},pre-inst-env" --execute -#+END_SRC - -** Manual Installation - -If you do not yet use Guix, you will have to install this project's -dependencies manually: - - autoconf - - automake - - pkg-config - - texinfo - - guile-hall - -Once those dependencies are installed you can run: - -#+BEGIN_SRC bash - hall build -x && autoreconf -vif && ./configure && make check -#+END_SRC diff --git a/guix.scm b/guix.scm index 19bc390..8e495ab 100644 --- a/guix.scm +++ b/guix.scm @@ -1,12 +1,7 @@ (use-modules (gnu packages) - (gnu packages autotools) (gnu packages guile) - (gnu packages guile-xyz) - (gnu packages pkg-config) - (gnu packages texinfo) - (guix build-system gnu) - (guix download) + (guix build-system guile) (guix gexp) ((guix licenses) #:prefix license:) (guix packages) @@ -14,7 +9,7 @@ (package (name "guile-spam-filter") - (version "0.1") + (version "0.1.0") (source (local-file (dirname (current-filename)) @@ -25,12 +20,9 @@ (not (any (lambda (my-string) (string-contains file my-string)) (list ".git" ".dir-locals.el" "guix.scm")))))) - (build-system gnu-build-system) - (arguments `()) + (build-system guile-build-system) (native-inputs - (list autoconf automake pkg-config texinfo)) - (inputs (list guile-3.0)) - (propagated-inputs (list)) + (list guile-3.0)) (synopsis "") (description "") (home-page "") diff --git a/hall.scm b/hall.scm deleted file mode 100644 index e1ddd9a..0000000 --- a/hall.scm +++ /dev/null @@ -1,35 +0,0 @@ -(hall-description - (name "spam-filter") - (prefix "guile") - (version "0.1") - (author "Ray Miller") - (email "ray@1729.org.uk") - (copyright (2024)) - (synopsis "") - (description "") - (home-page "") - (license gpl3+) - (dependencies `()) - (skip ()) - (features - ((guix #f) - (use-guix-specs-for-dependencies #f) - (native-language-support #f) - (licensing #f))) - (files (libraries - ((scheme-file "spam-filter") - (directory - "spam-filter" - ((scheme-file "hconfig"))))) - (tests ((directory "tests" ()))) - (programs ((directory "scripts" ()))) - (documentation - ((org-file "README") - (symlink "README" "README.org") - (text-file "HACKING") - (text-file "COPYING") - (directory "doc" ((texi-file "spam-filter"))))) - (infrastructure - ((scheme-file "guix") - (text-file ".gitignore") - (scheme-file "hall"))))) diff --git a/spam-filter/hconfig.scm b/spam-filter/hconfig.scm deleted file mode 100644 index 0c5842f..0000000 --- a/spam-filter/hconfig.scm +++ /dev/null @@ -1,35 +0,0 @@ -(define-module - (spam-filter hconfig) - #:use-module - (srfi srfi-26) - #:export - (%version - %author - %license - %copyright - %gettext-domain - G_ - N_ - init-nls - init-locale)) - -(define %version "0.1") - -(define %author "Ray Miller") - -(define %license 'gpl3+) - -(define %copyright '(2024)) - -(define %gettext-domain "guile-spam-filter") - -(define G_ identity) - -(define N_ identity) - -(define (init-nls) "Dummy as no NLS is used" #t) - -(define (init-locale) - "Dummy as no NLS is used" - #t) -