339 lines
16 KiB
Scheme
339 lines
16 KiB
Scheme
(define-module (text-utils)
|
|
#:use-module (srfi srfi-1) ; lists
|
|
#:use-module (srfi srfi-9) ; records
|
|
#:use-module (srfi srfi-26) ; cut/cute
|
|
#:use-module (srfi srfi-69) ; hash tables
|
|
#:use-module (rnrs unicode)
|
|
#:export (remove-diacritics
|
|
porter-stem
|
|
stop-word?))
|
|
|
|
(define (remove-diacritics str)
|
|
;; Helper function for checking if a character is a combining mark
|
|
;; (rnrs unicode) provides `char-general-category` which returns a symbol
|
|
;; representing the character's general category. Combining marks generally
|
|
;; fall under the 'Mn' (Mark, Nonspacing) category.
|
|
(define (char-combining-mark? char)
|
|
(eq? (char-general-category char) 'Mn))
|
|
(let* ((nfd-str (string-normalize-nfd str)) ; Step 1: Normalize to NFD
|
|
(chars (string->list nfd-str))) ; Convert to a list of characters
|
|
(list->string
|
|
(filter (lambda (char)
|
|
(not (char-combining-mark? char))) ; Step 2: Filter out combining marks
|
|
chars))))
|
|
|
|
;; Porter stemming is a popular stemming algorithm used for English.
|
|
;;
|
|
;; For a complete description, see: <https://tartarus.org/martin/PorterStemmer/>
|
|
;;
|
|
;; Algorithm published in Porter, 1980, "An algorithm for suffix stripping,"
|
|
;; _Program_, Vol. 14, no. 3, pp 130-137.
|
|
;;
|
|
;; This implementation by Peter Lane, https://codeberg.org/peterlane/r7rs-libs/src/branch/trunk/robin/text.sld
|
|
;;
|
|
;; An error is raised if the word does not consist entirely of ASCII characters.
|
|
;; This is because the Porter stemmer is designed for stemming English words.
|
|
(define (porter-stem initial-word)
|
|
;
|
|
; Data structure holds the word and information on where we are in
|
|
; processing it.
|
|
;
|
|
; b: array of character strings representing the word we are stemming
|
|
; j: an alternate 'end point' in the array, used, e.g., after searching
|
|
; to mark where a suffix begins
|
|
; k: the end of the array
|
|
(define-record-type <data>
|
|
(new-data b j k)
|
|
data?
|
|
(b b-get)
|
|
(j j-get j-set!)
|
|
(k k-get k-set!))
|
|
;
|
|
; checks if there is a double consonant at position i,
|
|
; with i being the second consonant's index
|
|
(define (double-consonant? data i)
|
|
(and (> i 0)
|
|
(char=? (list-ref (b-get data) i)
|
|
(list-ref (b-get data) (- i 1)))
|
|
(is-consonant? data i)))
|
|
;
|
|
; returns true if the suffix is found at the end of the current
|
|
; set of bytes, with k holding the last index in bytes.
|
|
; sets j to the end point, as if the suffix is removed.
|
|
(define (ends-with? data suffix)
|
|
(let ((suffix-array (string->list suffix))
|
|
(suffix-length (string-length suffix)))
|
|
(if (and (<= suffix-length (k-get data))
|
|
(equal? (take-right (take (b-get data) (+ 1 (k-get data))) suffix-length)
|
|
suffix-array))
|
|
(begin
|
|
(j-set! data (- (k-get data) suffix-length))
|
|
#t)
|
|
#f)))
|
|
;
|
|
; Checks if there is a consonant at position i
|
|
; -- note special handling of 'y', which is treated
|
|
; as a consonant at start of word or after a vowel.
|
|
(define (is-consonant? data i)
|
|
(cond ((member (list-ref (b-get data) i)
|
|
'(#\a #\e #\i #\o #\u))
|
|
#f)
|
|
((char=? (list-ref (b-get data) i)
|
|
#\y)
|
|
(or (= i 0)
|
|
(not (is-consonant? data (- i 1)))))
|
|
(else
|
|
#t)))
|
|
;
|
|
; cvc is true if i-2, i-1, i has form consonant, vowel, consonant
|
|
; (and last must not be w, x, or y)
|
|
(define (is-cvc? data i)
|
|
(cond ((or (< i 2)
|
|
(not (is-consonant? data i))
|
|
(is-consonant? data (- i 1))
|
|
(not (is-consonant? data (- i 2))))
|
|
#f)
|
|
((let ((ch (list-ref (b-get data) i)))
|
|
(or (char=? ch #\w)
|
|
(char=? ch #\x)
|
|
(char=? ch #\y)))
|
|
#f)
|
|
(else
|
|
#t)))
|
|
;
|
|
; Given a set of letter=>[(suffix, replacement)] pairs and a letter to match,
|
|
; applies replacements for the given letter
|
|
(define (make-replacements data letter replacements)
|
|
(for-each (lambda (rule)
|
|
(when (char=? letter (car rule))
|
|
(for-each (lambda (pairing)
|
|
(when (ends-with? data (car pairing))
|
|
(set-to data (cadr pairing) #t)))
|
|
(cdr rule))))
|
|
replacements))
|
|
;
|
|
; counts the number of vowel-consonant sequences between 0 and j
|
|
(define (seq-count data)
|
|
(let ((count 0)
|
|
(i 0))
|
|
; find index of first vowel, by skipping consonants
|
|
(let loop ()
|
|
(when (and (<= i (j-get data))
|
|
(is-consonant? data i))
|
|
(set! i (+ 1 i))
|
|
(loop)))
|
|
; repeat CV search to get full count until all of range is seen
|
|
(let loop ()
|
|
(when (<= i (j-get data))
|
|
; find index of next consonant, by skipping non-consonants
|
|
(let loop-consonant ()
|
|
(when (and (<= i (j-get data))
|
|
(not (is-consonant? data i)))
|
|
(set! i (+ 1 i))
|
|
(loop-consonant)))
|
|
; VC pair found with more to come, so increment count
|
|
(when (<= i (j-get data))
|
|
(set! count (+ 1 count)))
|
|
; find index of next vowel, by skipping consonants
|
|
(let loop-vowel ()
|
|
(when (and (<= i (j-get data))
|
|
(is-consonant? data i))
|
|
(set! i (+ 1 i))
|
|
(loop-vowel)))
|
|
(loop)))
|
|
;
|
|
count))
|
|
;
|
|
; sets positions j+1 to k to the suffix,
|
|
; adjusting k to j+length(suffix) (the new end of the word)
|
|
; -- supports a flag for only allowing change if seq_count > 0
|
|
; this prevents words becoming too small when suffix changed
|
|
(define set-to
|
|
(case-lambda
|
|
((data suffix)
|
|
(set-to data suffix #f))
|
|
((data suffix restrict)
|
|
(when (or (not restrict)
|
|
(> (seq-count data) 0))
|
|
(for-each (lambda (index value)
|
|
(list-set! (b-get data) (+ 1 (j-get data) index) value))
|
|
(iota (string-length suffix))
|
|
(string->list suffix))
|
|
(k-set! data (+ (j-get data)
|
|
(string-length suffix)))))))
|
|
;
|
|
; looks for a vowel in range 0 to j in bytes
|
|
(define (vowel-in-stem? data)
|
|
(any (lambda (i) (not (is-consonant? data i)))
|
|
(iota (+ 1 (j-get data)))))
|
|
;
|
|
(define STEP-2-RULES (list (cons #\a '(("ational" "ate") ("tional" "tion")))
|
|
(cons #\c '(("enci" "ence") ("anci" "ance")))
|
|
(cons #\e '(("izer" "ize")))
|
|
(cons #\g '(("logi" "log")))
|
|
(cons #\l '(("bli" "ble") ("alli" "al") ("entli" "ent") ("eli" "e") ("ousli" "ous")))
|
|
(cons #\o '(("ization" "ize") ("ation" "ate") ("ator" "ate")))
|
|
(cons #\s '(("alism" "al") ("iveness" "ive") ("fulness" "ful") ("ousness" "ous")))
|
|
(cons #\t '(("aliti" "al") ("iviti" "ive") ("biliti" "ble")))))
|
|
;
|
|
(define STEP-3-RULES (list (cons #\e '(("icate" "ic") ("ative" "") ("alize" "al")))
|
|
(cons #\i '(("iciti" "ic")))
|
|
(cons #\l '(("ical" "ic") ("ful" "")))
|
|
(cons #\s '(("ness" "")))))
|
|
;
|
|
; removes plurals and -ed -ing endings
|
|
(define (step-1ab data)
|
|
(when (char=? (list-ref (b-get data) (k-get data)) #\s)
|
|
(cond ((ends-with? data "sses")
|
|
(k-set! data (- (k-get data) 2)))
|
|
((ends-with? data "ies")
|
|
(set-to data "i"))
|
|
((not (char=? (list-ref (b-get data) (- (k-get data) 1))
|
|
#\s))
|
|
(k-set! data (- (k-get data) 1)))))
|
|
(if (ends-with? data "eed")
|
|
(when (> (seq-count data) 0)
|
|
(k-set! data (- (k-get data) 1)))
|
|
(when (and (or (ends-with? data "ed")
|
|
(ends-with? data "ing"))
|
|
(vowel-in-stem? data))
|
|
(k-set! data (j-get data))
|
|
(cond ((ends-with? data "at")
|
|
(set-to data "ate"))
|
|
((ends-with? data "bl")
|
|
(set-to data "ble"))
|
|
((ends-with? data "iz")
|
|
(set-to data "ize"))
|
|
((double-consonant? data (k-get data))
|
|
(k-set! data (- (k-get data) 1))
|
|
(let ((ch (list-ref (b-get data) (k-get data))))
|
|
(when (or (char=? #\l ch)
|
|
(char=? #\s ch)
|
|
(char=? #\z ch))
|
|
(k-set! data (+ (k-get data) 1)))))
|
|
((and (= 1 (seq-count data))
|
|
(is-cvc? data (k-get data)))
|
|
(set-to data "e"))))))
|
|
;
|
|
; turns terminal 'y' to 'i' when there is another vowel in stem
|
|
(define (step-1c data)
|
|
(when (and (ends-with? data "y")
|
|
(vowel-in-stem? data))
|
|
(list-set! (b-get data) (k-get data) #\i)))
|
|
;
|
|
; double suffixes are mapped to single (shorter) ones
|
|
(define (step-2 data)
|
|
(make-replacements data
|
|
(list-ref (b-get data) (- (k-get data) 1))
|
|
STEP-2-RULES))
|
|
;
|
|
; deals with -ic -full -ness
|
|
(define (step-3 data)
|
|
(make-replacements data
|
|
(list-ref (b-get data) (k-get data))
|
|
STEP-3-RULES))
|
|
;
|
|
; removes -ant -ence when seq-count is 2
|
|
(define (step-4 data)
|
|
(call-with-current-continuation
|
|
(lambda (return)
|
|
(let ((letter (list-ref (b-get data) (- (k-get data) 1)))) ; penultimate letter
|
|
(cond ((char=? letter #\a)
|
|
(when (not (ends-with? data "al"))
|
|
(return)))
|
|
((char=? letter #\c)
|
|
(when (and (not (ends-with? data "ance"))
|
|
(not (ends-with? data "ence")))
|
|
(return)))
|
|
((char=? letter #\e)
|
|
(when (not (ends-with? data "er"))
|
|
(return)))
|
|
((char=? letter #\i)
|
|
(when (not (ends-with? data "ic"))
|
|
(return)))
|
|
((char=? letter #\l)
|
|
(when (and (not (ends-with? data "able"))
|
|
(not (ends-with? data "ible")))
|
|
(return)))
|
|
((char=? letter #\n)
|
|
(when (and (not (ends-with? data "ant"))
|
|
(not (ends-with? data "ement"))
|
|
(not (ends-with? data "ment"))
|
|
(not (ends-with? data "ent")))
|
|
(return)))
|
|
((char=? letter #\o)
|
|
(if (and (ends-with? data "ion")
|
|
(or (char=? (list-ref (b-get data) (j-get data)) #\s)
|
|
(char=? (list-ref (b-get data) (j-get data)) #\t)))
|
|
'() ; break
|
|
(when (not (ends-with? data "ou"))
|
|
(return))))
|
|
((char=? letter #\s)
|
|
(when (not (ends-with? data "ism"))
|
|
(return)))
|
|
((char=? letter #\t)
|
|
(when (and (not (ends-with? data "ate"))
|
|
(not (ends-with? data "iti")))
|
|
(return)))
|
|
((char=? letter #\u)
|
|
(when (not (ends-with? data "ous"))
|
|
(return)))
|
|
((char=? letter #\v)
|
|
(when (not (ends-with? data "ive"))
|
|
(return)))
|
|
((char=? letter #\z)
|
|
(when (not (ends-with? data "ize"))
|
|
(return)))
|
|
(else
|
|
(return)))
|
|
(when (> (seq-count data) 1)
|
|
(k-set! data (j-get data)))))))
|
|
;
|
|
; if seq-count > 1, removes a final -e and changes -ll to -l
|
|
(define (step-5 data)
|
|
(j-set! data (k-get data))
|
|
; remove final -e
|
|
(when (char=? (list-ref (b-get data) (k-get data))
|
|
#\e)
|
|
(let ((count (seq-count data)))
|
|
(when (or (> count 1)
|
|
(and (= count 1)
|
|
(not (is-cvc? data (- (k-get data) 1)))))
|
|
(k-set! data (- (k-get data) 1)))))
|
|
; -ll to -l
|
|
(when (and (char=? (list-ref (b-get data) (k-get data))
|
|
#\l)
|
|
(double-consonant? data (k-get data))
|
|
(> (seq-count data) 1))
|
|
(k-set! data (- (k-get data) 1))))
|
|
;
|
|
; - main part of function
|
|
;
|
|
(let ((word (string-downcase initial-word))) ; TODO - check ASCII
|
|
(if (< (string-length word) 3) ; do not process short words
|
|
word
|
|
(let ((data (new-data (string->list word) 0 (- (string-length word) 1))))
|
|
(step-1ab data)
|
|
(when (> (k-get data) 0)
|
|
(step-1c data)
|
|
(step-2 data)
|
|
(step-3 data)
|
|
(step-4 data)
|
|
(step-5 data))
|
|
(list->string (take (b-get data) (+ 1 (k-get data))))))))
|
|
|
|
(define stop-word?
|
|
(let ((h (make-hash-table)))
|
|
(for-each (cut hash-table-set! h <> #t)
|
|
(list "i" "me" "my" "myself" "we" "our" "ours" "ourselves" "you" "your" "yours" "yourself" "yourselves"
|
|
"he" "him" "his" "himself" "she" "her" "hers" "herself" "it" "its" "itself" "they" "them" "their"
|
|
"theirs" "themselves" "what" "which" "who" "whom" "this" "that" "these" "those" "am" "is" "are"
|
|
"was" "were" "be" "been" "being" "have" "has" "had" "having" "do" "does" "did" "doing" "a" "an"
|
|
"the" "and" "but" "if" "or" "because" "as" "until" "while" "of" "at" "by" "for" "with" "about"
|
|
"against" "between" "into" "through" "during" "before" "after" "above" "below" "to" "from" "up"
|
|
"down" "in" "out" "on" "off" "over" "under" "again" "further" "then" "once" "here" "there"
|
|
"when" "where" "why" "how" "all" "any" "both" "each" "few" "more" "most" "other" "some" "such"
|
|
"no" "nor" "not" "only" "own" "same" "so" "than" "too" "very" "s" "t" "d" "can" "will" "just"
|
|
"don" "should" "now"))
|
|
(lambda (w)
|
|
(hash-table-ref/default h w #f))))
|