recipe-search-tech-test/text-utils.scm
2025-07-01 18:01:08 +01:00

339 lines
16 KiB
Scheme

(define-module (text-utils)
#:use-module (srfi srfi-1) ; lists
#:use-module (srfi srfi-9) ; records
#:use-module (srfi srfi-26) ; cut/cute
#:use-module (srfi srfi-69) ; hash tables
#:use-module (rnrs unicode)
#:export (remove-diacritics
porter-stem
stop-word?))
(define (remove-diacritics str)
;; Helper function for checking if a character is a combining mark
;; (rnrs unicode) provides `char-general-category` which returns a symbol
;; representing the character's general category. Combining marks generally
;; fall under the 'Mn' (Mark, Nonspacing) category.
(define (char-combining-mark? char)
(eq? (char-general-category char) 'Mn))
(let* ((nfd-str (string-normalize-nfd str)) ; Step 1: Normalize to NFD
(chars (string->list nfd-str))) ; Convert to a list of characters
(list->string
(filter (lambda (char)
(not (char-combining-mark? char))) ; Step 2: Filter out combining marks
chars))))
;; Porter stemming is a popular stemming algorithm used for English.
;;
;; For a complete description, see: <https://tartarus.org/martin/PorterStemmer/>
;;
;; Algorithm published in Porter, 1980, "An algorithm for suffix stripping,"
;; _Program_, Vol. 14, no. 3, pp 130-137.
;;
;; This implementation by Peter Lane, https://codeberg.org/peterlane/r7rs-libs/src/branch/trunk/robin/text.sld
;;
;; An error is raised if the word does not consist entirely of ASCII characters.
;; This is because the Porter stemmer is designed for stemming English words.
(define (porter-stem initial-word)
;
; Data structure holds the word and information on where we are in
; processing it.
;
; b: array of character strings representing the word we are stemming
; j: an alternate 'end point' in the array, used, e.g., after searching
; to mark where a suffix begins
; k: the end of the array
(define-record-type <data>
(new-data b j k)
data?
(b b-get)
(j j-get j-set!)
(k k-get k-set!))
;
; checks if there is a double consonant at position i,
; with i being the second consonant's index
(define (double-consonant? data i)
(and (> i 0)
(char=? (list-ref (b-get data) i)
(list-ref (b-get data) (- i 1)))
(is-consonant? data i)))
;
; returns true if the suffix is found at the end of the current
; set of bytes, with k holding the last index in bytes.
; sets j to the end point, as if the suffix is removed.
(define (ends-with? data suffix)
(let ((suffix-array (string->list suffix))
(suffix-length (string-length suffix)))
(if (and (<= suffix-length (k-get data))
(equal? (take-right (take (b-get data) (+ 1 (k-get data))) suffix-length)
suffix-array))
(begin
(j-set! data (- (k-get data) suffix-length))
#t)
#f)))
;
; Checks if there is a consonant at position i
; -- note special handling of 'y', which is treated
; as a consonant at start of word or after a vowel.
(define (is-consonant? data i)
(cond ((member (list-ref (b-get data) i)
'(#\a #\e #\i #\o #\u))
#f)
((char=? (list-ref (b-get data) i)
#\y)
(or (= i 0)
(not (is-consonant? data (- i 1)))))
(else
#t)))
;
; cvc is true if i-2, i-1, i has form consonant, vowel, consonant
; (and last must not be w, x, or y)
(define (is-cvc? data i)
(cond ((or (< i 2)
(not (is-consonant? data i))
(is-consonant? data (- i 1))
(not (is-consonant? data (- i 2))))
#f)
((let ((ch (list-ref (b-get data) i)))
(or (char=? ch #\w)
(char=? ch #\x)
(char=? ch #\y)))
#f)
(else
#t)))
;
; Given a set of letter=>[(suffix, replacement)] pairs and a letter to match,
; applies replacements for the given letter
(define (make-replacements data letter replacements)
(for-each (lambda (rule)
(when (char=? letter (car rule))
(for-each (lambda (pairing)
(when (ends-with? data (car pairing))
(set-to data (cadr pairing) #t)))
(cdr rule))))
replacements))
;
; counts the number of vowel-consonant sequences between 0 and j
(define (seq-count data)
(let ((count 0)
(i 0))
; find index of first vowel, by skipping consonants
(let loop ()
(when (and (<= i (j-get data))
(is-consonant? data i))
(set! i (+ 1 i))
(loop)))
; repeat CV search to get full count until all of range is seen
(let loop ()
(when (<= i (j-get data))
; find index of next consonant, by skipping non-consonants
(let loop-consonant ()
(when (and (<= i (j-get data))
(not (is-consonant? data i)))
(set! i (+ 1 i))
(loop-consonant)))
; VC pair found with more to come, so increment count
(when (<= i (j-get data))
(set! count (+ 1 count)))
; find index of next vowel, by skipping consonants
(let loop-vowel ()
(when (and (<= i (j-get data))
(is-consonant? data i))
(set! i (+ 1 i))
(loop-vowel)))
(loop)))
;
count))
;
; sets positions j+1 to k to the suffix,
; adjusting k to j+length(suffix) (the new end of the word)
; -- supports a flag for only allowing change if seq_count > 0
; this prevents words becoming too small when suffix changed
(define set-to
(case-lambda
((data suffix)
(set-to data suffix #f))
((data suffix restrict)
(when (or (not restrict)
(> (seq-count data) 0))
(for-each (lambda (index value)
(list-set! (b-get data) (+ 1 (j-get data) index) value))
(iota (string-length suffix))
(string->list suffix))
(k-set! data (+ (j-get data)
(string-length suffix)))))))
;
; looks for a vowel in range 0 to j in bytes
(define (vowel-in-stem? data)
(any (lambda (i) (not (is-consonant? data i)))
(iota (+ 1 (j-get data)))))
;
(define STEP-2-RULES (list (cons #\a '(("ational" "ate") ("tional" "tion")))
(cons #\c '(("enci" "ence") ("anci" "ance")))
(cons #\e '(("izer" "ize")))
(cons #\g '(("logi" "log")))
(cons #\l '(("bli" "ble") ("alli" "al") ("entli" "ent") ("eli" "e") ("ousli" "ous")))
(cons #\o '(("ization" "ize") ("ation" "ate") ("ator" "ate")))
(cons #\s '(("alism" "al") ("iveness" "ive") ("fulness" "ful") ("ousness" "ous")))
(cons #\t '(("aliti" "al") ("iviti" "ive") ("biliti" "ble")))))
;
(define STEP-3-RULES (list (cons #\e '(("icate" "ic") ("ative" "") ("alize" "al")))
(cons #\i '(("iciti" "ic")))
(cons #\l '(("ical" "ic") ("ful" "")))
(cons #\s '(("ness" "")))))
;
; removes plurals and -ed -ing endings
(define (step-1ab data)
(when (char=? (list-ref (b-get data) (k-get data)) #\s)
(cond ((ends-with? data "sses")
(k-set! data (- (k-get data) 2)))
((ends-with? data "ies")
(set-to data "i"))
((not (char=? (list-ref (b-get data) (- (k-get data) 1))
#\s))
(k-set! data (- (k-get data) 1)))))
(if (ends-with? data "eed")
(when (> (seq-count data) 0)
(k-set! data (- (k-get data) 1)))
(when (and (or (ends-with? data "ed")
(ends-with? data "ing"))
(vowel-in-stem? data))
(k-set! data (j-get data))
(cond ((ends-with? data "at")
(set-to data "ate"))
((ends-with? data "bl")
(set-to data "ble"))
((ends-with? data "iz")
(set-to data "ize"))
((double-consonant? data (k-get data))
(k-set! data (- (k-get data) 1))
(let ((ch (list-ref (b-get data) (k-get data))))
(when (or (char=? #\l ch)
(char=? #\s ch)
(char=? #\z ch))
(k-set! data (+ (k-get data) 1)))))
((and (= 1 (seq-count data))
(is-cvc? data (k-get data)))
(set-to data "e"))))))
;
; turns terminal 'y' to 'i' when there is another vowel in stem
(define (step-1c data)
(when (and (ends-with? data "y")
(vowel-in-stem? data))
(list-set! (b-get data) (k-get data) #\i)))
;
; double suffixes are mapped to single (shorter) ones
(define (step-2 data)
(make-replacements data
(list-ref (b-get data) (- (k-get data) 1))
STEP-2-RULES))
;
; deals with -ic -full -ness
(define (step-3 data)
(make-replacements data
(list-ref (b-get data) (k-get data))
STEP-3-RULES))
;
; removes -ant -ence when seq-count is 2
(define (step-4 data)
(call-with-current-continuation
(lambda (return)
(let ((letter (list-ref (b-get data) (- (k-get data) 1)))) ; penultimate letter
(cond ((char=? letter #\a)
(when (not (ends-with? data "al"))
(return)))
((char=? letter #\c)
(when (and (not (ends-with? data "ance"))
(not (ends-with? data "ence")))
(return)))
((char=? letter #\e)
(when (not (ends-with? data "er"))
(return)))
((char=? letter #\i)
(when (not (ends-with? data "ic"))
(return)))
((char=? letter #\l)
(when (and (not (ends-with? data "able"))
(not (ends-with? data "ible")))
(return)))
((char=? letter #\n)
(when (and (not (ends-with? data "ant"))
(not (ends-with? data "ement"))
(not (ends-with? data "ment"))
(not (ends-with? data "ent")))
(return)))
((char=? letter #\o)
(if (and (ends-with? data "ion")
(or (char=? (list-ref (b-get data) (j-get data)) #\s)
(char=? (list-ref (b-get data) (j-get data)) #\t)))
'() ; break
(when (not (ends-with? data "ou"))
(return))))
((char=? letter #\s)
(when (not (ends-with? data "ism"))
(return)))
((char=? letter #\t)
(when (and (not (ends-with? data "ate"))
(not (ends-with? data "iti")))
(return)))
((char=? letter #\u)
(when (not (ends-with? data "ous"))
(return)))
((char=? letter #\v)
(when (not (ends-with? data "ive"))
(return)))
((char=? letter #\z)
(when (not (ends-with? data "ize"))
(return)))
(else
(return)))
(when (> (seq-count data) 1)
(k-set! data (j-get data)))))))
;
; if seq-count > 1, removes a final -e and changes -ll to -l
(define (step-5 data)
(j-set! data (k-get data))
; remove final -e
(when (char=? (list-ref (b-get data) (k-get data))
#\e)
(let ((count (seq-count data)))
(when (or (> count 1)
(and (= count 1)
(not (is-cvc? data (- (k-get data) 1)))))
(k-set! data (- (k-get data) 1)))))
; -ll to -l
(when (and (char=? (list-ref (b-get data) (k-get data))
#\l)
(double-consonant? data (k-get data))
(> (seq-count data) 1))
(k-set! data (- (k-get data) 1))))
;
; - main part of function
;
(let ((word (string-downcase initial-word))) ; TODO - check ASCII
(if (< (string-length word) 3) ; do not process short words
word
(let ((data (new-data (string->list word) 0 (- (string-length word) 1))))
(step-1ab data)
(when (> (k-get data) 0)
(step-1c data)
(step-2 data)
(step-3 data)
(step-4 data)
(step-5 data))
(list->string (take (b-get data) (+ 1 (k-get data))))))))
(define stop-word?
(let ((h (make-hash-table)))
(for-each (cut hash-table-set! h <> #t)
(list "i" "me" "my" "myself" "we" "our" "ours" "ourselves" "you" "your" "yours" "yourself" "yourselves"
"he" "him" "his" "himself" "she" "her" "hers" "herself" "it" "its" "itself" "they" "them" "their"
"theirs" "themselves" "what" "which" "who" "whom" "this" "that" "these" "those" "am" "is" "are"
"was" "were" "be" "been" "being" "have" "has" "had" "having" "do" "does" "did" "doing" "a" "an"
"the" "and" "but" "if" "or" "because" "as" "until" "while" "of" "at" "by" "for" "with" "about"
"against" "between" "into" "through" "during" "before" "after" "above" "below" "to" "from" "up"
"down" "in" "out" "on" "off" "over" "under" "again" "further" "then" "once" "here" "there"
"when" "where" "why" "how" "all" "any" "both" "each" "few" "more" "most" "other" "some" "such"
"no" "nor" "not" "only" "own" "same" "so" "than" "too" "very" "s" "t" "d" "can" "will" "just"
"don" "should" "now"))
(lambda (w)
(hash-table-ref/default h w #f))))