Move blc code around to better match blog post.

This commit is contained in:
Ray Miller 2025-02-03 09:56:55 +00:00
parent e54d8f16bc
commit 8cf60c5f0d

View file

@ -32,62 +32,6 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
(srfi srfi-37)
(srfi srfi-71))
;; List of SXPath expressions to search for links.
(define link-paths (map sxpath
'((// a @ href)
(// img @ src)
(// link @ href)
(// video @ src)
(// video source @ src)
(// audio @ src)
(// audio source @ src)
(// picture source @ srcset)
(// img @ srcset))))
;; Special handling for srcset
(define (parse-srcset s)
"Parse out the list of URLs from an image srcset, discarding whitespace and the
width/density descriptor."
(map (compose first (cut string-split <> #\space))
(map string-trim-both (string-split s #\,))))
(define (extract-links body)
"Extract links from an HTML document, discarding any tel: or mailto: links."
(let ((x (html->sxml body)))
(remove (lambda (s) (or (string-prefix? "tel:" s) (string-prefix? "mailto:" s)))
(delete-duplicates
(fold (lambda (f accum)
(fold (lambda (x accum)
(match-let (((attr val) x))
(if (equal? attr 'srcset)
(append (parse-srcset val) accum)
(cons val accum))))
accum
(f x)))
'()
link-paths)))))
(define (uri-without-fragment uri)
"If uri has a fragment, return a new URI without the fragment."
(if (uri-fragment uri)
(set-field uri (uri-fragment) #f)
uri))
(define (resolve-relative-uri base-uri s)
"If the string s is a relative URI, resolve it relative to base-uri."
(let ((uri (string->uri-reference s)))
(if (uri-scheme uri)
uri
(build-uri (uri-scheme base-uri)
#:userinfo (or (uri-userinfo uri) (uri-userinfo base-uri))
#:host (or (uri-host uri) (uri-host base-uri))
#:port (or (uri-port uri) (uri-port base-uri))
#:path (cond
((string-prefix? "/" (uri-path uri)) (uri-path uri))
((string-suffix? "/" (uri-path base-uri)) (string-append (uri-path base-uri) (uri-path uri)))
(else (string-append (uri-path base-uri) "/" (uri-path uri))))
#:query (uri-query uri)))))
;; Queue of work to do, which is stored as pairs of (referrer . uri) so we
;; can report the referring page if `uri` is invalid
(define queue (list))
@ -172,6 +116,62 @@ width/density descriptor."
((uri-port) (uri-port base-uri)))
uri))
;; List of SXPath expressions to search for links.
(define link-paths (map sxpath
'((// a @ href)
(// img @ src)
(// link @ href)
(// video @ src)
(// video source @ src)
(// audio @ src)
(// audio source @ src)
(// picture source @ srcset)
(// img @ srcset))))
;; Special handling for srcset
(define (parse-srcset s)
"Parse out the list of URLs from an image srcset, discarding whitespace and the
width/density descriptor."
(map (compose first (cut string-split <> #\space))
(map string-trim-both (string-split s #\,))))
(define (extract-links body)
"Extract links from an HTML document, discarding any tel: or mailto: links."
(let ((x (html->sxml body)))
(remove (lambda (s) (or (string-prefix? "tel:" s) (string-prefix? "mailto:" s)))
(delete-duplicates
(fold (lambda (f accum)
(fold (lambda (x accum)
(match-let (((attr val) x))
(if (equal? attr 'srcset)
(append (parse-srcset val) accum)
(cons val accum))))
accum
(f x)))
'()
link-paths)))))
(define (uri-without-fragment uri)
"If uri has a fragment, return a new URI without the fragment."
(if (uri-fragment uri)
(set-field uri (uri-fragment) #f)
uri))
(define (resolve-relative-uri base-uri s)
"If the string s is a relative URI, resolve it relative to base-uri."
(let ((uri (string->uri-reference s)))
(if (uri-scheme uri)
uri
(build-uri (uri-scheme base-uri)
#:userinfo (or (uri-userinfo uri) (uri-userinfo base-uri))
#:host (or (uri-host uri) (uri-host base-uri))
#:port (or (uri-port uri) (uri-port base-uri))
#:path (cond
((string-prefix? "/" (uri-path uri)) (uri-path uri))
((string-suffix? "/" (uri-path base-uri)) (string-append (uri-path base-uri) (uri-path uri)))
(else (string-append (uri-path base-uri) "/" (uri-path uri))))
#:query (uri-query uri)))))
(define (enqueue-links base-uri)
(unless (seen? base-uri)
(hash-set! seen (uri->string base-uri) #t)
@ -208,8 +208,8 @@ width/density descriptor."
(let ((memo (make-hash-table)))
(lambda (uri)
(let ((key (uri->string uri)))
;;(pk 'http-head-memoized key)
(unless (hash-ref memo key #f)
(format #t "[DEBUG] checking link ~a~%" key)
(with-exception-handler
(lambda (e)
(hash-set! memo key e))