Move blc code around to better match blog post.
This commit is contained in:
parent
e54d8f16bc
commit
8cf60c5f0d
1 changed files with 57 additions and 57 deletions
|
@ -32,62 +32,6 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||||
(srfi srfi-37)
|
(srfi srfi-37)
|
||||||
(srfi srfi-71))
|
(srfi srfi-71))
|
||||||
|
|
||||||
;; List of SXPath expressions to search for links.
|
|
||||||
(define link-paths (map sxpath
|
|
||||||
'((// a @ href)
|
|
||||||
(// img @ src)
|
|
||||||
(// link @ href)
|
|
||||||
(// video @ src)
|
|
||||||
(// video source @ src)
|
|
||||||
(// audio @ src)
|
|
||||||
(// audio source @ src)
|
|
||||||
(// picture source @ srcset)
|
|
||||||
(// img @ srcset))))
|
|
||||||
|
|
||||||
;; Special handling for srcset
|
|
||||||
(define (parse-srcset s)
|
|
||||||
"Parse out the list of URLs from an image srcset, discarding whitespace and the
|
|
||||||
width/density descriptor."
|
|
||||||
(map (compose first (cut string-split <> #\space))
|
|
||||||
(map string-trim-both (string-split s #\,))))
|
|
||||||
|
|
||||||
(define (extract-links body)
|
|
||||||
"Extract links from an HTML document, discarding any tel: or mailto: links."
|
|
||||||
(let ((x (html->sxml body)))
|
|
||||||
(remove (lambda (s) (or (string-prefix? "tel:" s) (string-prefix? "mailto:" s)))
|
|
||||||
(delete-duplicates
|
|
||||||
(fold (lambda (f accum)
|
|
||||||
(fold (lambda (x accum)
|
|
||||||
(match-let (((attr val) x))
|
|
||||||
(if (equal? attr 'srcset)
|
|
||||||
(append (parse-srcset val) accum)
|
|
||||||
(cons val accum))))
|
|
||||||
accum
|
|
||||||
(f x)))
|
|
||||||
'()
|
|
||||||
link-paths)))))
|
|
||||||
|
|
||||||
(define (uri-without-fragment uri)
|
|
||||||
"If uri has a fragment, return a new URI without the fragment."
|
|
||||||
(if (uri-fragment uri)
|
|
||||||
(set-field uri (uri-fragment) #f)
|
|
||||||
uri))
|
|
||||||
|
|
||||||
(define (resolve-relative-uri base-uri s)
|
|
||||||
"If the string s is a relative URI, resolve it relative to base-uri."
|
|
||||||
(let ((uri (string->uri-reference s)))
|
|
||||||
(if (uri-scheme uri)
|
|
||||||
uri
|
|
||||||
(build-uri (uri-scheme base-uri)
|
|
||||||
#:userinfo (or (uri-userinfo uri) (uri-userinfo base-uri))
|
|
||||||
#:host (or (uri-host uri) (uri-host base-uri))
|
|
||||||
#:port (or (uri-port uri) (uri-port base-uri))
|
|
||||||
#:path (cond
|
|
||||||
((string-prefix? "/" (uri-path uri)) (uri-path uri))
|
|
||||||
((string-suffix? "/" (uri-path base-uri)) (string-append (uri-path base-uri) (uri-path uri)))
|
|
||||||
(else (string-append (uri-path base-uri) "/" (uri-path uri))))
|
|
||||||
#:query (uri-query uri)))))
|
|
||||||
|
|
||||||
;; Queue of work to do, which is stored as pairs of (referrer . uri) so we
|
;; Queue of work to do, which is stored as pairs of (referrer . uri) so we
|
||||||
;; can report the referring page if `uri` is invalid
|
;; can report the referring page if `uri` is invalid
|
||||||
(define queue (list))
|
(define queue (list))
|
||||||
|
@ -172,6 +116,62 @@ width/density descriptor."
|
||||||
((uri-port) (uri-port base-uri)))
|
((uri-port) (uri-port base-uri)))
|
||||||
uri))
|
uri))
|
||||||
|
|
||||||
|
;; List of SXPath expressions to search for links.
|
||||||
|
(define link-paths (map sxpath
|
||||||
|
'((// a @ href)
|
||||||
|
(// img @ src)
|
||||||
|
(// link @ href)
|
||||||
|
(// video @ src)
|
||||||
|
(// video source @ src)
|
||||||
|
(// audio @ src)
|
||||||
|
(// audio source @ src)
|
||||||
|
(// picture source @ srcset)
|
||||||
|
(// img @ srcset))))
|
||||||
|
|
||||||
|
;; Special handling for srcset
|
||||||
|
(define (parse-srcset s)
|
||||||
|
"Parse out the list of URLs from an image srcset, discarding whitespace and the
|
||||||
|
width/density descriptor."
|
||||||
|
(map (compose first (cut string-split <> #\space))
|
||||||
|
(map string-trim-both (string-split s #\,))))
|
||||||
|
|
||||||
|
(define (extract-links body)
|
||||||
|
"Extract links from an HTML document, discarding any tel: or mailto: links."
|
||||||
|
(let ((x (html->sxml body)))
|
||||||
|
(remove (lambda (s) (or (string-prefix? "tel:" s) (string-prefix? "mailto:" s)))
|
||||||
|
(delete-duplicates
|
||||||
|
(fold (lambda (f accum)
|
||||||
|
(fold (lambda (x accum)
|
||||||
|
(match-let (((attr val) x))
|
||||||
|
(if (equal? attr 'srcset)
|
||||||
|
(append (parse-srcset val) accum)
|
||||||
|
(cons val accum))))
|
||||||
|
accum
|
||||||
|
(f x)))
|
||||||
|
'()
|
||||||
|
link-paths)))))
|
||||||
|
|
||||||
|
(define (uri-without-fragment uri)
|
||||||
|
"If uri has a fragment, return a new URI without the fragment."
|
||||||
|
(if (uri-fragment uri)
|
||||||
|
(set-field uri (uri-fragment) #f)
|
||||||
|
uri))
|
||||||
|
|
||||||
|
(define (resolve-relative-uri base-uri s)
|
||||||
|
"If the string s is a relative URI, resolve it relative to base-uri."
|
||||||
|
(let ((uri (string->uri-reference s)))
|
||||||
|
(if (uri-scheme uri)
|
||||||
|
uri
|
||||||
|
(build-uri (uri-scheme base-uri)
|
||||||
|
#:userinfo (or (uri-userinfo uri) (uri-userinfo base-uri))
|
||||||
|
#:host (or (uri-host uri) (uri-host base-uri))
|
||||||
|
#:port (or (uri-port uri) (uri-port base-uri))
|
||||||
|
#:path (cond
|
||||||
|
((string-prefix? "/" (uri-path uri)) (uri-path uri))
|
||||||
|
((string-suffix? "/" (uri-path base-uri)) (string-append (uri-path base-uri) (uri-path uri)))
|
||||||
|
(else (string-append (uri-path base-uri) "/" (uri-path uri))))
|
||||||
|
#:query (uri-query uri)))))
|
||||||
|
|
||||||
(define (enqueue-links base-uri)
|
(define (enqueue-links base-uri)
|
||||||
(unless (seen? base-uri)
|
(unless (seen? base-uri)
|
||||||
(hash-set! seen (uri->string base-uri) #t)
|
(hash-set! seen (uri->string base-uri) #t)
|
||||||
|
@ -208,8 +208,8 @@ width/density descriptor."
|
||||||
(let ((memo (make-hash-table)))
|
(let ((memo (make-hash-table)))
|
||||||
(lambda (uri)
|
(lambda (uri)
|
||||||
(let ((key (uri->string uri)))
|
(let ((key (uri->string uri)))
|
||||||
;;(pk 'http-head-memoized key)
|
|
||||||
(unless (hash-ref memo key #f)
|
(unless (hash-ref memo key #f)
|
||||||
|
(format #t "[DEBUG] checking link ~a~%" key)
|
||||||
(with-exception-handler
|
(with-exception-handler
|
||||||
(lambda (e)
|
(lambda (e)
|
||||||
(hash-set! memo key e))
|
(hash-set! memo key e))
|
||||||
|
|
Loading…
Add table
Reference in a new issue