#!/usr/bin/guile \
--no-auto-compile -e main -s

Broken Link Checker

Copyright (C) 2025 Ray Miller <ray@1729.org.uk>.

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
!#

(use-modules
 (ice-9 format)
 (ice-9 match)
 (web client)
 (web response)
 (web uri)
 ((web http) #:select (parse-header))
 ((htmlprag) #:select (html->sxml))
 ((sxml xpath) #:select (sxpath))
 (srfi srfi-1)
 (srfi srfi-9 gnu)
 (srfi srfi-26)
 (srfi srfi-37)
 (srfi srfi-71))

;; List of SXPath expressions to search for links.
(define link-paths (map sxpath
                        '((// a @ href)
                          (// img @ src)
                          (// link @ href)
                          (// video @ src)
                          (// video source @ src)
                          (// audio @ src)
                          (// audio source @ src)
                          (// picture source @ srcset)
                          (// img @ srcset))))
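
;; Each of these SXPath queries returns SXML attribute nodes of the form
;; (attr "value") - for example (href "https://example.org/"), an illustrative
;; value - which is what extract-links destructures below.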

;; Special handling for srcset
(define (parse-srcset s)
  "Parse out the list of URLs from an image srcset, discarding whitespace and the
width/density descriptor."
  (pk s)
  (map (compose first (cut string-split <> #\space))
       (map string-trim-both (string-split s #\,))))
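
;; For example (illustrative input):
;;   (parse-srcset "image-480.jpg 480w, image-800.jpg 800w")
;;   => ("image-480.jpg" "image-800.jpg")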

(define (extract-links body)
  "Extract links from an HTML document, discarding any tel: or mailto: links."
  (let ((x (html->sxml body)))
    (remove (lambda (s) (or (string-prefix? "tel:" s) (string-prefix? "mailto:" s)))
            (delete-duplicates
             (fold (lambda (f accum)
                     (fold (lambda (x accum)
                             (match-let (((attr val) x))
                               (if (equal? attr 'srcset)
                                   (append (parse-srcset val) accum)
                                   (cons val accum))))
                           accum
                           (f x)))
                   '()
                   link-paths)))))

(define (uri-without-fragment uri)
  "If uri has a fragment, return a new URI without the fragment."
  (if (uri-fragment uri)
      (set-field uri (uri-fragment) #f)
      uri))

(define (resolve-relative-uri base-uri s)
  "If the string s is a relative URI, resolve it relative to base-uri."
  (let ((uri (string->uri-reference s)))
    (if (uri-scheme uri)
        uri
        (build-uri (uri-scheme base-uri)
                   #:userinfo (or (uri-userinfo uri) (uri-userinfo base-uri))
                   #:host (or (uri-host uri) (uri-host base-uri))
                   #:port (or (uri-port uri) (uri-port base-uri))
                   #:path (cond
                           ((string-prefix? "/" (uri-path uri)) (uri-path uri))
                           ((string-suffix? "/" (uri-path base-uri)) (string-append (uri-path base-uri) (uri-path uri)))
                           (else (string-append (uri-path base-uri) "/" (uri-path uri))))
                   #:query (uri-query uri)))))
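
;; For example (illustrative values): resolving "images/photo.jpg" against
;; https://example.org/posts/ yields https://example.org/posts/images/photo.jpg,
;; while "/css/site.css" yields https://example.org/css/site.css. Note that
;; "." and ".." path segments are not normalised.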

;; Queue of work to do, which is stored as pairs of (referrer . uri) so we
;; can report the referring page if `uri` is invalid
(define queue (list))

;; Hash of URLs already visited
(define seen (make-hash-table))

;; A count of invalid URLs we've seen.
(define error-count 0)

;; Whether or not we should act like a crawler and recurse into all pages
;; reachable from the start page. This will be set by a command-line
;; option.
(define recurse? #f)

;; A list of domains to exclude from checking (for example, sites we know are valid
;; but that return errors to crawlers). Populated by command-line options.
(define excluded-domains (list))

;; Similar to excluded domains, but the entire URL must match, not just the host
;; part.
(define excluded-urls (list))

;; Helper functions

(define (seen? uri)
  "Return #t if this URI has been visited, otherwise #f."
  (hash-ref seen (uri->string uri) #f))

(define (set-seen! uri)
  "Add the given URI to the visited set."
  (hash-set! seen (uri->string uri) #t))

(define (exclude-domain! domain-name)
  "Add a domain to the set of excluded domains."
  (set! excluded-domains (lset-adjoin string=? excluded-domains domain-name)))

(define (exclude-url! url)
  "Add a URL to the set of excluded URLs."
  (set! excluded-urls (lset-adjoin string=? excluded-urls url)))

(define (excluded? uri)
  "Check whether or not the URI is an excluded URL or for an excluded domain."
  (or (member (uri->string uri) excluded-urls string=?)
      (any (cute string-suffix? <> (uri-host uri))
           excluded-domains)))
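
;; Note that domain exclusion matches by suffix: excluding example.com, for
;; instance, also skips links on www.example.com.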

(define (html-content-type? res)
  "Return #t if the response content type is HTML-like, otherwise #f."
  (let ((ct (first (response-content-type res))))
    (or (equal? ct 'text/html)
        (equal? ct 'application/xhtml+xml)
        (equal? ct 'application/xml))))

(define (ok-response? res)
  "Return #t if the HTTP response code is 200 (OK), otherwise #f."
  (= 200 (response-code res)))

;; List of host names that are forcibly mapped to the base URI.
;; This is a work-around for when running Hugo in development; the
;; site is served on http://127.0.0.1:1313/ but some URLs in the
;; delivered pages are for http://localhost:1313/ and some URLs in the
;; content have hard-coded the live hostname - we want to rewrite
;; both of these to http://127.0.0.1:1313/ to test the development
;; version of the site.
;;
;; The actual overrides are set through command-line arguments rather than
;; being hard-coded here.

(define host-overrides
  '())

(define (add-host-override! host-name)
  "Add a host name to the set of hosts rewritten to the base URI."
  (set! host-overrides (lset-adjoin equal? host-overrides host-name)))

(define (override-host-port base-uri uri)
  "If the host of URI is in host-overrides, return a copy of URI with the
scheme, host and port taken from BASE-URI; otherwise return URI unchanged."
  (if (member (uri-host uri) host-overrides)
      (set-fields uri
                  ((uri-scheme) (uri-scheme base-uri))
                  ((uri-host) (uri-host base-uri))
                  ((uri-port) (uri-port base-uri)))
      uri))
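
;; For example, with "localhost" in host-overrides and a base URI of
;; http://127.0.0.1:1313/, a link to http://localhost:1313/about/ (an
;; illustrative path) is rewritten to http://127.0.0.1:1313/about/.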

(define (enqueue-links base-uri)
  "Fetch BASE-URI and, if it returns an HTML page, add every link found on it
to the work queue, paired with BASE-URI as the referrer. Pages that have
already been visited are skipped."
  (unless (seen? base-uri)
    (set-seen! base-uri)
    (format #t "[INFO] enqueuing links for ~a~%" (uri->string base-uri))
    (let ((res body (http-get base-uri)))
      (if (ok-response? res)
          (if (html-content-type? res)
              (set! queue (append queue
                                  (map (compose (cut cons base-uri <>)
                                                (cut override-host-port base-uri <>)
                                                uri-without-fragment
                                                (cut resolve-relative-uri base-uri <>))
                                       (extract-links body))))
              (format #t "[ERROR] unexpected content type ~a for ~a~%" (response-content-type res) (uri->string base-uri)))
          (format #t "[ERROR] unexpected status code ~a for ~a~%" (response-code res) (uri->string base-uri))))))

(define (same-host? uri referrer)
  "Return #t if URI has the same host name as REFERRER, otherwise #f."
  (string=? (uri-host uri) (uri-host referrer)))

;; Some sites return 403 errors for bot requests; these headers make
;; us look more like a real browser.
(define request-headers
  (map (match-lambda ((k . v) (cons k (parse-header k v))))
       '((accept . "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
         (accept-encoding . "gzip, deflate, br, zstd")
         (accept-language . "en-GB,en;q=0.5")
         (user-agent . "Mozilla/5.0 (X11; Linux x86_64; rv:133.0) Gecko/20100101 Firefox/133.0"))))

;; A memoized version of http-head. This allows us to quickly check a URL we have
;; seen before (when it is linked from multiple different pages) without generating
;; extra load on the target server. If a request raises an exception, the exception
;; object is cached and returned in place of a response.
(define http-head-memoized
  (let ((memo (make-hash-table)))
    (lambda (uri)
      (let ((key (uri->string uri)))
        ;;(pk 'http-head-memoized key)
        (unless (hash-ref memo key #f)
          (with-exception-handler
           (lambda (e)
             (hash-set! memo key e))
           (lambda ()
             (let ((res _ (http-head uri #:headers request-headers)))
               (hash-set! memo key res)))
           #:unwind? #t))
        (hash-ref memo key)))))

;; Main processing loop: take the next (referrer . uri) pair off the queue,
;; check it with a HEAD request, report broken links, and (when recursing)
;; enqueue the links of same-host HTML pages. Repeats until the queue is empty.
(define (process-queue)
  (match queue
    (()
     (format #t "[INFO] processing complete, found ~a broken link~p~%" error-count error-count))
    (((referrer . uri) rest ...)
     (set! queue rest)
     (if (excluded? uri)
         (format #t "[INFO] skipping excluded URL ~a~%" (uri->string uri))
         (let ((res (http-head-memoized uri)))
           (cond
            ((exception? res)
             (format #t "[ERROR] HEAD ~a referred to by ~a threw exception ~a~%" (uri->string uri) (uri->string referrer) res)
             (set! error-count (1+ error-count)))
            ((> (response-code res) 399)
             (format #t "[ERROR] HEAD ~a referred to by ~a returned HTTP Status ~a~%" (uri->string uri) (uri->string referrer) (response-code res))
             (set! error-count (1+ error-count)))
            ((and recurse? (ok-response? res) (html-content-type? res) (same-host? uri referrer))
             (enqueue-links uri)))))
     (process-queue))))

(define (show-help)
  (format #t "Usage: broken-link-checker [OPTIONS] URL

Run broken link checker on URL.

  -h, --help                    Show this help message

  -X, --exclude-domain=DOMAIN   Skip checking links to this domain

  -x, --exclude-url=URL         Skip checking this specific link

  -o, --override-host=HOSTNAME  Rewrite links to HOSTNAME to the base URL

  -r, --recurse                 Check all pages on the same host that are reachable
                                from the starting URL.
"))

;; Default values for the command-line options.
(define %default-options
  '((exclude-domains . ())
    (exclude-urls . ())
    (host-overrides . ())
    (recurse . #f)
    (urls . ())))

;; Command-line option specifications for SRFI-37 args-fold.
(define %options
  (list
   (option '(#\h "help") #f #f
           (lambda args
             (show-help)
             (exit 0)))
   (option '(#\X "exclude-domain") #t #f
           (lambda (opt name val result)
             (assoc-set! result 'exclude-domains
                         (cons val (assoc-ref result 'exclude-domains)))))
   (option '(#\x "exclude-url") #t #f
           (lambda (opt name val result)
             (assoc-set! result 'exclude-urls
                         (cons val (assoc-ref result 'exclude-urls)))))
   (option '(#\o "override-host") #t #f
           (lambda (opt name val result)
             (assoc-set! result 'host-overrides
                         (cons val (assoc-ref result 'host-overrides)))))
   (option '(#\r "recurse") #f #f
           (lambda (opt name val result)
             (assoc-set! result 'recurse #t)))))

(define (parse-args args)
  "Parse command-line arguments, returning an alist of options and URLs."
  (args-fold args %options
             (lambda (opt name arg result)
               (format (current-error-port) "~A: unrecognized option~%" name)
               (exit 1))
             (lambda (url result)
               (assoc-set! result 'urls (cons url (assoc-ref result 'urls))))
             %default-options))

(define (main args)
  "Entry point: parse the command line, configure the checker, process the
queue, and exit with status 2 if any broken links were found."
  (let ((args (parse-args (cdr args))))
    (set! recurse? (assoc-ref args 'recurse))
    (for-each exclude-domain! (assoc-ref args 'exclude-domains))
    (for-each exclude-url! (assoc-ref args 'exclude-urls))
    (for-each add-host-override! (assoc-ref args 'host-overrides))
    (for-each (compose enqueue-links string->uri) (assoc-ref args 'urls))
    (process-queue)
    (if (> error-count 0)
        (exit 2)
        (exit 0))))