
This makes the code a bit more readable. We then map over the list of strings to produce the parsed format the web client requires.
#!/usr/bin/guile \
--no-auto-compile -e main -s

Broken Link Checker

Copyright (C) 2025 Ray Miller <ray@1729.org.uk>.

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
!#
(use-modules
 (ice-9 format)
 (ice-9 match)
 (web client)                           ; http-get / http-head
 (web response)
 (web uri)
 ((web http) #:select (parse-header))
 ((htmlprag) #:select (html->sxml))     ; HTML -> SXML parser
 ((sxml xpath) #:select (sxpath))
 (srfi srfi-1)                          ; list utilities (remove, any, ...)
 (srfi srfi-9 gnu)                      ; set-field / set-fields on records
 (srfi srfi-26)                         ; cut / cute
 (srfi srfi-37)                         ; args-fold command-line parsing
 (srfi srfi-71))                        ; multiple-values let
;; The element/attribute combinations in which links can appear.
(define %link-locations
  '((// a @ href)
    (// img @ src)
    (// link @ href)
    (// video @ src)
    (// video source @ src)
    (// audio @ src)
    (// audio source @ src)))

;; List of compiled SXPath expressions to search for links.
(define link-paths (map sxpath %link-locations))
(define (extract-links body)
  "Extract links from an HTML document, discarding any tel: or mailto: links."
  (let* ((sxml (html->sxml body))
         ;; Attribute nodes look like (href "url"); collect them from every
         ;; known link location in the document.
         (attribute-nodes (append-map (lambda (path) (path sxml)) link-paths))
         (urls (delete-duplicates (map cadr attribute-nodes))))
    (remove (lambda (url)
              (or (string-prefix? "tel:" url)
                  (string-prefix? "mailto:" url)))
            urls)))
(define (uri-without-fragment uri)
  "Strip the fragment from URI, returning URI unchanged if it has none."
  (cond
   ((uri-fragment uri) (set-field uri (uri-fragment) #f))
   (else uri)))
(define (merge-uri-paths base-path rel-path)
  "Merge REL-PATH into BASE-PATH per RFC 3986 section 5.3: an absolute
REL-PATH wins, an empty REL-PATH keeps BASE-PATH, and otherwise REL-PATH
replaces everything after the last \"/\" of BASE-PATH.  (Dot segments are
not normalised.)"
  (cond
   ((string-null? rel-path) base-path)
   ((string-prefix? "/" rel-path) rel-path)
   (else
    (let ((slash (string-rindex base-path #\/)))
      (if slash
          (string-append (substring base-path 0 (1+ slash)) rel-path)
          ;; Base path has no "/" (e.g. empty path on an authority-only
          ;; URI): resolve relative to the root.
          (string-append "/" rel-path))))))

(define (resolve-relative-uri base-uri s)
  "If the string S is a relative URI reference, resolve it against BASE-URI.
A reference with its own scheme is returned as-is.  The path is merged per
RFC 3986, so \"img.png\" against \"/blog/post.html\" yields \"/blog/img.png\"
rather than being appended to the full base path.  For empty-path (e.g.
query-only) references the base query is inherited when S has no query."
  (let ((uri (string->uri-reference s)))
    (if (uri-scheme uri)
        uri
        (build-uri (uri-scheme base-uri)
                   #:userinfo (or (uri-userinfo uri) (uri-userinfo base-uri))
                   #:host (or (uri-host uri) (uri-host base-uri))
                   #:port (or (uri-port uri) (uri-port base-uri))
                   #:path (merge-uri-paths (uri-path base-uri) (uri-path uri))
                   #:query (or (uri-query uri)
                               (and (string-null? (uri-path uri))
                                    (uri-query base-uri)))))))
;; Queue of work to do, which is stored as pairs of (referrer . uri) so we
;; can report the referring page if `uri` is invalid.
(define queue (list))

;; Hash of URL strings already visited (uri->string keys, value #t).
(define seen (make-hash-table))

;; A count of invalid URLs we've seen; drives the process exit status.
(define error-count 0)

;; Whether or not we should act like a crawler and recurse into all pages
;; reachable from the start page.  This will be set by a command-line
;; option.
(define recurse? #f)

;; A list of domains to exclude from checking (for example, sites we know
;; are valid but that return errors to crawlers).  Populated by
;; command-line options.
(define excluded-domains (list))

;; Similar to excluded domains, but the entire URL must match, not just
;; the host part.
(define excluded-urls (list))

;; Helper functions
(define (seen? uri)
  "Return #t if this URI has been visited, otherwise #f."
  (let ((key (uri->string uri)))
    (hash-ref seen key #f)))
(define (set-seen! uri)
  "Record URI in the visited set."
  (let ((key (uri->string uri)))
    (hash-set! seen key #t)))
(define (exclude-domain! domain-name)
  "Add a domain to the set of excluded domains (no duplicates)."
  (unless (member domain-name excluded-domains)
    (set! excluded-domains (cons domain-name excluded-domains))))
(define (exclude-url! url)
  "Add a URL to the set of excluded URLs (no duplicates)."
  (unless (member url excluded-urls)
    (set! excluded-urls (cons url excluded-urls))))
(define (excluded? uri)
  "Check whether or not the URI is an excluded URL or for an excluded domain."
  (let ((url (uri->string uri))
        (host (uri-host uri)))
    (or (member url excluded-urls string=?)
        ;; A domain excludes itself and all of its subdomains, hence the
        ;; suffix match on the host part.
        (any (lambda (domain) (string-suffix? domain host))
             excluded-domains))))
(define (html-content-type? res)
  "Return #t if the response content type is HTML-like, otherwise #f."
  (and (memq (first (response-content-type res))
             '(text/html application/xhtml+xml application/xml))
       #t))
(define (ok-response? res)
  "Return #t if the HTTP response code is 200 (OK), otherwise #f."
  (eqv? (response-code res) 200))
;; List of host names that are forcibly mapped to the base URI.
;;
;; This is a work-around for when running Hugo in development: the site is
;; served on http://127.0.0.1:1313/ but some URLs in the delivered pages are
;; for http://localhost:1313/ and some URLs in the content have hard-coded
;; the live hostname.  We want to rewrite both of these to
;; http://127.0.0.1:1313/ to test the development version of the site.
;;
;; The actual overrides are set through command-line arguments rather than
;; being hard-coded here.
(define host-overrides '())

(define (add-host-override! host-name)
  "Add HOST-NAME to the set of hosts rewritten to the base URI."
  (unless (member host-name host-overrides)
    (set! host-overrides (cons host-name host-overrides))))
(define (override-host-port base-uri uri)
  "If URI's host is in host-overrides, return a copy of URI whose scheme,
host and port are taken from BASE-URI; otherwise return URI unchanged."
  (cond
   ((member (uri-host uri) host-overrides)
    (set-fields uri
                ((uri-scheme) (uri-scheme base-uri))
                ((uri-host) (uri-host base-uri))
                ((uri-port) (uri-port base-uri))))
   (else uri)))
(define (enqueue-links base-uri)
  "Fetch BASE-URI and append a (referrer . uri) work item to the queue for
every link found on the page.  Does nothing if BASE-URI has already been
visited; logs an error (and enqueues nothing) on a non-200 status or a
non-HTML content type."
  (unless (seen? base-uri)
    ;; Use the set-seen! helper rather than poking the hash table directly,
    ;; consistent with the rest of the file.
    (set-seen! base-uri)
    (format #t "[INFO] enqueuing links for ~a~%" (uri->string base-uri))
    (let ((res body (http-get base-uri)))
      (cond
       ((not (ok-response? res))
        (format #t "[ERROR] unexpected status code ~a for ~a~%" (response-code res) (uri->string base-uri)))
       ((not (html-content-type? res))
        (format #t "[ERROR] unexpected content type ~a for ~a~%" (response-content-type res) (uri->string base-uri)))
       (else
        ;; Each extracted link is resolved against the page URI, stripped of
        ;; its fragment, rewritten through the host overrides, then paired
        ;; with its referrer for error reporting.
        (set! queue
              (append queue
                      (map (compose (cut cons base-uri <>)
                                    (cut override-host-port base-uri <>)
                                    uri-without-fragment
                                    (cut resolve-relative-uri base-uri <>))
                           (extract-links body)))))))))
(define (same-host? uri referrer)
  "Return #t if URI has the same host name as REFERRER, otherwise #f."
  (let ((a (uri-host uri))
        (b (uri-host referrer)))
    (string=? a b)))
;; Some sites return 403 errors for bot requests; these headers make us
;; look more like a real browser.
(define request-headers
  (let ((raw-headers
         '((accept . "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
           (accept-encoding . "gzip, deflate, br, zstd")
           (accept-language . "en-GB,en;q=0.5")
           (user-agent . "Mozilla/5.0 (X11; Linux x86_64; rv:133.0) Gecko/20100101 Firefox/133.0"))))
    ;; (web client) wants parsed header values, so run each raw string
    ;; through parse-header.
    (map (lambda (header)
           (cons (car header) (parse-header (car header) (cdr header))))
         raw-headers)))
;; A memoized version of http-head (the original comment said http-get, but
;; it is HEAD requests that are cached here).  This allows us to quickly
;; check a URL we have seen before (when it is linked from multiple
;; different pages) without generating extra load on the target server.
;; A raised exception is cached and returned just like a response, so a
;; failing URL is also only contacted once.
(define http-head-memoized
  (let ((memo (make-hash-table)))
    (lambda (uri)
      (let ((key (uri->string uri)))
        (unless (hash-ref memo key #f)
          (with-exception-handler
              (lambda (e)
                ;; Cache the exception object itself; callers test with
                ;; exception?.
                (hash-set! memo key e))
            (lambda ()
              (let ((res _ (http-head uri #:headers request-headers)))
                (hash-set! memo key res)))
            #:unwind? #t))
        (hash-ref memo key)))))
(define (process-queue)
  "Pop (referrer . uri) pairs off the work queue until it is empty, HEAD-ing
each URI, logging and counting broken links, and (when recursion is on)
enqueuing the links of same-host HTML pages.  Tail-recursive."
  (match queue
    (()
     (format #t "[INFO] processing complete, found ~a broken link~p~%" error-count error-count))
    (((referrer . uri) remaining ...)
     (set! queue remaining)
     (if (excluded? uri)
         (format #t "[INFO] skipping excluded URL ~a~%" (uri->string uri))
         (let ((res (http-head-memoized uri)))
           (cond
            ;; The memoized fetch caches exceptions as values.
            ((exception? res)
             (format #t "[ERROR] HEAD ~a referred to by ~a threw exception ~a~%" (uri->string uri) (uri->string referrer) res)
             (set! error-count (1+ error-count)))
            ;; Anything >= 400 is a broken link.
            ((> (response-code res) 399)
             (format #t "[ERROR] HEAD ~a referred to by ~a returned HTTP Status ~a~%" (uri->string uri) (uri->string referrer) (response-code res))
             (set! error-count (1+ error-count)))
            ;; Crawl deeper only on OK HTML pages from the same host.
            ((and recurse? (ok-response? res) (html-content-type? res) (same-host? uri referrer))
             (enqueue-links uri)))))
     (process-queue))))
(define (show-help)
  "Print usage information to the current output port."
  (format #t "Usage: broken-link-checker [OPTIONS] URL

Run broken link checker on URL.

  -h, --help                    Show this help message
  -X, --exclude-domain=DOMAIN   Skip checking links to this domain
  -x, --exclude-url=URL         Skip checking this specific link
  -o, --override-host=HOSTNAME  Rewrite links to HOSTNAME to the base URL
  -r, --recurse                 Check all pages on the same host that are reachable
                                from the starting URL.
"))
;; Defaults for the parsed command-line options.  Built with list/cons
;; rather than a quoted literal because the SRFI-37 option handlers mutate
;; this alist in place via assoc-set!, and literal data is read-only in
;; compiled Guile code.
(define %default-options
  (list (cons 'exclude-domains '())
        (cons 'exclude-urls '())
        (cons 'host-overrides '())
        (cons 'recurse #f)
        (cons 'urls '())))
(define (accumulate-option key)
  "Return a SRFI-37 option processor that conses the option's value onto
the list stored under KEY in the result alist."
  (lambda (opt name val result)
    (assoc-set! result key (cons val (assoc-ref result key)))))

;; The command-line options understood by the program.
(define %options
  (list
   (option '(#\h "help") #f #f
           (lambda args
             (show-help)
             (exit 0)))
   (option '(#\X "exclude-domain") #t #f (accumulate-option 'exclude-domains))
   (option '(#\x "exclude-url") #t #f (accumulate-option 'exclude-urls))
   (option '(#\o "override-host") #t #f (accumulate-option 'host-overrides))
   (option '(#\r "recurse") #f #f
           (lambda (opt name val result)
             (assoc-set! result 'recurse #t)))))
(define (parse-args args)
  "Parse command-line ARGS with SRFI-37 args-fold, returning an options
alist seeded from %default-options.  Exits with status 1 on an
unrecognized option."
  (define (unrecognized opt name arg result)
    (format (current-error-port) "~A: unrecognized option~%" opt)
    (exit 1))
  (define (operand url result)
    ;; Bare arguments are the URLs to check.
    (assoc-set! result 'urls (cons url (assoc-ref result 'urls))))
  (args-fold args %options unrecognized operand %default-options))
(define (main args)
  "Entry point: parse ARGS, set up the global configuration, seed the queue
from the starting URLs, process it, then exit with status 2 if any broken
links were found and 0 otherwise."
  (let ((opts (parse-args (cdr args))))
    (set! recurse? (assoc-ref opts 'recurse))
    (for-each exclude-domain! (assoc-ref opts 'exclude-domains))
    (for-each exclude-url! (assoc-ref opts 'exclude-urls))
    (for-each add-host-override! (assoc-ref opts 'host-overrides))
    (for-each (compose enqueue-links string->uri) (assoc-ref opts 'urls))
    (process-queue)
    (exit (if (positive? error-count) 2 0))))