From d94fe71f96baed0a53226fe423e5e3fb363383c7 Mon Sep 17 00:00:00 2001 From: Ray Miller Date: Wed, 29 Jan 2025 11:23:10 +0000 Subject: [PATCH] Add broken link checker script --- guile/broken-link-checker.scm | 272 ++++++++++++++++++++++++++++++++++ 1 file changed, 272 insertions(+) create mode 100755 guile/broken-link-checker.scm diff --git a/guile/broken-link-checker.scm b/guile/broken-link-checker.scm new file mode 100755 index 0000000..be2c71a --- /dev/null +++ b/guile/broken-link-checker.scm @@ -0,0 +1,272 @@ +#!/usr/bin/guile \ +--no-auto-compile -e main -s +!# +(use-modules + (ice-9 format) + (ice-9 match) + (web client) + (web response) + (web uri) + ((htmlprag) #:select (html->sxml)) + ((sxml xpath) #:select (sxpath)) + (srfi srfi-1) + (srfi srfi-9 gnu) + (srfi srfi-26) + (srfi srfi-37) + (srfi srfi-71)) + +;; List of SXPath expressions to search for links. +(define link-paths (map sxpath + '((// a @ href) + (// img @ src) + (// link @ href) + (// video @ src) + (// video source @ src) + (// audio @ src) + (// audio source @ src)))) + +(define (extract-links body) + "Extract links from an HTML document, discarding any tel: or mailto: links." + (let ((x (html->sxml body))) + (remove (lambda (s) (or (string-prefix? "tel:" s) (string-prefix? "mailto:" s))) + (delete-duplicates + (map cadr + (concatenate + (map (lambda (f) (f x)) link-paths))))))) + +(define (uri-without-fragment uri) + "If uri has a fragment, return a new URI without the fragment." + (if (uri-fragment uri) + (set-field uri (uri-fragment) #f) + uri)) + +(define (resolve-relative-uri base-uri s) + "If the string s is a relative URI, resolve it relative to base-uri." + (let ((uri (string->uri-reference s))) + (if (uri-scheme uri) + uri + (build-uri (uri-scheme base-uri) + #:userinfo (or (uri-userinfo uri) (uri-userinfo base-uri)) + #:host (or (uri-host uri) (uri-host base-uri)) + #:port (or (uri-port uri) (uri-port base-uri)) + #:path (cond + ((string-prefix? "/" (uri-path uri)) (uri-path uri)) + ((string-suffix? "/" (uri-path base-uri)) (string-append (uri-path base-uri) (uri-path uri))) + (else (string-append (uri-path base-uri) "/" (uri-path uri)))) + #:query (uri-query uri))))) + +;; Queue of work to do, which is stored as pairs of (referrer . uri) so we +;; can report the referring page if `uri` is invalid +(define queue (list)) + +;; Hash of URLs already visited +(define seen (make-hash-table)) + +;; A count of invalid URLs we've seen. +(define error-count 0) + +;; Whether or not we should act like a crawler and recurse into all pages +;; reachable from the start page. This will be set by a command-line +;; option. +(define recurse? #f) + +;; A list of domains to exclude from checking (for example, sites we know are valid +;; but that return errors to crawlers). Populated by command-line options. +(define excluded-domains (list)) + +;; Similar to excluded domains, but the entire URL must match, not just the host +;; part. +(define excluded-urls (list)) + +;; Helper functions + +(define (seen? uri) + "Return #t if this URI has been visited, otherwise #f." + (hash-ref seen (uri->string uri) #f)) + +(define (set-seen! uri) + "Add the given URI to the visited set." + (hash-set! seen (uri->string uri) #t)) + +(define (exclude-domain! domain-name) + "Add a domain to the set of excluded domains." + (set! excluded-domains (lset-adjoin string=? excluded-domains domain-name))) + +(define (exclude-url! url) + "Add a URL to the set of excluded URLs." + (set! excluded-urls (lset-adjoin string=? excluded-urls url))) + +(define (excluded? uri) + "Check whether or not the URI is an excluded URL or for an excluded domain." + (or (member (uri->string uri) excluded-urls string=?) + (any (cute string-suffix? <> (uri-host uri)) + excluded-domains))) + +(define (html-content-type? res) + "Return #t if the response content type is HTML-like, otherwise #f." + (let ((ct (first (response-content-type res)))) + (or (equal? ct 'text/html) + (equal? ct 'application/xhtml+xml) + (equal? ct 'application/xml)))) + +(define (ok-response? res) + "Return #t if the HTTP response code is 200 (OK), otherwise #f." + (= 200 (response-code res))) + + +;; List of host names that are forcibly mapped to the base URI. +;; This is a work-around for when running Hugo in development; the +;; site is served on http://127.0.0.1:1313/ but some URLs in the +;; delivered pages are for http://localhost:1313/ and some URLs in the +;; content have been hard-coded as metail.com - we want to rewrite +;; both of these to http://127.0.0.1:1313/ to test the development +;; version of the site. +;; +;; The actual overrides are set through command-line arguments rather than +;; being hard-coded here. + +(define host-overrides + '()) + +(define (add-host-override! host-name) + (set! host-overrides (lset-adjoin equal? host-overrides host-name))) + +(define (override-host-port base-uri uri) + (if (member (uri-host uri) host-overrides) + (set-fields uri + ((uri-scheme) (uri-scheme base-uri)) + ((uri-host) (uri-host base-uri)) + ((uri-port) (uri-port base-uri))) + uri)) + +(define (enqueue-links base-uri) + (unless (seen? base-uri) + (hash-set! seen (uri->string base-uri) #t) + (format #t "[INFO] enqueuing links for ~a~%" (uri->string base-uri)) + (let ((res body (http-get base-uri))) + (if (ok-response? res) + (if (html-content-type? res) + (set! queue (append queue + (map (compose (cut cons base-uri <>) + (cut override-host-port base-uri <>) + uri-without-fragment + (cut resolve-relative-uri base-uri <>)) + (extract-links body)))) + (format #t "[ERROR] unexpected content type ~a for ~a~%" (response-content-type res) (uri->string base-uri))) + (format #t "[ERROR] unexpected status code ~a for ~a~%" (response-code res) (uri->string base-uri)))))) + +(define (same-host? uri referrer) + "Return #t if URI has the same host name as REFERRER, otherwise #f." + (string=? (uri-host uri) (uri-host referrer))) + +;; Some sites return 403 errors for bot requests, these headers make +;; us look more like a real browser. +(define request-headers '((accept . ((text/html) (application/xhtml+xml) (application/xml (q . 900)) (*/* (q . 800)))) + (accept-encoding . ((1000 . "gzip") (1000 . "deflate") (1000 . "br") (1000 . "zstd"))) + (accept-language . ((1000 . "en-GB") (500 . "en"))) + (user-agent . "Mozilla/5.0 (X11; Linux x86_64; rv:133.0) Gecko/20100101 Firefox/133.0"))) + +;; A memoized version of http-get. This allows us to quickly check a URL we have +;; seen before (when it is linked from multiple different pages) without generating +;; extra load on the target server. +(define http-head-memoized + (let ((memo (make-hash-table))) + (lambda (uri) + (let ((key (uri->string uri))) + (pk 'http-head-memoized key) + (unless (hash-ref memo key #f) + (with-exception-handler + (lambda (e) + (hash-set! memo key e)) + (lambda () + (let ((res _ (http-head uri #:headers request-headers))) + (hash-set! memo key res))) + #:unwind? #t)) + (hash-ref memo key))))) + +(define (process-queue) + (match queue + (() + (format #t "[INFO] processing complete, found ~a broken link~p~%" error-count error-count)) + (((referrer . uri) rest ...) + (set! queue rest) + (if (excluded? uri) + (format #t "[INFO] skipping excluded URL ~a~%" (uri->string uri)) + (let ((res (http-head-memoized uri))) + (cond + ((exception? res) + (format #t "[ERROR] HEAD ~a referred to by ~a threw exception ~a~%" (uri->string uri) (uri->string referrer) res) + (set! error-count (1+ error-count))) + ((> (response-code res) 399) + (format #t "[ERROR] HEAD ~a referred to by ~a returned HTTP Status ~a~%" (uri->string uri) (uri->string referrer) (response-code res)) + (set! error-count (1+ error-count))) + ((and recurse? (ok-response? res) (html-content-type? res) (same-host? uri referrer)) + (enqueue-links uri))))) + (process-queue)))) + +(define (show-help) + (format #t "Usage: broken-link-checker [OPTIONS] URL + +Run broken link checker on URL. + + -h, --help Show this help message + + -X, --exclude-domain=DOMAIN Skip checking links to this domain + + -x, --exclude-url=URL Skip checking this specific link + + -o, --override-host=HOSTNAME Rewrite links to HOSTNAME to the base URL + + -r, --recurse Check all pages on the same host that are reachable + from the starting URL. +")) + +(define %default-options + '((exclude-domains . ()) + (exclude-urls . ()) + (host-overrides . ()) + (recurse . #f) + (urls . ()))) + +(define %options + (list + (option '(#\h "help") #f #f + (lambda args + (show-help) + (exit 0))) + (option '(#\X "exclude-domain") #t #f + (lambda (opt name val result) + (assoc-set! result 'exclude-domains + (cons val (assoc-ref result 'exclude-domains))))) + (option '(#\x "exclude-url") #t #f + (lambda (opt name val result) + (assoc-set! result 'exclude-urls + (cons val (assoc-ref result 'exclude-urls))))) + (option '(#\o "override-host") #t #f + (lambda (opt name val result) + (assoc-set! result 'host-overrides + (cons val (assoc-ref result 'host-overrides))))) + (option '(#\r "recurse") #f #f + (lambda (opt name val result) + (assoc-set! result 'recurse #t))))) + +(define (parse-args args) + (args-fold args %options + (lambda (opt name arg result) + (format (current-error-port) "~A: unrecognized option~%" opt) + (exit 1)) + (lambda (url result) + (assoc-set! result 'urls (cons url (assoc-ref result 'urls)))) + %default-options)) + +(define (main args) + (let ((args (parse-args (cdr args)))) + (set! recurse? (assoc-ref args 'recurse)) + (for-each exclude-domain! (assoc-ref args 'exclude-domains)) + (for-each exclude-url! (assoc-ref args 'exclude-urls)) + (for-each add-host-override! (assoc-ref args 'host-overrides)) + (for-each (compose enqueue-links string->uri) (assoc-ref args 'urls)) + (process-queue) + (if (> error-count 0) + (exit 2) + (exit 0))))