#!/usr/bin/guile \ --no-auto-compile -e main -s Broken Link Checker Copyright (C) 2025 Ray Miller . This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . !# (use-modules (ice-9 format) (ice-9 match) (web client) (web response) (web uri) ((web http) #:select (parse-header)) ((htmlprag) #:select (html->sxml)) ((sxml xpath) #:select (sxpath)) (srfi srfi-1) (srfi srfi-9 gnu) (srfi srfi-26) (srfi srfi-37) (srfi srfi-71)) ;; Queue of work to do, which is stored as pairs of (referrer . uri) so we ;; can report the referring page if `uri` is invalid (define queue (list)) ;; Hash of URLs already visited (define seen (make-hash-table)) ;; A count of invalid URLs we've seen. (define error-count 0) ;; Whether or not we should act like a crawler and recurse into all pages ;; reachable from the start page. This will be set by a command-line ;; option. (define recurse? #f) ;; A list of domains to exclude from checking (for example, sites we know are valid ;; but that return errors to crawlers). Populated by command-line options. (define excluded-domains (list)) ;; Similar to excluded domains, but the entire URL must match, not just the host ;; part. (define excluded-urls (list)) ;; Helper functions (define (seen? uri) "Return #t if this URI has been visited, otherwise #f." (hash-ref seen (uri->string uri) #f)) (define (set-seen! uri) "Add the given URI to the visited set." (hash-set! seen (uri->string uri) #t)) (define (exclude-domain! domain-name) "Add a domain to the set of excluded domains." (set! excluded-domains (lset-adjoin string=? excluded-domains domain-name))) (define (exclude-url! url) "Add a URL to the set of excluded URLs." (set! excluded-urls (lset-adjoin string=? excluded-urls url))) (define (excluded? uri) "Check whether or not the URI is an excluded URL or for an excluded domain." (or (member (uri->string uri) excluded-urls string=?) (any (cute string-suffix? <> (uri-host uri)) excluded-domains))) (define (html-content-type? res) "Return #t if the response content type is HTML-like, otherwise #f." (let ((ct (first (response-content-type res)))) (or (equal? ct 'text/html) (equal? ct 'application/xhtml+xml) (equal? ct 'application/xml)))) (define (ok-response? res) "Return #t if the HTTP response code is 200 (OK), otherwise #f." (= 200 (response-code res))) ;; List of host names that are forcibly mapped to the base URI. ;; This is a work-around for when running Hugo in development; the ;; site is served on http://127.0.0.1:1313/ but some URLs in the ;; delivered pages are for http://localhost:1313/ and some URLs in the ;; content have hard-coded the live hostname - we want to rewrite ;; both of these to http://127.0.0.1:1313/ to test the development ;; version of the site. ;; ;; The actual overrides are set through command-line arguments rather than ;; being hard-coded here. (define host-overrides '()) (define (add-host-override! host-name) (set! host-overrides (lset-adjoin equal? host-overrides host-name))) (define (override-host-port base-uri uri) (if (member (uri-host uri) host-overrides) (set-fields uri ((uri-scheme) (uri-scheme base-uri)) ((uri-host) (uri-host base-uri)) ((uri-port) (uri-port base-uri))) uri)) ;; List of SXPath expressions to search for links. (define link-paths (map sxpath '((// a @ href) (// img @ src) (// link @ href) (// video @ src) (// video source @ src) (// audio @ src) (// audio source @ src) (// picture source @ srcset) (// img @ srcset)))) ;; Special handling for srcset (define (parse-srcset s) "Parse out the list of URLs from an image srcset, discarding whitespace and the width/density descriptor." (map (compose first (cut string-split <> #\space)) (map string-trim-both (string-split s #\,)))) (define (extract-links body) "Extract links from an HTML document, discarding any tel: or mailto: links." (let ((x (html->sxml body))) (remove (lambda (s) (or (string-prefix? "tel:" s) (string-prefix? "mailto:" s))) (delete-duplicates (fold (lambda (f accum) (fold (lambda (x accum) (match-let (((attr val) x)) (if (equal? attr 'srcset) (append (parse-srcset val) accum) (cons val accum)))) accum (f x))) '() link-paths))))) (define (uri-without-fragment uri) "If uri has a fragment, return a new URI without the fragment." (if (uri-fragment uri) (set-field uri (uri-fragment) #f) uri)) (define (resolve-relative-uri base-uri s) "If the string s is a relative URI, resolve it relative to base-uri." (let ((uri (string->uri-reference s))) (if (uri-scheme uri) uri (build-uri (uri-scheme base-uri) #:userinfo (or (uri-userinfo uri) (uri-userinfo base-uri)) #:host (or (uri-host uri) (uri-host base-uri)) #:port (or (uri-port uri) (uri-port base-uri)) #:path (cond ((string-prefix? "/" (uri-path uri)) (uri-path uri)) ((string-suffix? "/" (uri-path base-uri)) (string-append (uri-path base-uri) (uri-path uri))) (else (string-append (uri-path base-uri) "/" (uri-path uri)))) #:query (uri-query uri))))) (define (enqueue-links base-uri) (unless (seen? base-uri) (hash-set! seen (uri->string base-uri) #t) (format #t "[INFO] enqueuing links for ~a~%" (uri->string base-uri)) (let ((res body (http-get base-uri))) (if (ok-response? res) (if (html-content-type? res) (set! queue (append queue (map (compose (cut cons base-uri <>) (cut override-host-port base-uri <>) uri-without-fragment (cut resolve-relative-uri base-uri <>)) (extract-links body)))) (format #t "[ERROR] unexpected content type ~a for ~a~%" (response-content-type res) (uri->string base-uri))) (format #t "[ERROR] unexpected status code ~a for ~a~%" (response-code res) (uri->string base-uri)))))) (define (same-host? uri referrer) "Return #t if URI has the same host name as REFERRER, otherwise #f." (string=? (uri-host uri) (uri-host referrer))) ;; Some sites return 403 errors for bot requests, these headers make ;; us look more like a real browser. (define request-headers (map (match-lambda ((k . v) (cons k (parse-header k v)))) '((accept . "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") (accept-encoding . "gzip, deflate, br, zstd") (accept-language . "en-GB,en;q=0.5") (user-agent . "Mozilla/5.0 (X11; Linux x86_64; rv:133.0) Gecko/20100101 Firefox/133.0")))) ;; A memoized version of http-get. This allows us to quickly check a URL we have ;; seen before (when it is linked from multiple different pages) without generating ;; extra load on the target server. (define http-head-memoized (let ((memo (make-hash-table))) (lambda (uri) (let ((key (uri->string uri))) (unless (hash-ref memo key #f) (format #t "[DEBUG] checking link ~a~%" key) (with-exception-handler (lambda (e) (hash-set! memo key e)) (lambda () (let ((res _ (http-head uri #:headers request-headers))) (hash-set! memo key res))) #:unwind? #t)) (hash-ref memo key))))) (define (process-queue) (match queue (() (format #t "[INFO] processing complete, found ~a broken link~p~%" error-count error-count)) (((referrer . uri) rest ...) (set! queue rest) (if (excluded? uri) (format #t "[INFO] skipping excluded URL ~a~%" (uri->string uri)) (let ((res (http-head-memoized uri))) (cond ((exception? res) (format #t "[ERROR] HEAD ~a referred to by ~a threw exception ~a~%" (uri->string uri) (uri->string referrer) res) (set! error-count (1+ error-count))) ((> (response-code res) 399) (format #t "[ERROR] HEAD ~a referred to by ~a returned HTTP Status ~a~%" (uri->string uri) (uri->string referrer) (response-code res)) (set! error-count (1+ error-count))) ((and recurse? (ok-response? res) (html-content-type? res) (same-host? uri referrer)) (enqueue-links uri))))) (process-queue)))) (define (show-help) (format #t "Usage: broken-link-checker [OPTIONS] URL Run broken link checker on URL. -h, --help Show this help message -X, --exclude-domain=DOMAIN Skip checking links to this domain -x, --exclude-url=URL Skip checking this specific link -o, --override-host=HOSTNAME Rewrite links to HOSTNAME to the base URL -r, --recurse Check all pages on the same host that are reachable from the starting URL. ")) (define %default-options '((exclude-domains . ()) (exclude-urls . ()) (host-overrides . ()) (recurse . #f) (urls . ()))) (define %options (list (option '(#\h "help") #f #f (lambda args (show-help) (exit 0))) (option '(#\X "exclude-domain") #t #f (lambda (opt name val result) (assoc-set! result 'exclude-domains (cons val (assoc-ref result 'exclude-domains))))) (option '(#\x "exclude-url") #t #f (lambda (opt name val result) (assoc-set! result 'exclude-urls (cons val (assoc-ref result 'exclude-urls))))) (option '(#\o "override-host") #t #f (lambda (opt name val result) (assoc-set! result 'host-overrides (cons val (assoc-ref result 'host-overrides))))) (option '(#\r "recurse") #f #f (lambda (opt name val result) (assoc-set! result 'recurse #t))))) (define (parse-args args) (args-fold args %options (lambda (opt name arg result) (format (current-error-port) "~A: unrecognized option~%" opt) (exit 1)) (lambda (url result) (assoc-set! result 'urls (cons url (assoc-ref result 'urls)))) %default-options)) (define (main args) (let ((args (parse-args (cdr args)))) (set! recurse? (assoc-ref args 'recurse)) (for-each exclude-domain! (assoc-ref args 'exclude-domains)) (for-each exclude-url! (assoc-ref args 'exclude-urls)) (for-each add-host-override! (assoc-ref args 'host-overrides)) (for-each (compose enqueue-links string->uri) (assoc-ref args 'urls)) (process-queue) (if (> error-count 0) (exit 2) (exit 0))))