racket-utils/crawler.rkt

42 lines
1.6 KiB
Racket
Raw Permalink Normal View History

2024-05-12 15:25:31 +01:00
#lang racket
(require (prefix-in http: net/http-easy)
net/url
html-parsing
xml/path)
(provide crawl host=? delay-upto)
(define (url-without-fragment u)
(struct-copy url u (fragment #f)))
2024-05-12 15:25:31 +01:00
2024-05-13 16:09:42 +01:00
(define (extract-links base-url xpr)
(list->set (map url->string
(filter (lambda (u) (or (string=? (url-scheme u) "http") (string=? (url-scheme u) "https")))
2024-05-13 16:09:42 +01:00
(map (lambda (u) (url-without-fragment (combine-url/relative base-url u)))
(se-path*/list '(a @ href) xpr))))))
2024-05-12 15:25:31 +01:00
(define (process url handler)
(match (http:get url)
((http:response #:status-code 200 #:headers ((content-type (regexp #"text/html"))) #:body body)
(let ((x (html->xexp (bytes->string/utf-8 body))))
(handler url x)
(extract-links (string->url url) x)))
(_ (set))))
2024-05-12 15:25:31 +01:00
(define (host=? host)
(lambda (u) (string=? host (url-host (string->url u)))))
(define (delay-upto n)
(lambda () (sleep (random n))))
(define (crawl url handler #:limit (limit #f) #:delay (delay (lambda () #f)) #:wanted? (wanted? (lambda (url) #t)))
2024-05-13 16:09:42 +01:00
(let crawl ((frontier (set url)) (visited (set)))
(unless (or (set-empty? frontier) (and limit (>= (set-count visited) limit)))
(if (not (wanted? (set-first frontier)))
(crawl (set-rest frontier) visited)
(let ((links (process (set-first frontier) handler))
(visited (set-add visited (set-first frontier))))
2024-05-12 15:25:31 +01:00
(delay)
2024-05-13 16:09:42 +01:00
(crawl (set-union (set-rest frontier) (set-subtract links visited)) visited))))))