Implement a basic web crawler.

This commit is contained in:
Ray Miller 2024-05-12 15:25:31 +01:00
parent 49f5b85e22
commit 64cefb1f9a
2 changed files with 40 additions and 1 deletions

39
crawler.rkt Normal file
View file

@ -0,0 +1,39 @@
#lang racket
(require (prefix-in http: net/http-easy)
net/url
html-parsing
xml/path)
(provide crawl host=? delay-upto)
;; Render the url struct `u` as a string with its #fragment part removed.
;; Every other component of `u` is carried over unchanged.
(define (url-without-fragment u)
  (let ([stripped (struct-copy url u [fragment #f])])
    (url->string stripped)))
;; Collect the targets of every <a href="..."> found in the parsed page `x`.
;;
;; url : url struct of the page itself; each href is resolved relative to it.
;; x   : the parsed document that se-path*/list is run over.
;;
;; Returns a set of URL strings with fragments removed. Hrefs that cannot be
;; resolved are skipped.
(define (extract-links url x)
  (define (resolve href)
    ;; combine-url/relative raises on an empty or malformed href. One bad
    ;; link must not discard every other link on the page, so map it to #f
    ;; and let filter-map drop it.
    (with-handlers ([exn:fail? (lambda (_) #f)])
      (url-without-fragment (combine-url/relative url href))))
  (list->set (filter-map resolve (se-path*/list '(a @ href) x))))
;; Fetch `url` and, when it is an HTML page, hand it to `handler` and return
;; the links it contains.
;;
;; url     : string; the address to GET.
;; handler : procedure called as (handler url x), where x is the parsed page.
;;
;; Returns the set of link strings produced by extract-links for a 200
;; response whose content-type matches text/html; otherwise the empty set.
(define (process url handler)
  (define resp (http:get url))
  (match resp
    [(http:response #:status-code 200
                    #:headers ((content-type (regexp #"text/html")))
                    #:body body)
     ;; Substitute U+FFFD for invalid byte sequences so that a page served in
     ;; a non-UTF-8 encoding does not raise and abort the caller.
     (let ([x (html->xexp (bytes->string/utf-8 body #\uFFFD))])
       (handler url x)
       (extract-links (string->url url) x))]
    [_
     ;; The body was never read on this path, so release the connection
     ;; explicitly instead of leaving it open.
     (http:response-close! resp)
     ;; Must be a hash set, not '(): the result is combined with hash sets
     ;; via set-subtract, which rejects mixed set representations.
     (set)]))
;; Build a predicate that accepts a URL string exactly when its host
;; component equals `host`.
;;
;; host : string; the host name to match.
;;
;; Returns a procedure from URL string to boolean.
(define (host=? host)
  (lambda (u)
    ;; url-host is #f for URLs without a host (e.g. mailto: links). equal?
    ;; treats that as "no match", where string=? would raise a contract error.
    (equal? host (url-host (string->url u)))))
;; Build a thunk that, each time it is called, sleeps for (random n) seconds.
;; A fresh random duration is drawn on every call.
(define ((delay-upto n))
  (define seconds (random n))
  (sleep seconds))
;; Crawl outward from `url`, calling `handler` on each page via `process`.
;;
;; url      : string; the starting address.
;; handler  : passed through to `process` for every visited page.
;; #:limit  : when not #f, stop once this many pages have been visited.
;; #:delay  : thunk invoked after each visited page, before continuing.
;; #:wanted?: predicate on a URL string; URLs it rejects are dropped from the
;;            work set without being fetched or counted as visited.
;;
;; Returns void once the work set is empty or the limit is reached.
(define (crawl url handler
               #:limit [limit #f]
               #:delay [delay (lambda () #f)]
               #:wanted? [wanted? (lambda (url) #t)])
  (let loop ([pending (set url)]
             [visited (set)])
    (cond
      [(set-empty? pending) (void)]
      [(and limit (>= (set-count visited) limit)) (void)]
      [else
       (define current (set-first pending))
       (define remaining (set-rest pending))
       (cond
         [(wanted? current)
          (define found (process current handler))
          (define visited* (set-add visited current))
          (delay)
          ;; Queue only links that have not been visited yet.
          (loop (set-union remaining (set-subtract found visited*))
                visited*)]
         [else (loop remaining visited)])])))

View file

@ -1,6 +1,6 @@
#lang info #lang info
(define collection "racket-utils") (define collection "racket-utils")
(define deps '("base" "http-easy-lib")) (define deps '("base" "http-easy-lib" "html-parsing"))
(define build-deps '("scribble-lib" "racket-doc" "rackunit-lib")) (define build-deps '("scribble-lib" "racket-doc" "rackunit-lib"))
;;(define scribblings '(("scribblings/tryme.scrbl" ()))) ;;(define scribblings '(("scribblings/tryme.scrbl" ())))
(define pkg-desc "Some utilities for working with racket") (define pkg-desc "Some utilities for working with racket")