diff --git a/crawler.rkt b/crawler.rkt
new file mode 100644
index 0000000..4f9693f
--- /dev/null
+++ b/crawler.rkt
@@ -0,0 +1,39 @@
+#lang racket
+
+(require (prefix-in http: net/http-easy)
+         net/url
+         html-parsing
+         xml/path)
+
+(provide crawl host=? delay-upto)
+
+(define (url-without-fragment u)
+  (url->string (struct-copy url u (fragment #f))))
+
+(define (extract-links url x)
+  (list->set (map (lambda (u) (url-without-fragment (combine-url/relative url u)))
+                  (se-path*/list '(a @ href) x))))
+
+(define (process url handler)
+  (match (http:get url)
+    ((http:response #:status-code 200 #:headers ((content-type (regexp #"text/html"))) #:body body)
+     (let ((x (html->xexp (bytes->string/utf-8 body))))
+       (handler url x)
+       (extract-links (string->url url) x)))
+    (_ '())))
+
+(define (host=? host)
+  (lambda (u) (string=? host (url-host (string->url u)))))
+
+(define (delay-upto n)
+  (lambda () (sleep (random n))))
+
+(define (crawl url handler #:limit (limit #f) #:delay (delay (lambda () #f)) #:wanted? (wanted? (lambda (url) #t)))
+  (let crawl ((todo (set url)) (seen (set)))
+    (unless (or (set-empty? todo) (and limit (>= (set-count seen) limit)))
+      (if (not (wanted? (set-first todo)))
+          (crawl (set-rest todo) seen)
+          (let ((links (process (set-first todo) handler))
+                (seen (set-add seen (set-first todo))))
+            (delay)
+            (crawl (set-union (set-rest todo) (set-subtract links seen)) seen))))))
diff --git a/info.rkt b/info.rkt
index 9d6a5a6..bcfb03f 100644
--- a/info.rkt
+++ b/info.rkt
@@ -1,6 +1,6 @@
 #lang info
 (define collection "racket-utils")
-(define deps '("base" "http-easy-lib"))
+(define deps '("base" "http-easy-lib" "html-parsing"))
 (define build-deps '("scribble-lib" "racket-doc" "rackunit-lib"))
 ;;(define scribblings '(("scribblings/tryme.scrbl" ())))
 (define pkg-desc "Some utilities for working with racket")
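
Usage sketch (not part of the patch): one way the crawl API added above might be
wired up, assuming crawler.rkt is requirable from the current directory. The target
URL, host string, and handler body are placeholders for illustration, not anything
defined in this change.

#lang racket
(require "crawler.rkt")

;; example.com is a placeholder host.
;; Visit up to 50 pages on the same host, pausing between requests
;; (delay-upto calls (random n), so (delay-upto 3) sleeps 0, 1, or 2 seconds),
;; and print the URL of each page that came back as HTML.
(crawl "https://example.com/"
       (lambda (url xexp) (printf "fetched ~a~n" url))
       #:limit 50
       #:wanted? (host=? "example.com")
       #:delay (delay-upto 3))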