From f52c1882f105fe4cf7e84fbb3705c87ce92b143b Mon Sep 17 00:00:00 2001
From: Ray Miller
Date: Mon, 13 May 2024 09:28:01 +0100
Subject: [PATCH] Add a filter to ensure we only enqueue http urls.

---
 crawler.rkt | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/crawler.rkt b/crawler.rkt
index ddaf643..61b0595 100644
--- a/crawler.rkt
+++ b/crawler.rkt
@@ -8,11 +8,13 @@
 (provide crawl host=? delay-upto)
 
 (define (url-without-fragment u)
-  (url->string (struct-copy url u (fragment #f))))
+  (struct-copy url u (fragment #f)))
 
 (define (extract-links url x)
-  (list->set (map (lambda (u) (url-without-fragment (combine-url/relative url u)))
-                  (se-path*/list '(a @ href) x))))
+  (list->set (map url->string
+                  (filter (lambda (u) (or (string=? (url-scheme u) "http") (string=? (url-scheme u) "https")))
+                          (map (lambda (u) (url-without-fragment (combine-url/relative url u)))
+                               (se-path*/list '(a @ href) x))))))
 
 (define (process url handler)
   (match (http:get url)