Script to de-blogger markdown.
This script downloads images from blogger and replaces all the image links in the markdown files with links to the local (just-downloaded) files. It also removes Amazon associate links and tracking and replaces links to the orginal Blogger blog with relative links to the new blog.
This commit is contained in:
parent
26bf41e073
commit
b2e883105c
1 changed files with 115 additions and 0 deletions
115
guile/de-blogger.scm
Executable file
115
guile/de-blogger.scm
Executable file
|
@ -0,0 +1,115 @@
|
||||||
|
#!/usr/bin/env -S guile -e main -s
|
||||||
|
!#
|
||||||
|
|
||||||
|
(use-modules (srfi srfi-26)
|
||||||
|
(srfi srfi-71)
|
||||||
|
(srfi srfi-197)
|
||||||
|
(ice-9 regex)
|
||||||
|
(ice-9 textual-ports)
|
||||||
|
(ice-9 binary-ports)
|
||||||
|
(ice-9 ftw)
|
||||||
|
(ice-9 format)
|
||||||
|
(ice-9 string-fun)
|
||||||
|
(web client)
|
||||||
|
(web response))
|
||||||
|
|
||||||
|
(define base-dir "/home/ray/Workspace/personal/start-again-at-zero/")
|
||||||
|
(define posts-dir (string-append base-dir "content/posts/"))
|
||||||
|
(define image-dir (string-append base-dir "static/img/"))
|
||||||
|
|
||||||
|
(define md-img-rx (make-regexp "!\\[[^]]*\\]\\((https?[^)]+)\\)"))
|
||||||
|
|
||||||
|
(define md-img-link-rx (make-regexp "\\((https?[^)]+\\.(png|jpg))\\)" regexp/icase))
|
||||||
|
|
||||||
|
(define img-src-rx (make-regexp "<img src=\"([^\"]+)\""))
|
||||||
|
|
||||||
|
(define amazon-assoc-img-rx (make-regexp "!\\[\\]\\(http://www.assoc-amazon[^)]+\\)"))
|
||||||
|
|
||||||
|
(define (http-get-follow-redirects url)
|
||||||
|
(let ((res body (http-get url #:streaming? #t)))
|
||||||
|
(if (<= 300 (response-code res) 399)
|
||||||
|
(http-get-follow-redirects (response-location res))
|
||||||
|
(if (eq? (car (response-content-type res)) 'text/html)
|
||||||
|
;; Work-around for Google wrapping the image in HTML. Gah!
|
||||||
|
(let* ((content (get-string-all body))
|
||||||
|
(new-url (match:substring (regexp-exec img-src-rx content) 1)))
|
||||||
|
(close-port body)
|
||||||
|
(http-get-follow-redirects new-url))
|
||||||
|
(values res body)))))
|
||||||
|
|
||||||
|
(define download-image
|
||||||
|
(let ((image-num 1)
|
||||||
|
(images (make-hash-table)))
|
||||||
|
(lambda (url)
|
||||||
|
(format #t "download-image ~a~%" url)
|
||||||
|
(if (hash-ref images url)
|
||||||
|
(hash-ref images url)
|
||||||
|
(let* ((res body (http-get-follow-redirects url))
|
||||||
|
(suffix (case (car (response-content-type res))
|
||||||
|
((image/jpeg) ".jpg")
|
||||||
|
((image/png) ".png")
|
||||||
|
(else (error (format #f "download error: unexpected content-type for ~s" url)))))
|
||||||
|
(filename (format #f "IMG_~4,'0d~a" image-num suffix)))
|
||||||
|
(hash-set! images url filename)
|
||||||
|
(set! image-num (1+ image-num))
|
||||||
|
(call-with-output-file (string-append image-dir filename)
|
||||||
|
(lambda (port)
|
||||||
|
(let loop ((data (get-bytevector-some body)))
|
||||||
|
(unless (eof-object? data)
|
||||||
|
(put-bytevector port data)
|
||||||
|
(loop (get-bytevector-some body))))))
|
||||||
|
(close-port body)
|
||||||
|
filename)))))
|
||||||
|
|
||||||
|
(define (replace-images doc)
|
||||||
|
(for-each (lambda (m)
|
||||||
|
(let* ((url (match:substring m 1))
|
||||||
|
(filename (download-image url))
|
||||||
|
(new-url (string-append "/img/" filename)))
|
||||||
|
(set! doc (string-replace-substring doc url new-url))))
|
||||||
|
(list-matches md-img-rx doc))
|
||||||
|
doc)
|
||||||
|
|
||||||
|
(define (replace-links doc)
|
||||||
|
(for-each (lambda (m)
|
||||||
|
(let* ((url (match:substring m 1))
|
||||||
|
(filename (download-image url))
|
||||||
|
(new-url (string-append "/img/" filename)))
|
||||||
|
(set! doc (string-replace-substring doc url new-url))))
|
||||||
|
(list-matches md-img-link-rx doc))
|
||||||
|
doc)
|
||||||
|
|
||||||
|
(define (replace-amazon-tracking-images doc)
|
||||||
|
(for-each (lambda (m)
|
||||||
|
(set! doc (string-replace-substring doc (match:substring m) "")))
|
||||||
|
(list-matches amazon-assoc-img-rx doc))
|
||||||
|
doc)
|
||||||
|
|
||||||
|
(define link-rx (make-regexp "\\[([^]]+)\\]\\(([^)]+)\\)"))
|
||||||
|
|
||||||
|
(define (remove-amazon-links doc)
|
||||||
|
(for-each (lambda (m)
|
||||||
|
(when (string-contains (match:substring m 2) "staagaatzer-21")
|
||||||
|
(set! doc (string-replace-substring doc (match:substring m 0) (match:substring m 1)))))
|
||||||
|
(list-matches link-rx doc))
|
||||||
|
doc)
|
||||||
|
|
||||||
|
(define (replace-self-links doc)
|
||||||
|
(string-replace-substring doc
|
||||||
|
"http://startagainatzero.blogspot.com/"
|
||||||
|
"/"))
|
||||||
|
|
||||||
|
(define (process-file path)
|
||||||
|
(format #t "process-file ~a~%" path)
|
||||||
|
(let ((doc (chain (call-with-input-file path get-string-all)
|
||||||
|
(replace-amazon-tracking-images _)
|
||||||
|
(remove-amazon-links _)
|
||||||
|
(replace-images _)
|
||||||
|
(replace-links _)
|
||||||
|
(replace-self-links _))))
|
||||||
|
(call-with-output-file path (cut put-string <> doc))))
|
||||||
|
|
||||||
|
(define (main args)
|
||||||
|
(for-each process-file
|
||||||
|
(map (cute string-append posts-dir <>)
|
||||||
|
(scandir posts-dir (cute string-suffix? ".md" <>)))))
|
Loading…
Reference in a new issue