[nat] wikipedia image scraper

This commit is contained in:
2021-08-02 21:34:34 -06:00
parent 7db311dffe
commit d3fb044e13
10 changed files with 68 additions and 29 deletions

View File

@@ -0,0 +1,25 @@
(load "../Lib.kiss")
// Constructor: takes no arguments of its own and forwards four fixed values to the superclass constructor.
// NOTE(review): the superclass is not visible from this file, so the role of each argument is inferred from its shape -- confirm against the parent class.
(defNew []
    (super
        // The English Wikipedia api.php endpoint
        "https://en.wikipedia.org/w/api.php"
        // Lambda over (archive, entry): checks the entry's tags against "media and not wikipediaProcessed"
        ->[archive entry]
            (tagsMatch archive entry "(and media !wikipediaProcessed)")
        // This class's scrapeForImages method, passed by name
        scrapeForImages
        // NOTE(review): the meaning of this 1 is not visible here -- confirm against the superclass constructor
        1))
// Looks up Wikipedia image URLs for an entry's Name component, downloads each one with wget into
// the archive directory, adds the resulting file paths to the entry, and tags it "wikipediaProcessed".
// Kept as a named method (rather than an inline lambda) so that a user can also run it on
// selectedEntries instead of only on media entries.
(method scrapeForImages [archive e]
    (let [:String entryName
                (readComponent archive e Name)
            :Array<String> imageUrls
                (queryImageUrls (queryImageTitles [entryName]))]
        // First pass: download every image into archive.archiveDir
        (doFor imageUrl imageUrls
            (assertProcess "wget" ["--directory-prefix=${archive.archiveDir}" imageUrl]))
        // Second pass: build the path each download is expected to have and add it to the entry
        (addFiles archive e
            (for imageUrl imageUrls
                (joinPath archive.archiveDir
                    // NOTE(review): the leading ~ looks like Kiss's print-debugging reader macro left in
                    // from development -- confirm whether the trace output is still wanted.
                    ~(.replace
                        (.urlDecode (imageUrl.withoutDirectory))
                        // Some symbols must stay percent-encoded because they are invalid in file systems,
                        // so a decoded double quote is turned back into %22.
                        "\"" "%22"))))
        // Tag the entry so the "!wikipediaProcessed" filter no longer matches it
        (addTags archive e ["wikipediaProcessed"])))