[nat] wikipedia image scraper
This commit is contained in:
@@ -34,10 +34,16 @@
|
||||
(assert response.ok)
|
||||
response.text))))
|
||||
|
||||
// defmethod form of queryImageTitles: runs a "prop=images" query for pageTitles and
// flattens the title of every image of every returned page into one array.
// Calls page.images.map unconditionally, i.e. without checking that page.images is present.
(defmethod queryImageTitles [:Array<String> pageTitles]
|
||||
(flatten (for =>_id page (the haxe.DynamicAccess<Dynamic> .pages .query (query [=>"titles" pageTitles =>"prop" ["images"]])) (page.images.map ->image image.title))))
|
||||
// Look up the titles of all images listed for the given pages (query with prop=images).
// A page entry that carries no images field contributes an empty array, so it adds
// nothing to the flattened result.
(method :Array<String> queryImageTitles [:Array<String> pageTitles]
    (let [:haxe.DynamicAccess<Dynamic> pagesById
                .pages .query (query [=>"titles" pageTitles =>"prop" ["images"]])]
        (flatten
            (for =>_pageId entry pagesById
                (if entry.images
                        (entry.images.map ->img img.title)
                    [])))))
|
||||
|
||||
// defmethod form of queryImageUrls: runs a "prop=imageinfo" query with iiprop=url for
// imageTitles and flattens the url of every imageinfo entry of every returned page into one array.
// Calls image.imageinfo.map unconditionally, i.e. without checking that imageinfo is present.
(defmethod queryImageUrls [:Array<String> imageTitles]
|
||||
(flatten (for =>_id image (the haxe.DynamicAccess<Dynamic> .pages .query (query [=>"titles" imageTitles =>"prop" ["imageinfo"] =>"iiprop" ["url"]])) (image.imageinfo.map ->image image.url))))
|
||||
// Resolve image titles to their URLs (query with prop=imageinfo, iiprop=url).
// Returns one flat array holding the url of every imageinfo entry of every page
// in the response.
(method :Array<String> queryImageUrls [:Array<String> imageTitles]
    (flatten
        (for =>_id image (the haxe.DynamicAccess<Dynamic> .pages .query (query [=>"titles" imageTitles =>"prop" ["imageinfo"] =>"iiprop" ["url"]]))
            // A page entry may come back without imageinfo; contribute nothing for it
            // instead of failing on a null .map call (same guard as queryImageTitles uses for page.images).
            (if image.imageinfo
                    (image.imageinfo.map ->info info.url)
                []))))
|
||||
|
||||
// Custom HTTP headers: a User-Agent that identifies this tool with a project URL and a contact address.
// NOTE(review): presumably attached to the API requests made by this system — the use site is not visible here.
// NOTE(review): the trailing "Requests/2.26.0" names a client library and version; confirm it matches the HTTP client actually used.
(var headers [=>"User-Agent" "NatArchiveTool/0.0.0 (https://github.com/NQNStudios/kisslang/tree/main/projects/nat-archive-tool; natquaylenelson@gmail.com) Requests/2.26.0"])
|
||||
@@ -0,0 +1,10 @@
|
||||
package nat.systems;
|
||||
|
||||
import nat.systems.MediaWikiSystem;
|
||||
import kiss.Prelude;
|
||||
|
||||
using haxe.io.Path;
|
||||
using StringTools;
|
||||
|
||||
/**
 * MediaWikiSystem subclass for Wikipedia images. The Haxe body is empty;
 * members are supplied at compile time by the kiss.Kiss.build() macro
 * (presumably from the matching .kiss file — confirm).
 */
@:build(kiss.Kiss.build())
|
||||
class WikipediaImageSystem extends MediaWikiSystem {}
|
||||
@@ -0,0 +1,25 @@
|
||||
(load "../Lib.kiss")
|
||||
|
||||
// Constructor: takes no arguments and passes four fixed values to the superclass constructor.
(defNew []
|
||||
(super
|
||||
// API endpoint of English Wikipedia
"https://en.wikipedia.org/w/api.php"
|
||||
// Entry filter: matches entries tagged media that are not tagged wikipediaProcessed
->[archive e] (tagsMatch archive e "(and media !wikipediaProcessed)")
|
||||
// Function passed along with the filter (presumably run on each matching entry — confirm against the parent class)
scrapeForImages
|
||||
// NOTE(review): the meaning of this 1 is defined by the parent constructor, which is not visible here
1))
|
||||
|
||||
// Named method (rather than an inline lambda) so a user can also run it on selectedEntries instead of only on media entries
|
||||
// Downloads the images found for the entry's Name and attaches them to the entry.
// Steps: read the Name component as the page title, query image titles and then image URLs,
// download each URL with wget into archive.archiveDir, add the resulting file paths to the
// entry, and finally tag the entry wikipediaProcessed.
(method scrapeForImages [archive e]
|
||||
(let [:String title
|
||||
(readComponent archive e Name)
|
||||
:Array<String> wikipediaImageUrls
|
||||
(queryImageUrls (queryImageTitles [title]))]
|
||||
// One wget process per URL, saving into the archive directory
(doFor url wikipediaImageUrls
|
||||
(assertProcess "wget" ["--directory-prefix=${archive.archiveDir}" url]))
|
||||
// Rebuild each local path from the last segment of the URL (URL-decoded) joined onto archive.archiveDir
// NOTE(review): this assumes wget names the saved file exactly like the decoded last URL segment — confirm
(addFiles archive e
|
||||
(for url wikipediaImageUrls
|
||||
(joinPath archive.archiveDir
|
||||
// NOTE(review): the leading ~ looks like a leftover debug-print reader macro — confirm and remove if so
~(.replace
|
||||
(.urlDecode (url.withoutDirectory))
|
||||
// Some symbols shouldn't be decoded because they're invalid in file systems!
|
||||
"\"" "%22"))))
|
||||
// Tag the entry so a "!wikipediaProcessed" filter stops matching it
(addTags archive e ["wikipediaProcessed"])))
|
||||
Reference in New Issue
Block a user