[nat] wikipedia image scraper

This commit is contained in:
2021-08-02 21:34:34 -06:00
parent 7db311dffe
commit d3fb044e13
10 changed files with 68 additions and 29 deletions

View File

@@ -141,7 +141,6 @@ class CompilerTools {
command = "python";
scriptExt = "py";
if (args.langProjectFile != null) {
trace(args.langProjectFile);
// Make a virtual environment
// NOTE this is placed outside the output folder, so it will get reused.
// In some cases this might be bad if the virtual environment gets bad

View File

@@ -19,6 +19,9 @@
(systems.push system)
system)
// Run every registered system once: calls process on each element of systems,
// passing this object as the argument.
(method processSystems []
(doFor system systems (system.process this)))
(method :Entry createEntry [:Entry->Dynamic initializer] // initializer returns Dynamic so ->:Void isn't required
(let [e (_newEntry)]
(initializer e)

View File

@@ -4,6 +4,7 @@ import kiss.Prelude;
import kiss.List;
import haxe.Constraints;
import uuid.Uuid;
import nat.systems.*;
enum CommandArgType {
SelectedEntry;

View File

@@ -133,6 +133,10 @@
[&mut :Array<Entry> selectedEntries []
&mut :ChangeSet lastChangeSet []
:Map<String,Command> commands (new Map)]
// Add systems!
(archive.addSystem (new WikipediaImageSystem))
(archive.processSystems)
// Command SelectEntry: replace the current selection with the single entry e.
// NOTE(review): the trailing [] is presumably the command's result (an empty change set) -- confirm against defCommand.
(defCommand SelectEntry [e OneEntry]
(set selectedEntries [e]) [])
@@ -180,11 +184,10 @@
// Command SelectByComponents: filter archive.entries down to the entries for which
// componentsMatch holds with componentsBoolExp, and hand that list to SelectEntries.
(defCommand SelectByComponents [componentsBoolExp (Text null)]
(SelectEntries (filter archive.entries ->e (componentsMatch e componentsBoolExp))))
(defCommand AttachFiles [entries (SelectedEntries 1 null)
(defCommand AddFiles [entries (SelectedEntries 1 null)
// TODO add File and Files as an argument type for commands, ArchiveUI
// TODO make tkinter file browser externs and use tkinter as the file picking mechanism for CLI
files (VarText null)]
(doFor e entries
(doFor file files
(addFile archive e file)))
(addFiles archive e files))
entries))

View File

@@ -49,13 +49,14 @@
`(let [,@bindingList
,retValSymbol {,@body}]
,@saveList
(.refreshEntry ,archive ,e) // Check the entry in and out of systems when its components change
,retValSymbol)))
(defMacro withWritableEntry [archive e &body body]
(let [retValSymbol
(symbol)]
`(let [,retValSymbol {,@body}]
(archive.refreshEntry ,e)
(.refreshEntry ,archive ,e)
,retValSymbol)))
// Create a system that selects Entries according to a single string component (i.e. Name or Author) matching the given value
@@ -77,13 +78,14 @@
// Evaluate componentsBoolExp with BoolExpInterp.eval against the list built by iterating e.components.
// NOTE(review): the `=>cType cId` binding presumably yields each component's type name (the map key) -- confirm.
(function componentsMatch [:nat.Entry e componentsBoolExp]
(BoolExpInterp.eval componentsBoolExp (for =>cType cId e.components cType)))
(function addFile [:nat.Archive archive :nat.Entry e :String file &opt leaveOriginalCopy]
(function addFiles [:nat.Archive archive :nat.Entry e :Array<String> files &opt leaveOriginalCopy]
(withWritableEntry archive e
(let [pathWithoutDir (haxe.io.Path.withoutDirectory file)]
(unless !(= -1 (e.files.indexOf pathWithoutDir))
((if leaveOriginalCopy sys.io.File.copy sys.FileSystem.rename)
file (joinPath archive.archiveDir "files" pathWithoutDir))
(e.files.push pathWithoutDir)))))
(doFor file files
(let [pathWithoutDir (haxe.io.Path.withoutDirectory file)]
(unless !(= -1 (e.files.indexOf pathWithoutDir))
((if leaveOriginalCopy sys.io.File.copy sys.FileSystem.rename)
file (joinPath archive.archiveDir "files" pathWithoutDir))
(e.files.push pathWithoutDir))))))
(function addTags [:nat.Archive archive :nat.Entry e :Array<String> tagsToAdd]
(if (hasComponent e Tags)

View File

@@ -34,10 +34,16 @@
(assert response.ok)
response.text))))
(defmethod queryImageTitles [:Array<String> pageTitles]
(flatten (for =>_id page (the haxe.DynamicAccess<Dynamic> .pages .query (query [=>"titles" pageTitles =>"prop" ["images"]])) (page.images.map ->image image.title))))
// Collect the titles of the images listed on the given pages.
// Calls query with prop=images for pageTitles, iterates the pages object found under
// the result's query field, and flattens the per-page title lists into one array.
(method :Array<String> queryImageTitles [:Array<String> pageTitles]
(flatten
(for =>_id page (the haxe.DynamicAccess<Dynamic> .pages .query (query [=>"titles" pageTitles =>"prop" ["images"]]))
// A page without an images field contributes an empty list
(if page.images
(page.images.map ->image image.title)
[]))))
(defmethod queryImageUrls [:Array<String> imageTitles]
(flatten (for =>_id image (the haxe.DynamicAccess<Dynamic> .pages .query (query [=>"titles" imageTitles =>"prop" ["imageinfo"] =>"iiprop" ["url"]])) (image.imageinfo.map ->image image.url))))
// Resolve image (file) titles to the URLs of the underlying files.
// Calls query once with prop=imageinfo and iiprop=url for imageTitles, iterates the pages
// object found under the result's query field, and flattens the url of every imageinfo
// record into one array.
(method :Array<String> queryImageUrls [:Array<String> imageTitles]
(flatten
(for =>_id image (the haxe.DynamicAccess<Dynamic> .pages .query (query [=>"titles" imageTitles =>"prop" ["imageinfo"] =>"iiprop" ["url"]]))
// A returned page may carry no imageinfo field (e.g. a title that does not resolve to a file).
// Contribute no URLs for it instead of calling .map on null, mirroring the page.images guard in queryImageTitles.
(if image.imageinfo
(image.imageinfo.map ->image image.url)
[]))))
(var headers [=>"User-Agent" "NatArchiveTool/0.0.0 (https://github.com/NQNStudios/kisslang/tree/main/projects/nat-archive-tool; natquaylenelson@gmail.com) Requests/2.26.0"])

View File

@@ -0,0 +1,10 @@
package nat.systems;
import nat.systems.MediaWikiSystem;
import kiss.Prelude;
using haxe.io.Path;
using StringTools;
/**
 * MediaWikiSystem subclass for Wikipedia images.
 * The Haxe body is intentionally empty: members are generated at compile time by the
 * kiss.Kiss.build() macro (presumably from the matching WikipediaImageSystem.kiss file -- confirm).
 */
@:build(kiss.Kiss.build())
class WikipediaImageSystem extends MediaWikiSystem {}

View File

@@ -0,0 +1,25 @@
(load "../Lib.kiss")
// Constructor: takes no arguments and configures the parent via super with, in order:
// - the English Wikipedia API endpoint URL
// - a function of (archive, e) that calls tagsMatch with "(and media !wikipediaProcessed)"
//   (presumably the filter deciding which entries this system handles -- confirm in MediaWikiSystem)
// - scrapeForImages (presumably the per-entry processor -- confirm in MediaWikiSystem)
// - the number 1 -- NOTE(review): meaning not visible here; check MediaWikiSystem's constructor
(defNew []
(super
"https://en.wikipedia.org/w/api.php"
->[archive e] (tagsMatch archive e "(and media !wikipediaProcessed)")
scrapeForImages
1))
// Download the images Wikipedia lists for the page titled with e's Name component, attach them to e,
// and tag e as wikipediaProcessed.
// Kept as a named method in case a user will want to run it on selectedEntries instead of on media entries
(method scrapeForImages [archive e]
(let [:String title
(readComponent archive e Name)
:Array<String> wikipediaImageUrls
(queryImageUrls (queryImageTitles [title]))]
// Fetch each image URL with wget, using archive.archiveDir as the download directory
(doFor url wikipediaImageUrls
(assertProcess "wget" ["--directory-prefix=${archive.archiveDir}" url]))
// Attach the downloads: for each URL, pass the path built from archive.archiveDir and the
// URL-decoded last path segment of the URL
(addFiles archive e
(for url wikipediaImageUrls
(joinPath archive.archiveDir
// NOTE(review): confirm what the leading ~ does here (possibly a debug print left in)
~(.replace
(.urlDecode (url.withoutDirectory))
// Some symbols shouldn't be decoded because they're invalid in file systems!
"\"" "%22"))))
// Add the wikipediaProcessed tag, which the constructor's tagsMatch expression excludes (!wikipediaProcessed)
(addTags archive e ["wikipediaProcessed"])))

View File

@@ -1,11 +1 @@
{
"id": "song1",
"components": [
["Name", "name1"],
["Author", "author1"],
["Tags", "tags1"]
],
"files": [
"Adventure.mp3"
]
}
{"components":[["Tags","tags1"],["Name","name1"],["Author","author1"]],"files":["Adventure.mp3"],"id":"song1"}

View File

@@ -2,7 +2,7 @@
# Run these tests on every target that could be used for a NAT front-end
# (also to test (#extern) on multiple targets)
haxe test.hxml py.hxml &&
haxe test.hxml js.hxml &&
haxe test.hxml cpp.hxml &&
# haxe test.hxml py.hxml &&
# haxe test.hxml js.hxml &&
# haxe test.hxml cpp.hxml &&
haxe test.hxml --interp