Bad NLP Util class

This commit is contained in:
2023-03-20 19:57:21 -06:00
parent 2cdbf35e3e
commit 99727e3701
6 changed files with 68 additions and 56 deletions

View File

@@ -6,7 +6,6 @@ import yaml.Yaml;
import yaml.Parser;
import yaml.util.ObjectMap;
import sys.FileSystem;
import bad_nlp.Names;
using StringTools;
using hx.strings.Strings;

View File

@@ -21,69 +21,18 @@
(dictSet loadedNames (.next (map.keys)) true))))))
(dictSet loadedNameFiles file true))
// Quote and bracket characters that normalize removes from either end of a token.
(var quotesAndThings ["\"" "'" "`" "[" "]" "(" ")"])
// Suffixes that normalize checks for at the end of a token.
// The final entry, "'s", is the possessive.
(var punctuation ["," "." ";" ":" "-" "!" "?" "'s"])
// TODO this isn't specific to Names
// Split text on every delimiter in delims, in list order: the text is split on the
// first delimiter, then each piece is split recursively on the remaining ones.
// Returns the flattened list of pieces, or [text] once no delimiters are left
// (the (if delims ...) test relies on Kiss treating an empty list as falsy).
// The array passed as delims is never modified.
(function :Array<String> splitByAll [:String text :Array<String> delims]
    (if delims
            // Shift from a private copy. Shifting delims itself would consume the
            // caller's array, so a shared delimiter list would lose one entry per call.
            (let [remaining (delims.copy)
                    next (remaining.shift)
                    tokens (text.split next)]
                // Recursive calls copy before shifting, so remaining can be shared by all pieces.
                (flatten (for token tokens (splitByAll token remaining))))
        [text]))
// TODO this isn't specific to Names
// Clean up a raw token: peel quote/bracket characters off both ends and punctuation
// off the end, one character per recursive step, then optionally lower-case the rest.
//   token   - the raw token
//   toLower - when true, the cleaned token is returned lower-cased
(function normalize [:String token :Bool toLower]
    (let [leadingQuote (apply or (for q quotesAndThings (token.startsWith q)))
            // Trailing quotes and trailing punctuation are handled the same way,
            // so they are checked together.
            trailers (quotesAndThings.concat punctuation)
            trailingJunk (apply or (for t trailers (token.endsWith t)))]
        (cond
            // Drop one leading quote-like character and try again
            (leadingQuote
                (normalize (token.substr 1) toLower))
            // Drop the final character and try again. "'s" comes off over two
            // steps: the s as punctuation, then the apostrophe as a quote.
            (trailingJunk
                (normalize (substr token 0 -1) toLower))
            // Nothing left to strip
            (toLower
                (token.toLowerCase))
            (true
                token))))
// Report whether token, compared in lower case, is present in loadedNames.
(function isName [:String token]
    (let [lowered (token.toLowerCase)]
        // Presumably loads whichever name files could contain this token before
        // the lookup - confirm against loadFilesForToken.
        (loadFilesForToken lowered)
        (loadedNames.exists lowered)))
// Separators handed to splitByAll when breaking text into tokens.
(var delimiters [" " "\n" "--" "/"])
(function containsName [:String text]
(doFor token (splitByAll text delimiters)
(let [t (normalize token false)]
(doFor token (Util.splitTokens text)
(let [t (Util.normalize token false)]
(when (and (.isUpperCase (t.substr 0 1)) (isName t))
(return true))))
false)
(function findNames [:String text]
(.map (filter (splitByAll text delimiters) ->t (containsName t))
->t (normalize t false)))
(.map (filter (Util.splitTokens text) ->t (containsName t))
->t (Util.normalize t false)))

View File

@@ -0,0 +1,9 @@
package bad_nlp;
import kiss.Prelude;
import kiss.List;
using StringTools;
using hx.strings.Strings;
/**
 * Shared token helpers for bad_nlp.
 *
 * The Haxe body is intentionally empty: the members are supplied at compile time by
 * the kiss.Kiss.build() build macro (presumably from the companion Util.kiss file -
 * TODO confirm the lookup convention).
 */
@:build(kiss.Kiss.build())
class Util {}

View File

@@ -0,0 +1,51 @@
// Quote and bracket characters that normalize removes from either end of a token.
(var quotesAndThings ["\"" "'" "`" "[" "]" "(" ")"])
// Suffixes that normalize checks for at the end of a token.
// The final entry, "'s", is the possessive.
(var punctuation ["," "." ";" ":" "-" "!" "?" "'s"])
// Separators that splitTokens hands to splitByAll when breaking text into tokens.
(var delimiters [" " "\n" "--" "/"])
// Split text on every delimiter in delims, in list order: the text is split on the
// first delimiter, then each piece is split recursively on the remaining ones.
// Returns the flattened list of pieces, or [text] once no delimiters are left
// (the (if delims ...) test relies on Kiss treating an empty list as falsy).
// The array passed as delims is never modified.
(function :Array<String> splitByAll [:String text :Array<String> delims]
    (if delims
            // Shift from a private copy. Shifting delims itself would consume the
            // caller's array, so a shared delimiter list would lose one entry per call.
            (let [remaining (delims.copy)
                    next (remaining.shift)
                    tokens (text.split next)]
                // Recursive calls copy before shifting, so remaining can be shared by all pieces.
                (flatten (for token tokens (splitByAll token remaining))))
        [text]))
// Split text into raw tokens on each entry of the shared delimiters list.
(function splitTokens [:String text]
    // Pass a copy so the static delimiters list can never be altered by the
    // splitting code; repeated calls then always split on the full set.
    (splitByAll text (delimiters.copy)))
// Clean up a raw token: peel quote/bracket characters off both ends and punctuation
// off the end, one character per recursive step, then optionally lower-case the rest.
//   token   - the raw token
//   toLower - when true, the cleaned token is returned lower-cased
(function normalize [:String token :Bool toLower]
    (let [leadingQuote (apply or (for q quotesAndThings (token.startsWith q)))
            // Trailing quotes and trailing punctuation are handled the same way,
            // so they are checked together.
            trailers (quotesAndThings.concat punctuation)
            trailingJunk (apply or (for t trailers (token.endsWith t)))]
        (cond
            // Drop one leading quote-like character and try again
            (leadingQuote
                (normalize (token.substr 1) toLower))
            // Drop the final character and try again. "'s" comes off over two
            // steps: the s as punctuation, then the apostrophe as a quote.
            (trailingJunk
                (normalize (substr token 0 -1) toLower))
            // Nothing left to strip
            (toLower
                (token.toLowerCase))
            (true
                token))))

View File

@@ -1,5 +1,6 @@
-lib kiss
-lib kiss-vscode-api
-lib bad-nlp
-cp src
-dce full
-D analyzer-optimize

View File

@@ -7,6 +7,9 @@ import kiss.KissInterp;
using haxe.io.Path;
using StringTools;
import bad_nlp.Util;
import bad_nlp.Names;
typedef KTxt2Block = {
source:String,
output:String,