Merge commit '2af495d056616cc0f757a055114b56df2e0d5d84' as 'projects/bad-nlp/name-database'

This commit is contained in:
2023-03-20 18:03:18 -06:00
commit 2722c1ddc3
669 changed files with 423076 additions and 0 deletions

View File

@@ -0,0 +1,53 @@
require 'sequel'
# Dumps every given name, together with its flattened metadata, into the
# SQLite database prepared by the export:sqlite:db / export:sqlite:schema
# prerequisites (which are expected to have set @db and @output).
task "export:sqlite" => [:db, "export:sqlite:schema"] do
  # first insert the given_names
  @names = @output[:names]
  @meta = @output[:metadata]
  puts "dumping given names"
  @db.given_names.each do |entry|
    # The insert returns the id of the new names row; it must be stored on
    # each metadata row, otherwise the metadata cannot be joined back to
    # the name it describes.
    name_id = @names.insert :name => entry.name, :type => "given"
    entry.metadata_without_nesting.each do |key, value|
      @meta.insert :name_id => name_id, :key => key, :value => value
    end
  end
  # Family names are not exported yet; the disabled draft is kept as-is.
  # puts "dumping family names"
  # @db.family_names.each do |entry|
  # @names << {:name => entry.name, :type => "family"}
  # end
end
namespace "export:sqlite" do
  # Opens the SQLite output database and stores the handle in @output.
  # The number of *.sqlite files already present in out/ is appended to the
  # file name (names.sqlite, names.1.sqlite, ...) when it is greater than zero.
  task :db => "out" do
    prior_exports = Dir["out/*.sqlite"].length
    suffix = prior_exports > 0 ? ".#{prior_exports}" : ""
    @output = Sequel.sqlite("out/names#{suffix}.sqlite")
  end

  # Creates the two tables the export writes to: names and metadata.
  task :schema => "export:sqlite:db" do
    @output.create_table(:names) do
      primary_key :id
      String :name
      String :type
      index [:type, :name]
    end

    # metadata rows reference a names row through name_id
    @output.create_table(:metadata) do
      foreign_key :name_id, :names
      String :key
      String :value
      index [:name_id, :key]
    end
  end
end

View File

@@ -0,0 +1,59 @@
# Public entry point for the 1990 census import; the work itself is done by
# the import:census1990:run task.
desc "imports the census 1990 data files and merges the data with the existing data files"
task "import:census1990" => "import:census1990:run"
namespace "import:census1990" do
  # Calls the given block with the first whitespace-separated field of every
  # line in the file at +path+.
  each_first_field = lambda do |path, &visit|
    open(path, 'r') do |file|
      file.each { |line| visit.call(line.split(/\s+/).first) }
    end
  end

  # Records +gender+ on every given name listed in +path+. A name already
  # marked with +opposite+ (or already "unisex") becomes "unisex".
  record_gender = lambda do |path, gender, opposite|
    each_first_field.call(path) do |name|
      entry = @db.given_names.get name
      seen_for_both = [opposite, "unisex"].include?(entry.meta[:gender])
      entry.meta[:gender] = seen_for_both ? "unisex" : gender
    end
  end

  # Runs all three imports, then writes the database once.
  task :run => [:db, "import:census1990:male", "import:census1990:female", "import:census1990:family"] do
    @db.write
  end

  task :male => :db do
    record_gender.call("sources/census-1990/dist.male.first", "male", "female")
  end

  task :female => :db do
    record_gender.call("sources/census-1990/dist.female.first", "female", "male")
  end

  # Family names only need to be looked up; no metadata is recorded here.
  task :family => :db do
    each_first_field.call("sources/census-1990/dist.all.last") do |name|
      @db.family_names.get name
    end
  end
end

View File

@@ -0,0 +1,20 @@
# Public entry point for the 2000 census import.
desc "imports the census 2000 data files and merged the data with the existing data files"
task "import:census2000" => "import:census2000:run"

namespace "import:census2000" do
  # Looks up the first CSV column of every data row as a family name, then
  # writes the database.
  task :run => :db do
    open("sources/census-2000/app_c.csv", 'r') do |csv|
      csv.each_with_index do |row, row_number|
        next if row_number.zero? # the first line is the column header
        @db.family_names.get row.split(",").first
      end
    end
    @db.write
  end
end

View File

@@ -0,0 +1,42 @@
# The "out" directory task creates the output directory on demand.
directory "out"

# Removes the output directory and everything inside it.
task("clean") { FileUtils.rm_rf("out") }
%w[given family].each do |set|
  # name:<set>:<name> adds <name> to the YAML file that path_for_name picks
  # for it, unless the file already lists that name, and keeps the file
  # sorted by entry name.
  rule Regexp.new("name:#{set}:[a-z]+") do |t|
    name = t.name.gsub("name:#{set}:", '')
    path = path_for_name(set, name)
    # run the task registered for the data file before reading it
    Rake::Task[path].invoke
    entries = YAML.load(IO.read(path)) || []
    unless entry_names(entries).include?(name)
      entries << name
      entries.sort! { |left, right| entry_name(left) <=> entry_name(right) }
      open(path, 'w') { |file| file.puts YAML.dump(entries) }
    end
  end

  # opens a textmate window to the appropriate file and line of the name specified
  # NOTE(review): the body is empty, so this currently only triggers the
  # matching name:<set>:<name> rule as a prerequisite.
  rule Regexp.new("mate:#{set}:[a-z]+") => [proc { |task_name| task_name.gsub("mate:", "name:") }] do |t|
  end
end
# Default task: looks up the family name "smith" in the database and then
# writes the database.
task :default => :db do
  @db.family_names.get("smith")
  @db.write
end
# Loads the name-database library from ../lib (relative to this file) and
# stores a NameDatabase for "." in @db for the other tasks to use.
task :db do
  lib_dir = "#{File.dirname(__FILE__)}/../lib"
  $LOAD_PATH.unshift lib_dir
  require 'name-database'
  @db = NameDatabase.new(".")
end

View File

@@ -0,0 +1,13 @@
# Declares one file task per name set and two-letter prefix ("aa".."zz"),
# e.g. given_name/a/ab.yml. Running such a task writes a stub YAML file
# consisting of a document marker and a comment line.
["given", "family"].each do |set|
  ("aa".."zz").each do |prefix|
    # prefix[0, 1] is the first letter; core String has no #first method,
    # so this must not depend on ActiveSupport being loaded.
    dir = "#{set}_name/#{prefix[0, 1]}"
    path = "#{dir}/#{prefix}.yml"
    directory dir
    file path => dir do
      File.open(path, "w") do |f|
        f.puts "---"
        f.puts "# This file contain #{set} names starting with #{prefix}"
      end
    end
  end
end

View File

@@ -0,0 +1,17 @@
# Returns the relative path of the YAML file that holds +name+.
#
# set  - name set used as the directory prefix (the callers visible here
#        pass "given" or "family").
# name - the name as a String.
#
# The file lives in a directory named after the first letter and is itself
# named after the first two letters, e.g. ("given", "smith") maps to
# "given_name/s/sm.yml". Plain String slicing is used because core Ruby has
# no String#first.
def path_for_name(set, name)
  "#{set}_name/#{name[0, 1]}/#{name[0, 2]}.yml"
end
# Returns the entry_name of every element of +list+, leaving out elements
# for which entry_name returns nil.
def entry_names(list)
  list.map { |item| entry_name(item) }.compact
end
# Returns the name carried by one entry of a parsed name file.
#
# entry - a String (the name itself) or a Hash whose first key is the name.
#
# Returns nil for any other kind of entry.
def entry_name(entry)
  # `when X then` replaces the Ruby 1.8-only `when X:` form, which is a
  # syntax error on Ruby 1.9 and later.
  case entry
  when String then entry
  when Hash then entry.keys.first
  end
end

View File

@@ -0,0 +1,16 @@
# Tries to parse every YAML data file under given_name/ and family_name/ and
# prints one dot-padded status line per file: "success", or "failed"
# followed by the error message.
# NOTE(review): YAML is not required in this block; presumably it is loaded
# elsewhere before the task runs - confirm.
task :validate_data do
  pad_length = 55
  Dir["{given,family}_name/**/*.yml"].each do |path|
    begin
      print path.ljust(pad_length, '.')
      YAML.load(IO.read(path))
      puts "success"
    rescue StandardError => e
      # Psych reports malformed YAML with Psych::SyntaxError, which is not an
      # ArgumentError; rescuing StandardError keeps one bad file from
      # aborting the whole validation run.
      puts "failed"
      puts e.message
      puts
    end
  end
end