import csv
import sys
import numpy as np
from collections import Counter
# Input CSV paths: the Dachau prisoner records and the biography name list.
filepath1 = "dachau_names.csv"
filepath2 = "uncorrupted_bios_names.csv"

# Last and first names of the Dachau prisoners. Every CSV row contributes one
# entry to each list, so a list index is also the row's position in the file.
dachau_last_names = []
dachau_first_names = []
# The 'U' (universal newlines) flag is only needed on Python 2. Python 3 text
# mode already translates newlines, and Python 3.11+ rejects 'U' outright.
with open(filepath1, 'rU' if sys.version_info[0] < 3 else 'r') as f1:
    # create csv reader object
    reader1 = csv.reader(f1)
    for row in reader1:
        # row[1] is stored as the last name, row[2] as the first name.
        dachau_last_names.append(row[1])
        dachau_first_names.append(row[2])
# Names taken from the biography CSV (column 1), one per non-empty row.
bio_names = []
# The 'U' (universal newlines) flag is only needed on Python 2. Python 3 text
# mode already translates newlines, and Python 3.11+ rejects 'U' outright.
with open(filepath2, 'rU' if sys.version_info[0] < 3 else 'r') as f2:
    reader2 = csv.reader(f2)
    for row in reader2:
        # csv.reader yields an empty list for a blank line; skip those.
        if row:
            bio_names.append(row[1])
# Turn the collected name lists into NumPy arrays; later code indexes
# bio_names with an array of positions, which plain lists do not support.
dachau_last_names, dachau_first_names = (
    np.array(dachau_last_names),
    np.array(dachau_first_names),
)
# Sanity check: there must be exactly one first name per last name.
assert dachau_first_names.size == dachau_last_names.size
bio_names = np.array(bio_names)
# Number of Dachau records and of biography names.
dachau_len, bio_len = dachau_last_names.size, bio_names.size
# Lower-cased prefixes of biography names that produced unwanted matches and
# are therefore excluded by hand (this proved to be easiest).
_EXCLUDED_BIO_PREFIXES = (
    "schlicker, peter josef",
    "schwake, theodor (p. gregorius)",
    "zimmermann, johannes",
    "rehling, p. engelbert omi",
    "schumann, p. emil msc",
    "habich, kurt",
    "hennen, heinrich",
    "kopera, amand",
    "morper, br. karl sac",
    "braun, wilhelm",
)


def _names_match(last, first, bio):
    """Return True if a Dachau prisoner's name matches a biography name.

    All three arguments must already be lower-cased so that the prefix and
    containment tests are case-insensitive.

    Args:
        last: The prisoner's last name.
        first: The prisoner's first name.
        bio: The full biography name string.

    Returns:
        True if the general rule or one of the hand-written special cases
        links the prisoner to the biography, False otherwise.
    """
    # General rule: the biography name must start with the last name and
    # contain the first name, tolerating the Hans/Johannes and Josef/Joseph
    # spelling variants.
    if bio.startswith(last) and (
        first in bio
        or (first == "hans" and "johannes" in bio)
        or (first == "josef" and "joseph" in bio)
    ):
        return True
    # Radecke is spelled "Radeke" in the Dachau records.
    if bio.startswith("radecke") and last.startswith("radeke"):
        return True
    # Remaining special cases where the two sources spell a name differently.
    if (bio.startswith("baensch")
            and first.startswith("oskar") and last.startswith("b")):
        return True
    if (bio.startswith("scholze") and "aloys" in bio
            and first.startswith("alois") and last.startswith("scholze")):
        return True
    # The Dachau last name is only required to start with "w" and contain
    # "ste" (presumably to tolerate a differently encoded umlaut; verify).
    if (bio.startswith("wueste") and "bernhard" in bio
            and first.startswith("bernhard")
            and last.startswith("w") and "ste" in last):
        return True
    if (bio.startswith("ries") and "johann" in bio
            and first.startswith("johannes") and last.startswith("ries")):
        return True
    if (bio.startswith("pereira")
            and first.startswith("klemens") and last.startswith("pereira")):
        return True
    return False


# Lower-case each biography name once and drop the excluded entries up front,
# keeping the original index j of every remaining name.
_bio_candidates = [
    (j, lowered)
    for j, lowered in enumerate(name.lower() for name in bio_names)
    if not lowered.startswith(_EXCLUDED_BIO_PREFIXES)
]

# Parallel lists: dachau_ids[k] is the Dachau row index and bio_ids[k] the
# biography index of the k-th match. Each (i, j) pair is recorded at most
# once, even when both the general rule and a special case apply to it.
dachau_ids = []
bio_ids = []
for i, (last_name, first_name) in enumerate(
        zip(dachau_last_names, dachau_first_names)):
    last_lower = last_name.lower()
    first_lower = first_name.lower()
    for j, bio_lower in _bio_candidates:
        if _names_match(last_lower, first_lower, bio_lower):
            dachau_ids.append(i)
            bio_ids.append(j)
# Convert the match lists to arrays so they can be masked and indexed together.
dachau_ids = np.array(dachau_ids)
bio_ids = np.array(bio_ids)
# Dachau indices that were matched to more than one biography entry.
# .items() (rather than .iteritems()) works on both Python 2 and Python 3.
dachau_dups = [item for item, count in Counter(dachau_ids).items() if count > 1]
# Positions (into dachau_ids / bio_ids) of redundant matches to drop.
for_removal = []
for dup in dachau_dups:
    positions = np.flatnonzero(dachau_ids == dup)
    bio_vals = bio_ids[positions]
    # If every match for this prisoner carries the same biography name, the
    # name was simply duplicated (due to double scan, etc.). Keep only the
    # last association and drop all earlier ones, so that three or more
    # repeats also collapse to a single match.
    if len(set(bio_names[bio_vals])) == 1:
        for_removal.extend(int(pos) for pos in positions[:-1])
# Delete all redundant positions in one call; both arrays stay aligned.
if for_removal:
    dachau_ids = np.delete(dachau_ids, for_removal)
    bio_ids = np.delete(bio_ids, for_removal)
# Map each matched biography id to the Dachau indices recorded for it, in
# match order, so every CSV row needs one lookup instead of a full scan.
dachau_ids_by_bio = {}
for bio_id, dachau_id in zip(bio_ids, dachau_ids):
    dachau_ids_by_bio.setdefault(int(bio_id), []).append(dachau_id)

# Re-read the biography CSV and copy the matched rows to the output file.
# The 'U' (universal newlines) flag is only needed on Python 2. Python 3 text
# mode already translates newlines, and Python 3.11+ rejects 'U' outright.
with open(filepath2, 'rU' if sys.version_info[0] < 3 else 'r') as f2:
    reader2 = csv.reader(f2)
    # opens output CSV file for writing
    with open('filtered_bios.csv', 'w') as outfile:
        writer = csv.writer(outfile)
        for row in reader2:
            # Skip blank lines (they have no id column), and do not parse
            # anything at all when there are no matches to write.
            if not row or not dachau_ids_by_bio:
                continue
            # Column 0 of the biography CSV is compared against the matched
            # biography ids. It is read once, before anything is prepended,
            # so the comparison never sees an inserted Dachau id.
            for dachau_id in dachau_ids_by_bio.get(int(row[0]), ()):
                # Write a fresh row with the Dachau id (row index minus one)
                # in front; the original row is left unmodified so several
                # matches for the same biography each get a clean copy.
                writer.writerow([dachau_id - 1] + row)