# In [ ]:
import csv
import sys
import numpy as np
from collections import Counter

#file paths
filepath1 = "dachau_names.csv"
filepath2 = "uncorrupted_bios_names.csv"

#lists to store first and last names of Dachau prisoners
dachau_last_names = []
dachau_first_names = []

#'rU' (universal newlines) is what Python 2 needs, but that mode was removed in
#Python 3.11, where plain 'r' already translates newlines
csv_read_mode = 'r' if sys.version_info[0] >= 3 else 'rU'

with open(filepath1, csv_read_mode) as f1:

    #create csv reader object
    reader1 = csv.reader(f1)
    for row in reader1:
        #column 1 is stored as the last name, column 2 as the first name;
        #both lists grow in lockstep so index k refers to the same CSV row
        dachau_last_names.append(row[1])
        dachau_first_names.append(row[2])

#list to store biography names
bio_names = []

#'rU' (universal newlines) is what Python 2 needs, but that mode was removed in
#Python 3.11, where plain 'r' already translates newlines
csv_read_mode = 'r' if sys.version_info[0] >= 3 else 'rU'

with open(filepath2, csv_read_mode) as f2:

    reader2 = csv.reader(f2)
    for row in reader2:
        #skip empty rows (they have no name column); the name is taken from column 1
        if len(row) > 0:
            bio_names.append(row[1])


#turn the collected name lists into NumPy arrays so they can later be
#indexed with arrays of positions
dachau_last_names, dachau_first_names = (
    np.array(dachau_last_names),
    np.array(dachau_first_names),
)

#sanity check: there must be exactly one first name per last name
assert dachau_last_names.size == dachau_first_names.size

bio_names = np.array(bio_names)

#number of entries on each side of the comparison
dachau_len, bio_len = dachau_last_names.size, bio_names.size

#parallel lists of matched positions: entry k of dachau_ids pairs with
#entry k of bio_ids
dachau_ids, bio_ids = [], []

#Biography entries excluded from matching by hand (this proved to be easiest):
#an entry is skipped when its lower-cased name starts with one of these prefixes
excluded_bio_prefixes = (
    "schlicker, peter josef",
    "schwake, theodor (p. gregorius)",
    "zimmermann, johannes",
    "rehling, p. engelbert omi",
    "schumann, p. emil msc",
    "habich, kurt",
    "hennen, heinrich",
    "kopera, amand",
    "morper, br. karl sac",
    "braun, wilhelm",
)

#every comparison below is case-insensitive, so lower-case each name once
#here instead of once per (Dachau entry, biography) pair
bio_names_lower = [name.lower() for name in bio_names]
dachau_last_lower = [name.lower() for name in dachau_last_names]
dachau_first_lower = [name.lower() for name in dachau_first_names]

#compare every Dachau entry (index i) with every biography name (index j);
#each rule that fires records the pair (i, j) in dachau_ids / bio_ids.
#The rules are independent of each other, so one pair can be recorded
#more than once.
for i in range(0, dachau_len):
    last = dachau_last_lower[i]
    first = dachau_first_lower[i]

    for j in range(0, bio_len):
        bio = bio_names_lower[j]

        #first, eliminate the unwanted matches listed above
        if bio.startswith(excluded_bio_prefixes):
            continue

        #general rule: the biography name must start with the Dachau last name
        #(last names have to match under any circumstance) ...
        if bio.startswith(last):

            #... and contain the first name somewhere
            if first in bio:
                dachau_ids.append(i)
                bio_ids.append(j)

            #handle Hans vs. Johannes case
            elif first == "hans" and "johannes" in bio:
                dachau_ids.append(i)
                bio_ids.append(j)

            #handle Josef vs. Joseph case
            elif first == "josef" and "joseph" in bio:
                dachau_ids.append(i)
                bio_ids.append(j)

        #handles Radecke misspelling case (biography "radecke" vs. Dachau "radeke")
        if bio.startswith("radecke") and last.startswith("radeke"):
            dachau_ids.append(i)
            bio_ids.append(j)

        #handles more edge cases

        #biography "baensch" vs. a Dachau entry "oskar" whose last name starts with "b"
        if bio.startswith("baensch"):
            if first.startswith("oskar") and last.startswith("b"):
                dachau_ids.append(i)
                bio_ids.append(j)

        #Aloys vs. Alois spelling for Scholze
        if bio.startswith("scholze") and "aloys" in bio:
            if first.startswith("alois") and last.startswith("scholze"):
                dachau_ids.append(i)
                bio_ids.append(j)

        #biography "wueste" vs. a Dachau last name starting with "w" and containing "ste"
        if bio.startswith("wueste") and "bernhard" in bio:
            if first.startswith("bernhard") and (last.startswith("w") and "ste" in last):
                dachau_ids.append(i)
                bio_ids.append(j)

        #Johann vs. Johannes for Ries
        if bio.startswith("ries") and "johann" in bio:
            if first.startswith("johannes") and last.startswith("ries"):
                dachau_ids.append(i)
                bio_ids.append(j)

        #Pereira / Klemens; the original also tested '"" in <name>', which is
        #true for every string and therefore had no effect
        if bio.startswith("pereira"):
            if first.startswith("klemens") and last.startswith("pereira"):
                dachau_ids.append(i)
                bio_ids.append(j)


#switch to arrays so the matches can be filtered with boolean masks
dachau_ids = np.array(dachau_ids)
bio_ids = np.array(bio_ids)

#Dachau indices that were matched more than once
#(.items() works on both Python 2 and 3; .iteritems() exists only on Python 2)
dachau_dups = [item for item, count in Counter(dachau_ids).items() if count > 1]

#stores indices of duplicate matches
for_removal = []

for dup_id in dachau_dups:

    #positions in the match lists that belong to this Dachau entry
    mask = (dachau_ids == dup_id)
    bio_vals = bio_ids[mask]

    #checks to see if name is duplicated (due to double scan, etc.)
    if len(set(bio_names[bio_vals])) == 1:

        #in which case, we delete one of the associations: argmax on a boolean
        #mask gives the position of the first match for this entry
        #NOTE(review): only that first occurrence is dropped, so an entry matched
        #three or more times to the same name keeps the remaining copies
        for_removal.append(np.argmax(mask))

#sort by decreasing order so removing elements won't disrupt indexing
for_removal.sort(reverse=True)

#removes duplicate indices from both lists so they stay aligned
for idx in for_removal:
    dachau_ids = np.delete(dachau_ids, idx)
    bio_ids = np.delete(bio_ids, idx)


#'rU' (universal newlines) is what Python 2 needs, but that mode was removed in
#Python 3.11, where plain 'r' already translates newlines
csv_read_mode = 'r' if sys.version_info[0] >= 3 else 'rU'

#opens bio CSV for reading
with open(filepath2, csv_read_mode) as f2:

    reader2 = csv.reader(f2)

    #opens output CSV file for writing
    with open('filtered_bios.csv', 'w') as outfile:

        writer = csv.writer(outfile)

        #iterate over the rows of the bio CSV
        for row in reader2:

            #empty rows have no ID column (they are skipped when the names are read, too)
            if len(row) == 0:
                continue

            #if the row is one of the rows of interest, we write the row to the CSV file,
            #with the Dachau ID (match index minus one) prepended; one line is written
            #per match
            for dachau_id, bio_id in zip(dachau_ids, bio_ids):
                if bio_id == int(row[0]):

                    #build a new list instead of inserting into `row`: mutating it
                    #would make row[0] the Dachau ID for the remaining comparisons
                    #and stack IDs when a biography matches more than once
                    writer.writerow([dachau_id - 1] + row)