# In [ ]:  (cell marker left over from the notebook export)
import geocoder
import csv
import sys
import numpy as np
from collections import Counter
import re
import time
from datetime import date
import matplotlib.pyplot as plt


# File locations: filepath1 is the source export read by this script;
# filepath2 and filepath3 are output files written further down.
filepath1 = "for_palladio.csv"
filepath2 = "temp.csv"
filepath3 = "output_with_dates.csv"

# Read every row once (header included), then slice out the columns of interest.
with open(filepath1, 'rU') as f1:
    reader1 = csv.reader(f1)
    all_rows = [row for row in reader1]

birthdates = [row[7] for row in all_rows]      # column 7: birthdate
nationality = [row[9] for row in all_rows]     # column 9: nationality
arrest_dates = [row[13] for row in all_rows]   # column 13: arrest date
post_dachau = [row[26] for row in all_rows]    # column 26: post-Dachau fate
arrival_dates = [row[25] for row in all_rows]  # column 25: arrival at Dachau

# The first entry of every column is the header cell, not data.
for column in (birthdates, arrest_dates, post_dachau, nationality, arrival_dates):
    del column[0]

# All columns must describe the same number of records.
record_count = len(birthdates)
assert len(arrest_dates) == record_count
assert len(post_dachau) == record_count
assert len(nationality) == record_count
assert len(arrival_dates) == record_count



# CSV row number of each record: offset by one because the header row
# (row 0) has already been removed from the column lists.
indices = np.arange(len(birthdates)) + 1

# mask[i] stays True only for records that pass every check below.
mask = np.ones(len(birthdates), dtype=bool)

# Dataset-specific patch: this record carries a leap-year day that does not
# exist, which makes date() raise ("day is out of range for month").
birthdates[2343] = '1898-02-28'

# A usable post-Dachau entry must mention one of these outcomes.
fate_keywords = ("perished", "liberated", "Death", "released")

for i in range(len(birthdates)):
    born = birthdates[i]
    arrested = arrest_dates[i]

    # Both dates must be 10-character 'YYYY-MM-DD' strings; entries that are
    # empty, of another length, or slash-separated are treated as unusable.
    well_formed = (len(born) == 10 and len(arrested) == 10
                   and "/" not in born and "/" not in arrested)

    if well_formed:
        # Replace the strings in place with date objects.
        birthdates[i] = date(int(born[:4]), int(born[5:7]), int(born[8:]))
        arrest_dates[i] = date(int(arrested[:4]), int(arrested[5:7]), int(arrested[8:]))
    else:
        mask[i] = False

    # Exclude records whose fate is not one of the recognised outcomes.
    if not any(word in post_dachau[i] for word in fate_keywords):
        mask[i] = False

    # Arrival dates are slash-separated; anything else counts as missing.
    if '/' not in arrival_dates[i]:
        mask[i] = False

# Convert to numpy arrays so the boolean mask can be applied.
birthdates = np.array(birthdates)
arrest_dates = np.array(arrest_dates)
post_dachau = np.array(post_dachau)
nationality = np.array(nationality)
arrival_dates = np.array(arrival_dates)

birthdates = birthdates[mask]
arrest_dates = arrest_dates[mask]
post_dachau = post_dachau[mask]
nationality = nationality[mask]
arrival_dates = arrival_dates[mask]
indices = indices[mask]

# Check that all arrays are still equal in length after filtering.
assert len(birthdates) == len(arrest_dates)
assert len(birthdates) == len(post_dachau)
assert len(birthdates) == len(nationality)
assert len(birthdates) == len(arrival_dates)
# Previously compared indices with itself, which could never fail.
assert len(birthdates) == len(indices)



# True where the fate entry mentions "perished" or "Death".
fate_mask = np.array(
    ["perished" in fate or "Death" in fate for fate in post_dachau],
    dtype=bool)

# Age at arrest, in 365-day years.
arrest_age = np.array(
    [(arrested - born).days / 365.0
     for arrested, born in zip(arrest_dates, birthdates)],
    dtype=float)

# Incarceration time is measured from arrest up to this fixed date for every
# record, again in 365-day years.
liberation_date = date(1945, 4, 29)

incarceration_time = np.array(
    [(liberation_date - arrested).days / 365.0 for arrested in arrest_dates],
    dtype=float)

# Second filter: keep records arrested above age 18 whose incarceration time
# lies strictly between 0 and 15 years.
new_mask = (arrest_age > 18) & (incarceration_time < 15) & (incarceration_time > 0)

# Apply the filter to every per-record array so they stay aligned.
birthdates = birthdates[new_mask]
arrest_dates = arrest_dates[new_mask]
arrival_dates = arrival_dates[new_mask]
post_dachau = post_dachau[new_mask]
nationality = nationality[new_mask]
indices = indices[new_mask]
fate_mask = fate_mask[new_mask]
arrest_age = arrest_age[new_mask]
incarceration_time = incarceration_time[new_mask]

assert arrest_age.size == incarceration_time.size


#now, make nationality masks
def make_mask(keyword, values=None):
    """Return a boolean mask marking the entries that contain *keyword*.

    keyword -- substring to look for (e.g. "German").
    values  -- sequence of strings to search; defaults to the module-level
               ``nationality`` array when omitted.

    The number of matching entries is printed as a quick sanity check.
    Returns a numpy bool array with one element per entry in *values*.
    """
    if values is None:
        values = nationality

    country_mask = np.array([keyword in entry for entry in values], dtype=bool)

    # Parenthesised single-argument print gives the same output on
    # Python 2 and Python 3.
    print(np.sum(country_mask))
    return country_mask

# One boolean mask per nationality keyword. make_mask prints each count, so
# the call order is kept. (A "Serbian" mask was commented out in the original
# and is still not built.)
(german_mask, polish_mask, french_mask, czech_mask, dutch_mask,
 italian_mask, belgian_mask, slovenian_mask, yugoslavian_mask) = [
    make_mask(keyword)
    for keyword in ("German", "Polish", "French", "Czech", "Dutch",
                    "Italian", "Belgian", "Slovenian", "Yugoslavian")
]

# Arrival dates come in the form "m/d/y 0:00": drop the time suffix, split on
# '/', and prefix the year with '19' to build a date object.
parsed_arrivals = []
for raw_arrival in arrival_dates:
    components = raw_arrival.replace(" 0:00", "").split('/')
    parsed_arrivals.append(
        date(int('19' + components[2]), int(components[0]), int(components[1])))

arrival_dates = parsed_arrivals

# Time between arrest and arrival at Dachau (spent elsewhere), in 365-day years.
other_camp_time = np.array(
    [(arrived - arrested).days / 365.0
     for arrived, arrested in zip(arrival_dates, arrest_dates)],
    dtype=float)

# Time between arrival at Dachau and liberation_date, in 365-day years.
Dachau_time = np.array(
    [(liberation_date - arrived).days / 365.0 for arrived in arrival_dates],
    dtype=float)

# Split each quantity by fate: "victim" where fate_mask is True,
# "survivor" where it is False.
survivor_mask = np.invert(fate_mask)

victim_ages = arrest_age[fate_mask]
survivor_ages = arrest_age[survivor_mask]

victim_incarceration_time = incarceration_time[fate_mask]
survivor_incarceration_time = incarceration_time[survivor_mask]

victim_other_camp_time = other_camp_time[fate_mask]
survivor_other_camp_time = other_camp_time[survivor_mask]

victim_Dachau_time = Dachau_time[fate_mask]
survivor_Dachau_time = Dachau_time[survivor_mask]





def _plot_fate_histogram(victims, survivors, edges, normalized, title, xlabel, outfile):
    """Draw overlaid victim/survivor histograms with mean lines and save them.

    victims, survivors -- numpy arrays of the quantity being plotted.
    edges              -- histogram bin edges (evenly spaced).
    normalized         -- if True, divide each group's counts by its size.
    title, xlabel      -- figure title and x-axis label.
    outfile            -- path the figure is saved to (dpi=300).
    """
    plt.clf()

    # 8x8 inch figure
    figure = plt.gcf()
    figure.set_size_inches(8, 8)

    bar_width = edges[1] - edges[0]
    bar_centers = (edges[:-1] + edges[1:]) / 2

    groups = (
        (victims, 'b', 'died while incarcerated'),
        (survivors, 'g', 'survived incarceration'),
    )

    for values, colour, label in groups:
        counts, _ = np.histogram(values, bins=edges)
        if normalized:
            counts = counts / float(values.size)
        plt.bar(bar_centers, counts, align='center', width=bar_width,
                color=colour, alpha=0.5, label=label)

    # Dashed vertical line at each group's mean.
    for values, colour, _ in groups:
        plt.axvline(values.mean(), color=colour, linestyle='dashed', linewidth=2)

    plt.legend(loc='upper right')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.savefig(outfile, dpi=300)


# Age at arrest: 25 edges spanning 10 to just under 80 years.
bins = np.arange(25)*70.0/25.0 + 10.0

_plot_fate_histogram(victim_ages, survivor_ages, bins, False,
                     "Histogram of Age when Arrested",
                     "age (in years) when arrested",
                     "plots/hist_arrest_age.png")

_plot_fate_histogram(victim_ages, survivor_ages, bins, True,
                     "Normalized Histogram of Age when Arrested",
                     "age (in years) when arrested",
                     "plots/hist_arrest_age_normalized.png")

# Total incarceration time: 25 edges spanning 0 to just under 8 years.
bins = np.arange(25)*8.0/25.0

_plot_fate_histogram(victim_incarceration_time, survivor_incarceration_time, bins, False,
                     "Histogram of Total Incarceration Time",
                     "incarceration time (in years)",
                     "plots/hist_incarceration_time.png")

_plot_fate_histogram(victim_incarceration_time, survivor_incarceration_time, bins, True,
                     "Normalized Histogram of Total Incarceration Time",
                     "incarceration time (in years)",
                     "plots/hist_incarceration_time_normalized.png")



plt.clf()

# 15x20 inch figure with extra whitespace between panels
# (widthwise, heightwise).
fig = plt.gcf()
fig.set_size_inches(15, 20)
plt.subplots_adjust(wspace=0.5)
plt.subplots_adjust(hspace=0.3)

# The 4x3 grid has one row per nationality and one column per duration.
panel_rows = (
    ("Polish", polish_mask),
    ("German", german_mask),
    ("French", french_mask),
    ("Czech", czech_mask),
)
panel_columns = (
    ("Total Incarceration Time", incarceration_time),
    ("Dachau Incarceration Time", Dachau_time),
    ("Other Camp Incarceration Time", other_camp_time),
)

# Every panel shares the same bin edges (0 to just under 8 years).
bins = np.arange(20)*8.0/20.0
width = (bins[1] - bins[0])
center = (bins[:-1] + bins[1:]) / 2

panel_number = 0
for country_name, country_mask in panel_rows:
    for heading, durations in panel_columns:

        # Subplots are numbered row by row, starting at 1.
        panel_number += 1
        plt.subplot(4, 3, panel_number)

        selected = durations[country_mask]
        hist, ignore = np.histogram(selected, bins=bins)
        plt.bar(center, hist, align='center', width=width, color='b', alpha=1.0, label='victims')

        # Dashed red line marks the mean of the selected values.
        plt.axvline(selected.mean(), color='r', linestyle='dashed', linewidth=2)

        plt.title(heading + ", " + country_name)
        plt.xlabel("incarceration time (years)")

plt.savefig("plots/multipanel_countries.png", dpi=300)







plt.clf()

# 10x20 inch figure with extra whitespace between panels
# (widthwise, heightwise).
fig = plt.gcf()
fig.set_size_inches(10, 20)
plt.subplots_adjust(wspace=0.5)
plt.subplots_adjust(hspace=0.3)

# The 4x2 grid has one row per nationality; the left column shows arrest age
# and the right column shows total incarceration time.
panel_rows = (
    ("Polish", polish_mask),
    ("German", german_mask),
    ("French", french_mask),
    ("Czech", czech_mask),
)
panel_columns = (
    # (title prefix, data, bin edges, x-axis label)
    ("Arrest Age", arrest_age,
     np.arange(25)*70.0/25.0 + 10.0, "arrest age (years)"),
    ("Total Incarceration Time", incarceration_time,
     np.arange(20)*8.0/20.0, "incarceration time (years)"),
)

panel_number = 0
for country_name, country_mask in panel_rows:
    for heading, values, bins, axis_label in panel_columns:

        # Subplots are numbered row by row, starting at 1.
        panel_number += 1
        plt.subplot(4, 2, panel_number)

        selected = values[country_mask]
        hist, ignore = np.histogram(selected, bins=bins)
        width = (bins[1] - bins[0])
        center = (bins[:-1] + bins[1:]) / 2

        # Each histogram and mean line is drawn exactly once. The earlier
        # version drew the incarceration panels twice because of a
        # copy-pasted section.
        plt.bar(center, hist, align='center', width=width, color='b', alpha=1.0, label='victims')
        plt.axvline(selected.mean(), color='r', linestyle='dashed', linewidth=2)

        plt.title(heading + ", " + country_name)
        plt.xlabel(axis_label)

plt.savefig("plots/multipanel_countries_2.png", dpi=300)









plt.clf()

# 8x8 inch figure
fig = plt.gcf()
fig.set_size_inches(8, 8)

# One 'x' marker per nationality: mean arrest age against mean total
# incarceration time.
scatter_groups = (
    (german_mask, 'r', 'German'),
    (polish_mask, 'g', 'Polish'),
    (french_mask, 'b', 'French'),
    (czech_mask, 'c', 'Czech'),
    (dutch_mask, 'orange', 'Dutch'),
    (belgian_mask, 'purple', 'Belgian'),
)

for nation_mask, colour, nation_label in scatter_groups:
    mean_age = arrest_age[nation_mask].mean()
    mean_time = incarceration_time[nation_mask].mean()
    plt.scatter(mean_age, mean_time, color=colour, label=nation_label, marker='x')

plt.legend()

plt.savefig('plots/scatter.png')




#### FOR FILE APPENDING
# Copy filepath1 to filepath2 with four derived columns prepended to every
# row. Rows that did not survive the filtering get empty cells instead.

# Map each surviving CSV row number to its position in the result arrays, so
# the per-row lookup is a dict access instead of a scan over `indices`.
row_position = {}
for position, row_number in enumerate(indices):
    row_position[int(row_number)] = position

#opens full Dachau file for reading
with open(filepath1, 'rU') as f2:

    reader2 = csv.reader(f2)

    #opens output CSV file for writing
    with open(filepath2, 'w') as outfile:

        writer = csv.writer(outfile)

        for i, row in enumerate(reader2):

            if i == 0:
                # header row
                extra = ["total incarceration time",
                         "Time spent at other camps",
                         "Time spent at Dachau",
                         "Arrest age"]

            elif i in row_position:
                # this record persisted to the end of the filtering
                loc = row_position[i]
                extra = [incarceration_time[loc],
                         other_camp_time[loc],
                         Dachau_time[loc],
                         arrest_age[loc]]

            else:
                # filtered-out record: keep the row, leave the new cells blank
                extra = [""] * 4

            writer.writerow(extra + row)
####






##### need to add all arrival dates to file
# Copy filepath2 to filepath3, prepending an ISO-formatted arrival date and
# inserting latitude/longitude columns at positions 4 and 5.

# Re-read the raw arrival-date and coordinate columns from the source file.
# The header entry is kept here, so list position i corresponds to row i.
arrival_dates = []

#stores geo coordinates as "lat,long" strings (split below)
coords = []

with open(filepath1, 'rU') as f1:

    #create csv reader object
    reader1 = csv.reader(f1)
    for row in reader1:
        arrival_dates.append(row[25])
        coords.append(row[0])


#opens the intermediate file for reading
with open(filepath2, 'rU') as f2:

    reader2 = csv.reader(f2)

    #opens output CSV file for writing
    with open(filepath3, 'w') as outfile:

        writer = csv.writer(outfile)

        # Coordinates parsed from the previous row, if it had any.
        prev = False
        temp_lat = 0
        temp_long = 0

        for i, row in enumerate(reader2):

            # --- arrival date column (position 0) ---
            if i == 0:
                row.insert(0, "Arrival Date")

            elif '/' in arrival_dates[i]:
                # "m/d/y 0:00" -> drop the time suffix, split on '/', and
                # prefix the year with '19'
                components = arrival_dates[i].replace(" 0:00", "").split('/')
                arrival = date(int('19' + components[2]), int(components[0]), int(components[1]))
                row.insert(0, arrival.isoformat())

            else:
                # No arrival date: leave the cell empty. The row is written
                # once, after the coordinate columns are added. The earlier
                # version also wrote it here, duplicating it in the output.
                row.insert(0, "")

            # --- latitude / longitude columns (positions 4 and 5) ---
            if i == 0:
                row.insert(4, "Latitude")
                row.insert(5, "Longitude")

            elif prev:
                # NOTE(review): this uses the coordinates parsed from the
                # PREVIOUS row, not the current one -- confirm this is the
                # intended alignment for the data.
                row.insert(4, temp_lat)
                row.insert(5, temp_long)

            else:
                row.insert(4, "")
                row.insert(5, "")

            # Remember this row's coordinates for the next iteration.
            if ',' in coords[i]:
                components = coords[i].split(',')
                temp_lat = components[0]
                temp_long = components[1]
                prev = True

            else:
                prev = False

            writer.writerow(row)
####

# End the script here.
sys.exit()