import geocoder
import csv
import sys
import numpy as np
from collections import Counter
import re
import time
from datetime import date
import matplotlib.pyplot as plt
# Input file and the two files this script writes.
filepath1 = "for_palladio.csv"
filepath2 = "temp.csv"
filepath3 = "output_with_dates.csv"

# Column-wise storage, one entry per CSV row: birthdates, arrest dates,
# post-Dachau fate, nationality and dates of arrival at Dachau.
birthdates = []
arrest_dates = []
post_dachau = []
nationality = []
arrival_dates = []

# (destination list, source column index) pairs, filled in this order.
_column_sinks = (
    (birthdates, 7),
    (nationality, 9),
    (arrest_dates, 13),
    (post_dachau, 26),
    (arrival_dates, 25),
)

# NOTE(review): 'rU' is the Python 2 universal-newline read mode; the 'U'
# flag is no longer accepted by recent Python 3 releases.
with open(filepath1, 'rU') as f1:
    reader1 = csv.reader(f1)
    for row in reader1:
        for sink, column in _column_sinks:
            sink.append(row[column])

# The first entry of every list is just the header line; drop it.
for sink in (birthdates, arrest_dates, post_dachau, nationality, arrival_dates):
    sink.pop(0)

# Every column must describe the same number of records.
for other in (arrest_dates, post_dachau, nationality, arrival_dates):
    assert len(birthdates) == len(other)
def _to_date(text):
    """Convert a 10-character ``YYYY-MM-DD`` string into a ``date``.

    The year, month and day are taken by fixed position, so the separator
    characters themselves are not inspected. ``ValueError`` propagates if a
    field is not numeric or the day does not exist in that month.
    """
    return date(int(text[:4]), int(text[5:7]), int(text[8:]))


# Substrings of the post-Dachau fate column that mark a usable record.
_KNOWN_FATES = ("perished", "liberated", "Death", "released")

# Row numbers of the records (add 1 so the indexing is correct -- the header
# row was skipped), plus a keep/drop flag per record.
indices = np.arange(len(birthdates)) + 1
mask = np.ones(len(birthdates), dtype=bool)

# Quick fix for one leap-year birthdate that is incorrect in the input
# (otherwise date() raises "day not in month"). Guarded so a shorter input
# file does not fail with IndexError before any processing happens.
if len(birthdates) > 2343:
    birthdates[2343] = '1898-02-28'

for i in range(len(birthdates)):
    born = birthdates[i]
    arrested = arrest_dates[i]
    # Both dates must be 10 characters long and free of '/'; anything else
    # is treated as a null or mis-formatted entry and masked out.
    if (len(born) == 10 and len(arrested) == 10
            and "/" not in born and "/" not in arrested):
        birthdates[i] = _to_date(born)
        arrest_dates[i] = _to_date(arrested)
    else:
        mask[i] = False
    # Exclude records whose fate text contains none of the known keywords.
    if not any(fate in post_dachau[i] for fate in _KNOWN_FATES):
        mask[i] = False
    # Exclude records without an arrival date (those contain no '/').
    if '/' not in arrival_dates[i]:
        mask[i] = False

# Convert to numpy arrays so the boolean mask can be applied.
birthdates = np.array(birthdates)
arrest_dates = np.array(arrest_dates)
post_dachau = np.array(post_dachau)
nationality = np.array(nationality)
arrival_dates = np.array(arrival_dates)

birthdates = birthdates[mask]
arrest_dates = arrest_dates[mask]
post_dachau = post_dachau[mask]
nationality = nationality[mask]
arrival_dates = arrival_dates[mask]
indices = indices[mask]

# Check that the filtered arrays are still equal in length. The last check
# used to compare ``indices`` with itself and could never fail.
assert len(birthdates) == len(arrest_dates)
assert len(birthdates) == len(post_dachau)
assert len(birthdates) == len(nationality)
assert len(birthdates) == len(arrival_dates)
assert len(birthdates) == len(indices)
# True where the fate text contains "perished" or "Death".
fate_mask = np.array(
    [("perished" in fate or "Death" in fate) for fate in post_dachau],
    dtype=bool)

# Reference date used as the end point of every incarceration interval.
liberation_date = date(1945, 4, 29)

# Age at arrest, in years of 365 days.
arrest_age = np.array(
    [(arrested - born).days/365.0
     for arrested, born in zip(arrest_dates, birthdates)],
    dtype=float)

# Time from arrest to liberation_date, in years of 365 days. This is computed
# the same way for every record, whatever its fate flag.
incarceration_time = np.array(
    [(liberation_date - arrested).days/365.0 for arrested in arrest_dates],
    dtype=float)

# Second cut: keep only records arrested when older than 18 whose
# incarceration time lies strictly between 0 and 15 years.
new_mask = (arrest_age > 18) & (incarceration_time < 15) & (incarceration_time > 0)

birthdates = birthdates[new_mask]
arrest_dates = arrest_dates[new_mask]
post_dachau = post_dachau[new_mask]
nationality = nationality[new_mask]
arrival_dates = arrival_dates[new_mask]
indices = indices[new_mask]
fate_mask = fate_mask[new_mask]
arrest_age = arrest_age[new_mask]
incarceration_time = incarceration_time[new_mask]

assert arrest_age.size == incarceration_time.size
def make_mask(keyword):
    """Return a boolean array flagging records whose nationality contains *keyword*.

    Reads the module-level ``birthdates`` (for the number of records) and
    ``nationality`` sequences. Element ``i`` of the result is True when
    ``keyword`` is a substring of ``nationality[i]``. As a side effect the
    number of matching records is printed.
    """
    country_mask = np.array(
        [keyword in nationality[i] for i in range(len(birthdates))],
        dtype=bool)
    # Parenthesised single-argument form: prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print(np.sum(country_mask))
    return country_mask
# One boolean mask per nationality keyword. make_mask prints a count on each
# call, so the keywords are kept in the original call order. (A Serbian mask
# was commented out in the original and is still not built.)
(german_mask, polish_mask, french_mask, czech_mask, dutch_mask,
 italian_mask, belgian_mask, slovenian_mask, yugoslavian_mask) = [
    make_mask(word) for word in (
        "German", "Polish", "French", "Czech", "Dutch",
        "Italian", "Belgian", "Slovenian", "Yugoslavian")]


def _parse_arrival(text):
    """Turn an arrival string of the form "m/d/y 0:00" into a ``date``.

    The " 0:00" suffix is stripped, the rest is split on '/', and the year
    component is prefixed with '19' before conversion.
    """
    parts = text.replace(" 0:00", "").split('/')
    return date(int('19' + parts[2]), int(parts[0]), int(parts[1]))


# Replace the raw arrival strings with date objects.
arrival_dates = [_parse_arrival(raw) for raw in arrival_dates]

# Time between arrest and arrival at Dachau ("other camps"), and time between
# arrival at Dachau and liberation_date, both in years of 365 days.
other_camp_time = np.zeros(len(arrival_dates))
Dachau_time = np.zeros(len(arrival_dates))
for k, arrested in enumerate(arrest_dates):
    arrived = arrival_dates[k]
    other_camp_time[k] = (arrived - arrested).days/365.0
    Dachau_time[k] = (liberation_date - arrived).days/365.0

# Convenience views split by fate: "victim" where fate_mask is set,
# "survivor" where it is not.
_not_fate_mask = np.invert(fate_mask)
victim_ages = arrest_age[fate_mask]
victim_incarceration_time = incarceration_time[fate_mask]
victim_other_camp_time = other_camp_time[fate_mask]
victim_Dachau_time = Dachau_time[fate_mask]
survivor_ages = arrest_age[_not_fate_mask]
survivor_incarceration_time = incarceration_time[_not_fate_mask]
survivor_other_camp_time = other_camp_time[_not_fate_mask]
survivor_Dachau_time = Dachau_time[_not_fate_mask]
def _save_fate_histogram(victims, survivors, edges, title, xlabel, path, normalize):
    """Draw overlaid victim/survivor histograms and save them to *path*.

    victims, survivors -- numpy arrays of the quantity being histogrammed
    edges              -- bin edges passed to np.histogram
    title, xlabel      -- plot title and x-axis label
    path               -- output image file, saved at 300 dpi
    normalize          -- when true, each histogram's counts are divided by
                          the size of its own sample
    A dashed vertical line marks the mean of each sample.
    """
    plt.clf()
    figure = plt.gcf()
    figure.set_size_inches(8, 8)
    bar_width = edges[1] - edges[0]
    bar_centers = (edges[:-1] + edges[1:]) / 2
    series = (
        (victims, 'b', 'died while incarcerated'),
        (survivors, 'g', 'survived incarceration'),
    )
    for sample, color, label in series:
        counts, _ = np.histogram(sample, bins=edges)
        if normalize:
            counts = counts/float(sample.size)
        plt.bar(bar_centers, counts, align='center', width=bar_width,
                color=color, alpha=0.5, label=label)
    for sample, color, _ in series:
        plt.axvline(sample.mean(), color=color, linestyle='dashed', linewidth=2)
    plt.legend(loc='upper right')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.savefig(path, dpi=300)


# Age at arrest: 25 bin edges starting at 10 in steps of 70/25 years.
bins = np.arange(25)*70.0/25.0 + 10.0
_save_fate_histogram(victim_ages, survivor_ages, bins,
                     "Histogram of Age when Arrested",
                     "age (in years) when arrested",
                     "plots/hist_arrest_age.png", False)
_save_fate_histogram(victim_ages, survivor_ages, bins,
                     "Normalized Histogram of Age when Arrested",
                     "age (in years) when arrested",
                     "plots/hist_arrest_age_normalized.png", True)

# Total incarceration time: 25 bin edges starting at 0 in steps of 8/25 years.
bins = np.arange(25)*8.0/25.0
_save_fate_histogram(victim_incarceration_time, survivor_incarceration_time, bins,
                     "Histogram of Total Incarceration Time",
                     "incarceration time (in years)",
                     "plots/hist_incarceration_time.png", False)
_save_fate_histogram(victim_incarceration_time, survivor_incarceration_time, bins,
                     "Normalized Histogram of Total Incarceration Time",
                     "incarceration time (in years)",
                     "plots/hist_incarceration_time_normalized.png", True)
# 4x3 grid: one row per nationality, one column per time quantity.
plt.clf()
fig = plt.gcf()
fig.set_size_inches(15, 20)
# Leave ample whitespace between panels (widthwise, heightwise).
plt.subplots_adjust(wspace=0.5)
plt.subplots_adjust(hspace=0.3)

_grid_countries = (
    ("Polish", polish_mask),
    ("German", german_mask),
    ("French", french_mask),
    ("Czech", czech_mask),
)
_grid_quantities = (
    ("Total Incarceration Time", incarceration_time),
    ("Dachau Incarceration Time", Dachau_time),
    ("Other Camp Incarceration Time", other_camp_time),
)

# Every panel shares the same 20 bin edges, from 0 in steps of 8/20 years.
bins = np.arange(20)*8.0/20.0
width = (bins[1] - bins[0])
center = (bins[:-1] + bins[1:]) / 2

for row_no, (country, country_mask) in enumerate(_grid_countries):
    for col_no, (quantity, values) in enumerate(_grid_quantities):
        # Panels are numbered row by row, starting at 1.
        plt.subplot(4, 3, 3*row_no + col_no + 1)
        selected = values[country_mask]
        hist, ignore = np.histogram(selected, bins=bins)
        plt.bar(center, hist, align='center', width=width, color='b', alpha=1.0, label='victims')
        # Dashed red line at the mean of the selected records.
        plt.axvline(selected.mean(), color='r', linestyle='dashed', linewidth=2)
        plt.title(quantity + ", " + country)
        plt.xlabel("incarceration time (years)")

plt.savefig("plots/multipanel_countries.png", dpi=300)
# 4x2 grid: one row per nationality; left column is age at arrest, right
# column is total incarceration time.
plt.clf()
fig = plt.gcf()
fig.set_size_inches(10, 20)
# Leave ample whitespace between panels (widthwise, heightwise).
plt.subplots_adjust(wspace=0.5)
plt.subplots_adjust(hspace=0.3)

_grid2_countries = (
    ("Polish", polish_mask),
    ("German", german_mask),
    ("French", french_mask),
    ("Czech", czech_mask),
)
# (title prefix, values, bin edges, x-axis label) for each column.
_grid2_quantities = (
    ("Arrest Age", arrest_age,
     np.arange(25)*70.0/25.0 + 10.0, "arrest age (years)"),
    ("Total Incarceration Time", incarceration_time,
     np.arange(20)*8.0/20.0, "incarceration time (years)"),
)

for row_no, (country, country_mask) in enumerate(_grid2_countries):
    for col_no, (quantity, values, bins, axis_label) in enumerate(_grid2_quantities):
        # Panels are numbered row by row, starting at 1.
        plt.subplot(4, 2, 2*row_no + col_no + 1)
        selected = values[country_mask]
        hist, ignore = np.histogram(selected, bins=bins)
        width = (bins[1] - bins[0])
        center = (bins[:-1] + bins[1:]) / 2
        # Each panel is drawn exactly once. The right-hand panels used to
        # contain a pasted second copy of the histogram and mean line, plus a
        # first title that was immediately overwritten; only the final title
        # ("Total Incarceration Time, <country>") is kept.
        plt.bar(center, hist, align='center', width=width, color='b', alpha=1.0, label='victims')
        plt.axvline(selected.mean(), color='r', linestyle='dashed', linewidth=2)
        plt.title(quantity + ", " + country)
        plt.xlabel(axis_label)

plt.savefig("plots/multipanel_countries_2.png", dpi=300)
# One 'x' marker per nationality: mean age at arrest (x axis) against mean
# total incarceration time (y axis).
plt.clf()
fig = plt.gcf()
fig.set_size_inches(8, 8)
_scatter_series = (
    (german_mask, 'r', 'German'),
    (polish_mask, 'g', 'Polish'),
    (french_mask, 'b', 'French'),
    (czech_mask, 'c', 'Czech'),
    (dutch_mask, 'orange', 'Dutch'),
    (belgian_mask, 'purple', 'Belgian'),
)
for selection, marker_color, legend_label in _scatter_series:
    mean_age = arrest_age[selection].mean()
    mean_time = incarceration_time[selection].mean()
    plt.scatter(mean_age, mean_time, color=marker_color, label=legend_label, marker='x')
plt.legend()
plt.savefig('plots/scatter.png')
#### FOR FILE APPENDING
# Copy the input CSV to filepath2 with four derived columns prepended:
# total incarceration time, time at other camps, time at Dachau, arrest age.
# Rows that did not survive the filtering get four empty cells instead.

# Map each surviving row number (see ``indices``) to its position in the
# filtered arrays once, instead of scanning ``indices`` for every CSV row.
_position_of_row = {int(row_number): position
                    for position, row_number in enumerate(indices)}

# Opens the full Dachau file for reading.
with open(filepath1, 'rU') as f2:
    reader2 = csv.reader(f2)
    # Opens the intermediate CSV file for writing.
    with open(filepath2, 'w') as outfile:
        writer = csv.writer(outfile)
        for i, row in enumerate(reader2):
            if i == 0:
                # Header row: names of the four new leading columns.
                new_cells = ["total incarceration time",
                             "Time spent at other camps",
                             "Time spent at Dachau",
                             "Arrest age"]
            elif i in _position_of_row:
                # This record persisted to the end of the filtering.
                loc = _position_of_row[i]
                new_cells = [incarceration_time[loc],
                             other_camp_time[loc],
                             Dachau_time[loc],
                             arrest_age[loc]]
            else:
                new_cells = [""] * 4
            writer.writerow(new_cells + row)
####
##### Add the arrival date and latitude/longitude columns to the file.
# Raw arrival-date strings for every row of the input, header included, so
# that list position i matches CSV row i.
arrival_dates = []
# Raw geo-coordinate cells (first column), split into lat/long below.
coords = []
with open(filepath1, 'rU') as f1:
    reader1 = csv.reader(f1)
    for row in reader1:
        arrival_dates.append(row[25])
        coords.append(row[0])

# Reads the intermediate file written to filepath2 and writes filepath3.
with open(filepath2, 'rU') as f2:
    reader2 = csv.reader(f2)
    with open(filepath3, 'w') as outfile:
        writer = csv.writer(outfile)
        # prev records whether the previous row's coordinate cell contained
        # a ','; temp_lat / temp_long hold the values parsed from it.
        prev = False
        temp_lat = 0
        temp_long = 0
        for i, row in enumerate(reader2):
            # Column 0: arrival date in ISO format (blank if not present).
            if i == 0:
                row.insert(0, "Arrival Date")
            elif '/' in arrival_dates[i]:
                # Strip the " 0:00" part, split "m/d/y" on '/', and prefix
                # the year component with '19'.
                arrival_dates[i] = arrival_dates[i].replace(" 0:00", "")
                components = arrival_dates[i].split('/')
                row.insert(0, date(int('19' + components[2]), int(components[0]), int(components[1])).isoformat())
            else:
                row.insert(0, "")
            # The row is NOT written at this point. A writerow() call here
            # used to emit every row (header included) before the
            # Latitude/Longitude cells were added, and then the final
            # writerow() below emitted it again with a different column count.

            # Columns 4 and 5: latitude and longitude.
            # NOTE(review): the values written on a row are the ones parsed
            # from the PREVIOUS row's coordinate cell -- confirm this one-row
            # lag is intended.
            if i == 0:
                row.insert(4, "Latitude")
                row.insert(5, "Longitude")
            elif prev:
                row.insert(4, temp_lat)
                row.insert(5, temp_long)
            else:
                row.insert(4, "")
                row.insert(5, "")
            # Remember this row's coordinates for the next iteration.
            if ',' in coords[i]:
                components = coords[i].split(',')
                temp_lat = components[0]
                temp_long = components[1]
                prev = True
            else:
                prev = False
            writer.writerow(row)
####
sys.exit()