Now that we can identify the relevant records in each supporting dataset, the next step is to use that information to make the PII redaction decision for a particular card. This follows the procedure outlined in the same flowchart as before, which is copied below for your reference.
The outer loop is there to indicate that we will process all of the 250 incident cards for PII. However, we can start by writing a Python function that makes that determination for just one card.
import pandas as pd
# load the incident cards that will be checked for PII
data_Card = pd.read_csv("Datasets/Cards_Box9.csv")

# load each supporting dataset and index it by lower-cased last name
data_Form26 = pd.read_csv("Datasets/WRAForm26.csv")
g_dataForm26 = data_Form26.groupby(by=data_Form26["LastName"].str.lower())

data_FAR = pd.read_csv("Datasets/TuleLake_FAR_ALL_FINAL4.csv")
g_FAR = data_FAR.groupby(by=data_FAR["LastName"].str.lower())
def lookup_form26(last, first, other):  # note the additional "other" parameter
    """Look up a card name in Form 26, starting with the last name index.

    Args:
        last: Last name from the card. Matched case-insensitively against
            the lower-cased ``LastName`` groups in ``g_dataForm26``. A
            missing value (``None`` or NaN) never matches.
        first: First name from the card, or ``None``. Compared
            case-insensitively with column 1 of each candidate row.
        other: Other name from the card, or ``None``. Compared with the
            same column 1, but only when ``first`` is ``None``.

    Returns:
        The first matching row of ``data_Form26.values``, or ``None`` when
        no row in the last-name group matches.
    """
    if not isinstance(last, str):
        # A card without a usable last name cannot be placed in any family
        # group; report "no match" instead of failing on ``.lower()``.
        return None

    family = g_dataForm26.groups.get(last.lower())
    if family is None:
        return None  # No match on last name

    # Fetch the underlying array once rather than on every iteration.
    rows = data_Form26.values
    for i in family:
        val = rows[i]
        if first is not None and pd.notna(val[1]):
            if first.lower() == val[1].lower():
                return val  # if both names match, return this record
        # NOTE(review): because this is an ``elif``, the other name is never
        # tried when a first name is present but does not match -- confirm
        # that this is the intended matching rule.
        elif other is not None and pd.notna(val[1]):
            if other.lower() == val[1].lower():
                return val  # card other name matches form 26 first name
    # reached the end of the family group without finding a matching value
    return None
def lookup_far(last, first, other):
    """Look up a card name in FAR, starting with the last name index.

    Args:
        last: Last name from the card. Matched case-insensitively against
            the lower-cased ``LastName`` groups in ``g_FAR``. A missing
            value (``None`` or NaN) never matches.
        first: First name from the card, or ``None``. Compared
            case-insensitively with column 2 of each candidate row.
        other: Other name from the card, or ``None``. Compared
            case-insensitively with column 3 of a candidate row, but only
            when the first-name comparison was not attempted for that row
            (``first`` is ``None`` or column 2 is NaN).

    Returns:
        The first matching row of ``data_FAR.values``, or ``None`` when no
        row in the last-name group matches.
    """
    if not isinstance(last, str):
        # A card without a usable last name cannot be placed in any family
        # group; report "no match" instead of failing on ``.lower()``.
        return None

    family = g_FAR.groups.get(last.lower())
    if family is None:
        return None  # No match on last name

    # Fetch the underlying array once rather than on every iteration.
    rows = data_FAR.values
    for i in family:
        val = rows[i]
        if first is not None and pd.notna(val[2]):
            if first.lower() == val[2].lower():
                return val  # if both names match, return this record
        # NOTE(review): because this is an ``elif``, the other name is never
        # tried when both first names are present but differ -- confirm
        # that this is the intended matching rule.
        elif other is not None and pd.notna(val[3]):
            if other.lower() == val[3].lower():
                return val  # card other name matches FAR column 3
    # reached the end of the family group without finding a matching value
    return None
from datetime import datetime, timedelta as delta
# Age threshold, in years, used by the minor checks in reasonsToRedact.
age_of_majority = 18
# Average number of weeks per year; used to turn a span of years into a timedelta.
weeks_per_year = 52.1429
def _nan_to_none(value):
    """Return None for a missing (NaN) cell, otherwise the value unchanged."""
    return None if pd.isna(value) else value


def reasonsToRedact(index):
    """Determine whether a particular card is releasable.

    Args:
        index: Integer (positional) row index of the card in ``data_Card``.

    Returns:
        A list of reasons the card cannot be released, or an empty list if
        it may be released.

    Raises:
        TypeError, ValueError: Propagated from ``datetime.strptime`` when
            the card's incident date or the matched FAR birth date is
            missing or not in the expected format.
    """
    result = []  # this is where we will add redaction reasons

    # check the not inmate column first, as these rows don't have names
    if pd.notna(data_Card.iloc[index, 1]):
        # a non-NaN flag in this column marks a card that is not about an
        # incarceree
        result.append('Not an incarceree')
        return result

    # read the card data into local variables, replacing NaN with None
    last_name = _nan_to_none(data_Card.iloc[index, 2])
    first_name = _nan_to_none(data_Card.iloc[index, 3])
    other_name = _nan_to_none(data_Card.iloc[index, 4])

    incident_date_str = data_Card.iloc[index, 5]
    incident_date = datetime.strptime(incident_date_str, '%m/%d/%y')
    if incident_date.year >= 2000:
        # %y maps two-digit years 00-68 to 20XX (and 69-99 to 19XX); the
        # cards date from the 1900s, so only the 20XX results are shifted.
        incident_date = incident_date.replace(year=incident_date.year - 100)
    incident_year = int(data_Card.iloc[index, 6])

    # Both checks use the same threshold: one year past the age of majority.
    years_until_majority = age_of_majority + 1

    far_match = lookup_far(last_name, first_name, other_name)
    if far_match is not None:
        birth_date_str = far_match[4]
        birth_date = datetime.strptime(birth_date_str, '%m/%d/%Y')
        weeks_until_majority = years_until_majority * weeks_per_year
        majority_date = birth_date + delta(weeks=weeks_until_majority)
        if majority_date > incident_date:
            result.append("FAR birthdate indicates still a minor")

    f26_match = lookup_form26(last_name, first_name, other_name)
    if f26_match is not None:
        birth_year = f26_match[2]
        # Follow age_of_majority here too (previously a hard-coded 19), so
        # changing the variable affects both checks consistently.
        # NOTE(review): a NaN birth year would make the comparison below
        # False, i.e. "not a minor" -- confirm Form 26 has no missing years.
        majority_year = birth_year + years_until_majority
        if majority_year > incident_year:
            result.append("Form 26 indicates still a minor")

    if far_match is None and f26_match is None:
        result.append('No supporting data found')
    return result
# spot-check a handful of cards, printing each row followed by its verdict
sample_indices = (1, 2, 3, 6, 112)
for position, card_index in enumerate(sample_indices):
    if position > 0:
        print('\n')  # separator between consecutive cards
    print(data_Card.values[card_index])
    print(reasonsToRedact(card_index))
['Box9-0692.jpg' nan 'Ebesu' 'Kikumatsu' nan '7/24/42' 1942] ['No supporting data found'] ['Box9-0642.jpg' nan 'Doi' 'Satomi' nan '8/6/42' 1942] [] ['Box9-0765.jpg' nan 'Endo' nan 'Herbert' '8/25/42' 1942] ['FAR birthdate indicates still a minor', 'Form 26 indicates still a minor'] ['Box9-0632.jpg' nan 'Doi' 'Kanjiro' nan '10/3/42' 1942] [] ['Box9-0196.jpg' 'Y' nan nan nan '3/11/44' 1944] ['Not an incarceree']
This Box 9 incident card dataset has already been redacted, which is why we can share it with you. So how can we know that the algorithm works well, and how many determinations can it make?
Since we made age_of_majority a variable, we can change it just for testing purposes.
# temporarily raise the threshold so that an adult card trips the minor check
age_of_majority = 75
card_index = 111
print(data_Card.values[card_index])
print(reasonsToRedact(card_index))
['Box9-1053.jpg' nan 'Fujii' 'Yasuko' nan '3/7/44' 1944] ['FAR birthdate indicates still a minor']
NOTE: You will have to change the variable back to 18 before the code will run normally.
Let's go ahead and do that and then make sure that all of the PII has already been redacted.
# restore the real threshold, then list every card that would still be redacted
age_of_majority = 18
count = 0
for i in range(113):
    reasons = reasonsToRedact(i)
    # skip releasable cards and staff cards (those are not incarceree PII)
    if not reasons or 'Not an incarceree' in reasons:
        continue
    count += 1
    print(f"{data_Card.values[i]} for these reasons {reasons}")
print(count)
['Box9-0692.jpg' nan 'Ebesu' 'Kikumatsu' nan '7/24/42' 1942] for these reasons ['No supporting data found'] ['Box9-0765.jpg' nan 'Endo' nan 'Herbert' '8/25/42' 1942] for these reasons ['FAR birthdate indicates still a minor', 'Form 26 indicates still a minor'] ['Box9-0780.jpg' nan 'Enjoki' nan 'George' '1/10/43' 1943] for these reasons ['No supporting data found'] ['Box9-0015.jpg' nan 'Tujii' 'Yoshio' nan '11/4/43' 1943] for these reasons ['No supporting data found'] ['Box9-0168.jpg' nan 'Antoku' 'Teru' 'Charles' '11/4/43' 1943] for these reasons ['No supporting data found'] ['Box9-0169.jpg' nan 'Antoku' nan 'Charles' '11/4/43' 1943] for these reasons ['No supporting data found'] ['Box9-0201.jpg' nan 'CHI-no-maki' nan nan '11/4/43' 1943] for these reasons ['No supporting data found'] ['Box9-0509.jpg' nan 'Ayoama' 'Dan' nan '11/4/43' 1943] for these reasons ['No supporting data found'] ['Box9-0641.jpg' nan 'Doi' 'Noburo' nan '11/4/43' 1943] for these reasons ['No supporting data found'] ['Box9-0729.jpg' nan 'Kitagawa' 'Eichi' nan '11/4/43' 1943] for these reasons ['No supporting data found'] ['Box9-0759.jpg' nan 'Nogawa' 'Ematsu' nan '11/4/43' 1943] for these reasons ['No supporting data found'] ['Box9-0820.jpg' nan 'Yamamoto' 'Euchi ?' nan '11/4/43' 1943] for these reasons ['No supporting data found'] ['Box9-0951.jpg' nan 'Sugimoto' 'Frank' nan '11/4/43' 1943] for these reasons ['No supporting data found'] ['Box9-0956.jpg' nan 'Kawaii' nan 'Frank' '11/4/43' 1943] for these reasons ['No supporting data found'] ['Box9-0959.jog' nan 'Miyake' 'S.' 'Frank' '11/4/43' 1943] for these reasons ['No supporting data found'] ['Box9-0964.jpg' nan 'Takashita' nan 'Frank' '11/4/43' 1943] for these reasons ['No supporting data found'] ['Box9-0965.jpg' nan 'Terakami' 'M.' 
'Frank' '11/4/43' 1943] for these reasons ['No supporting data found'] ['Box9-0972.jpg' nan 'Kakimoto' 'Masa' 'Freddie' '11/4/43' 1943] for these reasons ['No supporting data found'] ['Box9-0977.jpg' nan 'Mori' 'H.' 'Fred' '11/4/43' 1943] for these reasons ['No supporting data found'] ['Box9-0978.jpg' nan 'Mori' 'H.' 'Fred' '11/4/43' 1943] for these reasons ['No supporting data found'] ['Box9-0979.jpg' nan 'Mori' 'H.' 'Fred' '11/4/43' 1943] for these reasons ['No supporting data found'] ['Box9-0981.jpg' nan 'Morita' 'O.' 'Fred' '11/4/43' 1943] for these reasons ['No supporting data found'] ['Box9-0985.jpg' nan 'Yamasaki' 'Saku' 'Fred' '11/4/43' 1943] for these reasons ['No supporting data found'] ['Box9-1024.jpg' nan 'Fujii' 'Y.' 'Fred' '11/4/43' 1943] for these reasons ['No supporting data found'] ['Box9-1076.jpg' nan 'Fujimoto' 'Akiro' nan '11/4/43' 1943] for these reasons ['No supporting data found'] ['Box9-0980.jpg' nan 'Mori' 'H.' 'Fred' '11/13/43' 1943] for these reasons ['No supporting data found'] ['Box9-0634.jpg' nan 'Doi' 'Kasumi' nan '2/11/44' 1944] for these reasons ['No supporting data found'] ['Box9-1015.jpg' nan 'Fujihiro' 'Mieko' nan '3/7/44' 1944] for these reasons ['No supporting data found'] 28