#!/usr/bin/env python
# coding: utf-8
# # Entity Resolution in Japanese-American Internee Records
#
# ## Dependencies
#
# Dependencies are installed via the Pipfile in the same directory.
# ## The Resolver
#
# Here we define a function that takes two data-frame-like objects with our chosen column names and attempts to identify matches between them. It returns a data frame of matches, indexed like the source, with the matching target row in a `match` column.
# In[44]:
import pandas as pd
import numpy as np
from IPython.display import display, HTML
# from datetime import datetime, date
from fuzzywuzzy import fuzz
def resolveFamilyMembers(sourceDF, targetDF, debug=False):  # source, target
    result = pd.DataFrame(None, index=sourceDF.index, columns=['match'])
    threshold = float(.5)
    xdf = pd.DataFrame(float(0), index=targetDF.index, columns=sourceDF.index)
    if debug:
        display(HTML('grid of possible matches:'), xdf)
    # Visit every cell in the grid and plug in a similarity value between the FAR (cols) and WRA (index) axes.
    for (id1, row1) in targetDF.iterrows():
        for (id2, row2) in sourceDF.iterrows():
            xdf.at[id1, id2] = compare(row1, row2)
    if debug:
        display(HTML('grid with similarity scores:'), xdf)
    maxMatches = xdf.idxmax()  # now we want to find the best match for each FAR record
    if debug:
        display(HTML('The first, best scores for each column:'), maxMatches)
    for id2 in maxMatches.index:
        if xdf.loc[maxMatches[id2], id2] > threshold:  # max could be zero, so check that we have an acceptable score
            # sourceDF.at[id2, 'pid'] = targetDF.loc[maxMatches[id2]]['pid']  # assign the pid to the FAR row
            result.at[id2, 'match'] = maxMatches[id2]
            # xdf.drop([maxMatches[id2]], inplace=True)  # drop the WRA row we used
            xdf.drop(columns=[id2], inplace=True)  # drop the FAR column we used
        if debug:
            display(HTML('Possibilities remaining:'), xdf)
        if xdf.size == 1:  # if only one possibility remains, choose it
            if debug:
                print('just one possibility left!')
            id1 = xdf.index[0]
            id2 = xdf.columns[0]
            # sourceDF.at[id2, 'pid'] = targetDF.loc[id1]['pid']
            result.at[id2, 'match'] = id1
    return result
# Scoring gets tricky when there are multiple values to match.
# So far we match on fn/ofn and byear +/- 1.
def compare(a, b):
    result = float(max(  # use whichever score is higher in this list
        min(fn(a, b), by1(a, b)),  # if either fn() or by1() returns zero then this result is zero
        0.5 * (by1(a, b)),  # if we only match the byear, then the score is just 0.5
        #### another compare method could go here
    ))
    return result

# returns the best fuzzy first-name score, comparing fn and ofn on both sides;
# the fuzzywuzzy ratio is rescaled from 0-100 to between 0 and 1
def fn(a, b):
    try:
        result = float(max(
            fuzz.ratio(str(a['fn']), str(b['fn'])),
            fuzz.ratio(str(a['fn']), str(b['ofn'])),  # includes matching with other names
            fuzz.ratio(str(b['fn']), str(a['ofn'])),
            fuzz.ratio(str(b['ofn']), str(a['ofn'])),
        ) / 100)
        return result
    except TypeError as e:
        import traceback
        print(traceback.format_exc())
        display(a['fn'], a['ofn'], b['fn'], b['ofn'])
        return 0

# returns True (scored as 1) if the birth years are within 1 year of each other
def by1(a, b):
    return (abs(a['byear'] - b['byear']) <= 1)
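# As a quick sanity check, we can exercise the helpers and the resolver on a tiny, made-up family. The rows below are hypothetical and are not taken from either dataset.
# In[ ]:
# Hypothetical mini-family (source = FAR-like, target = WRA-like) to exercise the scoring helpers and the resolver.
far_toy = pd.DataFrame({'fn': ['george', 'mary'], 'ofn': ['joji', None], 'byear': [1921.0, 1925.0]}, index=[1, 2])
wra_toy = pd.DataFrame({'fn': ['joji', 'mary'], 'ofn': [None, None], 'byear': [1922.0, 1925.0]}, index=[10, 11])
print(fn(far_toy.loc[1], wra_toy.loc[10]))   # 1.0 via the ofn cross-check
print(by1(far_toy.loc[1], wra_toy.loc[10]))  # True: birth years within one year
display(resolveFamilyMembers(far_toy, wra_toy, debug=False))  # should pair row 1 -> 10 and row 2 -> 11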
# ## Load the big datasets into Pandas data frames
# In[45]:
farfile = './far_cleanup-09-26-2021.xlsx'
wrafile = './wra_cleanup-09-26-2021.csv'
na = ['Unknown', 'unknown', 'Unk.', 'UNK', 'unk.', 'Unk', 'unk', ' ', ' ', ' ']
fardf = pd.read_excel(farfile,dtype=str,na_values=na,keep_default_na=True)
wradf = pd.read_csv(wrafile,dtype=str,na_values=na,keep_default_na=True)
# ## Create consistent labels for the significant columns across datasets
#
# We are going to do a lot of work with these particular columns, so let's use some short, easy labels and define the data types we want:
#
# * m_pseudoid - str - a string that is unique to each row in the dataset (mostly) and used to verify results against a reference dataset.
# * famid - str - WRA family id number (also used in FAR)
# * fn - str - first name
# * ofn - str - other first name(s)
# * ln - str - last name
# * byear - float - birth year
#
# We are also going to reindex, starting with 1 instead of zero. This makes it easier to create our unique 'id' column. The 'id' column is a globally unique url for the line within the source file.
# In[46]:
# first step, create uniform column names across dataframes
wradf.rename(columns = {'NEW-m_lastname':'ln', 'NEW-m_firstname':'fn', 'NEW-m_familyno_normal':'famid', 'NEW-m_birthyear':'byear'}, inplace = True)
wradf['row'] = wradf.index.map(lambda x: x+1)
wradf['id'] = 'http://densho.org/data/WRA.csv#' + wradf['row'].map(str)
wradf['ofn'] = None # not in WRA data
wradf.set_index('row', inplace=True, verify_integrity=True)
fardf.rename(columns = {'NEW-last_name_corrected':'ln', 'NEW-first_name_corrected':'fn', 'NEW-other_names':'ofn', 'NEW-family_number':'famid', 'NEW-year_of_birth':'byear'}, inplace = True)
fardf['row'] = fardf.index.map(lambda x: x+1)
fardf['id'] = 'http://densho.org/data/FAR.csv#' + fardf['row'].map(str)
fardf['m_pseudoid'] = fardf['FAR Exit Dataset'] + ":" + fardf['original_order'] + ":" + fardf['far_line_id']
fardf.set_index('row', inplace=True, verify_integrity=True)
# fardf.dropna(subset=['m_pseudoid'], inplace=True) # drop rows without pseudoid
# fardf.dropna(subset=['fn'], inplace=True) # drop rows without first name
# fardf.dropna(subset=['famid'], inplace=True) # no longer dropping empty famid rows b/c trying soundex
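# A quick look at the relabeled frames: the index now starts at 1, `ofn` is empty for the WRA data by construction, and `id` is a per-line URL.
# In[ ]:
display(wradf[['id', 'famid', 'ln', 'fn', 'ofn', 'byear']].head())
display(fardf[['id', 'famid', 'ln', 'fn', 'ofn', 'byear']].head())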
# ## Gold Standard Reference Dataframe
#
# These matches were performed manually on a subset of FAR records, based on the list of internees who were transferred to North Dakota after the event described as a "riot". We are taking the Excel file and rearranging and simplifying that data in order to have a simple list of expected matches. Each line in the Excel file contains two FAR ids that are expected to match one WRA id.
#
# We will use this data as the "ground truth" to measure the accuracy of algorithmic entity resolution. The `gold` data frame holds only the id columns; the additional columns remain in `golddf`.
# In[47]:
matchesfile = './Transfer_NDak_100-male_Jul-3-1945_CLEANED.xlsx'
matches_src_df = pd.read_excel(matchesfile,dtype=str,na_values=na,keep_default_na=True)
golddf = matches_src_df.loc[2:101, ['WRA_pseudoid', 'FAR_Exit_Dataset-original_order-far_line_id', 'FAR_Exit_Dataset-original_order-far_line_id.1', 'LastName', 'FirstName']]
golddf = golddf.merge(wradf, left_on=['WRA_pseudoid'], right_on=['m_pseudoid'], how='left')
golddf = golddf.loc[golddf.index.repeat(2),:].reset_index(drop=True)
idx_duplicate = golddf.duplicated(keep="first")
golddf.loc[idx_duplicate, 'farid'] = golddf['FAR_Exit_Dataset-original_order-far_line_id.1']
golddf.loc[~idx_duplicate, 'farid'] = golddf['FAR_Exit_Dataset-original_order-far_line_id']
golddf.rename(columns={'WRA_pseudoid': 'wraid'}, inplace=True)
gold = golddf.loc[:,['wraid', 'farid']]
gold
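# The repeat/duplicated reshaping above is a little opaque, so here is the same trick on a toy frame with hypothetical ids: each input row carries two FAR ids, and duplicating the row lets us pull a different id column into `farid` for each copy.
# In[ ]:
toy = pd.DataFrame({'wraid': ['w1', 'w2'], 'far_a': ['f1a', 'f2a'], 'far_b': ['f1b', 'f2b']})
toy = toy.loc[toy.index.repeat(2), :].reset_index(drop=True)  # two copies of every row
dup = toy.duplicated(keep='first')                            # marks the second copy of each pair
toy.loc[dup, 'farid'] = toy['far_b']
toy.loc[~dup, 'farid'] = toy['far_a']
display(toy[['wraid', 'farid']])  # w1/f1a, w1/f1b, w2/f2a, w2/f2b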
# In[48]:
gold[gold.farid.str.contains('NOT') == True]
# ## Combine our FAR and WRA datasets.
# In[49]:
# alldf = pd.concat([wradf, fardf], axis=0, keys=['wra', 'far'])
# In[50]:
# alldf.head()
# ## Use case-insensitive strings for names
#
# We want to treat most of our columns as case-insensitive strings. `casefold()` is a string method that gives more consistent case-insensitive matching across languages than simply calling `lower()`; a small illustration follows the cleaning cell below.
# In[51]:
# deal with empty and missing strings => None
def clean(alldf):
    alldf['famid'] = alldf['famid'].astype(str)
    alldf['fn'] = alldf['fn'].astype(str)
    alldf['ln'] = alldf['ln'].astype(str)
    alldf['ofn'] = alldf['ofn'].astype(str)
    # Casefold is like lowercase, but better for consistent case-insensitive matching in more languages
    alldf['famid'] = alldf['famid'].str.casefold()
    alldf['fn'] = alldf['fn'].str.casefold()
    alldf['ln'] = alldf['ln'].str.casefold()
    alldf['ofn'] = alldf['ofn'].str.casefold()
    alldf.replace(r'^\s*$|^none$', value=np.nan, regex=True, inplace=True)
    alldf.loc[:, 'ofn'] = alldf.loc[:, 'ofn'].where(alldf.ofn.notna(), None)
    alldf.loc[:, 'fn'] = alldf.loc[:, 'fn'].where(alldf.fn.notna(), None)

clean(fardf)
clean(wradf)
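# The promised illustration: `casefold()` folds more aggressively than `lower()`; for example, German 'ß' becomes 'ss', so two spellings of the same word compare equal.
# In[ ]:
print('Straße'.lower())     # 'straße'
print('Straße'.casefold())  # 'strasse'
print('Strasse'.casefold() == 'Straße'.casefold())  # True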
# ## Clean the family ids
#
# Some of the family ids in the source data may not make sense. Other family ids may have spaces at the start or the end that need to be trimmed. Sometimes the family id string is just a number. Others are a combination of number and letter code, with varying separator characters, such as hyphens and spaces. Others are two numbers separated by a slash. Perhaps this indicates that someone has a new family in addition to their original family ID.
#
# Let's first use a regex to find all the rows that do not match our expectations.
# In[52]:
weird_famids = fardf[ fardf['famid'].str.contains(r'^\d+$', na=True) == False ][['famid']]
display(weird_famids)
print(weird_famids.count())
# So we have 11168 weird Fam IDs.
#
# We see all of these letter and number patterns in the data:
# * xx-11-xx
# * xx 11 xx
# * 11-xx-11
# * 11 xx 11
# * xx-11
# * xx 11
# * 11-xx
# * 11 xx
#
# These patterns are not consistent in the WRA or the FAR data. We'll have to make them consistent if we want them to match up. (The letter and number segments above can be longer than two characters.)
#
# Some of these weird values record that the family ID was unknown when the record was created; "unk", "unk.", or "unknown". Others may have been known at the time, but are now \[illegible\]. Only one row contains a note rather than a number.
#
# For our purposes today, we are going to treat all of the "unknown" family IDs as NaN ("not a number"), which effectively means no useful value is available.
# In[53]:
foofid = fardf.famid.str.replace(r'^([a-zA-Z]+)[\s-](\d+)$', lambda m: m.group(1).lower()+'-'+m.group(2), case=False, regex=True)
foofid = foofid.str.replace(r'^(\d+)[\s-]([a-zA-Z]+)$', lambda m: m.group(2).lower()+'-'+m.group(1), case=False, regex=True)
foofid = foofid.str.replace(r'^(\d+)[\s-]([a-zA-Z]+)[\s-](\d+)$', lambda m: m.group(1)+'-'+m.group(2).lower()+'-'+m.group(3), case=False, regex=True)
foofid = foofid.str.replace(r'^([a-zA-Z]+)[\s-](\d+)[\s-]([a-zA-Z]+)$', lambda m: m.group(1).lower()+'-'+m.group(2)+'-'+m.group(3).lower(), case=False, regex=True)
#foofid[ foofid.str.contains(' ', na=False) ]
display(fardf.dtypes)
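# To see what the normalization above does, here is a spot check on a few hypothetical family-id strings (mirroring the first three rewrite rules; these values are not rows from the FAR file).
# In[ ]:
sample = pd.Series(['XY 123', '123 XY', '123 xy 45', '4567'])
sample = sample.str.replace(r'^([a-zA-Z]+)[\s-](\d+)$', lambda m: m.group(1).lower()+'-'+m.group(2), regex=True)
sample = sample.str.replace(r'^(\d+)[\s-]([a-zA-Z]+)$', lambda m: m.group(2).lower()+'-'+m.group(1), regex=True)
sample = sample.str.replace(r'^(\d+)[\s-]([a-zA-Z]+)[\s-](\d+)$', lambda m: m.group(1)+'-'+m.group(2).lower()+'-'+m.group(3), regex=True)
display(sample)  # -> xy-123, xy-123, 123-xy-45, 4567 (plain integers untouched)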
# In[54]:
fardf[ fardf['famid'].str.contains(r'^\d+$', na=True) == False ][['famid']] ## the weird Fam IDs will only be gone once the normalized foofid values are written back to fardf['famid']
# In[55]:
print('Fam ID NaN count: %s / %s' % (fardf['famid'].isna().sum(), fardf['famid'].count()))
# ## Clean birth year data
#
# Some of the birth years are read as strings by Pandas. This includes values such as "17 y". We can clean this up with a regular expression that extracts the leading numeric digits. Since some of the birth years are missing (and therefore NaN, "not a number"), we will set the data type of the series to float: we can do the necessary birth-year arithmetic on floats, and NaN is itself a float.
# In[56]:
fardf['byear'] = fardf['byear'].str.replace(r'(\d+).*', lambda m: m.group(1), regex=True)
fardf['byear'] = fardf['byear'].astype(str).astype(float)
wradf['byear'] = wradf['byear'].str.replace(r'(\d+).*', lambda m: m.group(1), regex=True)
wradf['byear'] = wradf['byear'].astype(str).astype(float)
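# A quick illustration of the birth-year cleanup on hypothetical values: the leading digits are extracted and NaN survives the float conversion.
# In[ ]:
demo = pd.Series(['1921', '17 y', np.nan])
demo = demo.str.replace(r'(\d+).*', lambda m: m.group(1), regex=True)
display(demo.astype(float))  # 1921.0, 17.0, NaN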
# ## Test the `resolveFamilyMembers()` function on a single family
#
# Here we are modifying the FAR dataframe using the matches returned by the function. Since the index of the matches is the same index as the original FAR dataframe, we can simply assign the match series to a new column and the indices will be aligned for us.
# In[62]:
# Look at our person matching data problem w/in a family
fam_far = fardf[fardf['famid']=='7423'][['famid','ln','fn','ofn','byear']]
fam_wra = wradf[wradf['famid']=='7423'][['famid','ln','fn','ofn','byear']]
display(fam_far)
all_matchesdf2 = pd.DataFrame()
matches = resolveFamilyMembers(fam_far, fam_wra, debug=True)
display(matches)
all_matchesdf2 = pd.concat([all_matchesdf2, matches])
display(all_matchesdf2)
fardf['wra row'] = all_matchesdf2.match
fardf[fardf['famid']=='7423'][['famid','ln','fn','ofn','byear', 'wra row']]
# ## The Whole Enchilada
#
# Let's see if we can perform matching against the entire dataset. What could go wrong?
# In[63]:
fams = 10
skip = 5000
count = 0
all_matchesdf = pd.DataFrame()
unmatched_famids = pd.DataFrame(columns=['dataset', 'famid'])
for famid, frame in fardf.groupby(['famid']):  # , 'FAR Exit Dataset']):
    # count = count + 1
    # if count < skip:
    #     continue
    # if count >= skip + fams:
    #     break
    try:
        wra_fam = wradf[wradf['famid'] == famid]
        if wra_fam.empty:
            unmatched_famids.loc[len(unmatched_famids.index)] = ['far', famid]
            continue
        matches = resolveFamilyMembers(frame, wra_fam, debug=False)
        all_matchesdf = pd.concat([all_matchesdf, matches])
    except KeyError as e:
        display(e)
unmatched_famids.to_csv('./unmatched_famids.csv')
#fardf['wra row'] = all_matchesdf.match
all_matchesdf.head()
# In[ ]:
#
display(all_matchesdf.count())
#fardf['wra row'] = all_matchesdf[0]
# In[ ]:
unmatched_famids.groupby(['dataset']).count()
# In[ ]:
# fardf.dropna(subset=['famid'], inplace=True)
fardf.loc[7000:7010, :][['famid','ln','fn','ofn','byear', 'wra row']]
# In[ ]:
print( 'unmatched with famid: ' + str(fardf['wra row'].isna().sum()) + ' / '+ str(fardf['famid'].count()))
print( 'unmatched with famid percent: ' + str(fardf['wra row'].isna().sum()/fardf['famid'].count()*100))
# # Measure Accuracy against Gold Standard Dataset of 204 FAR records
#
# 199 of the 204 FAR records could be matched up by hand with their WRA records. Now we want to see how many of these same records were matched by the algorithm so far.
# In[ ]:
cnt = 0
matched = 0
for index, row in gold.iterrows():
    cnt += 1
    try:
        far_row = fardf[fardf['m_pseudoid'] == row['farid']]
        wra_row = wradf.index[wradf['m_pseudoid'] == row['wraid']].tolist()[0]
        if (wra_row is not None) and (far_row['wra row'].tolist()[0] == wra_row):
            matched += 1
    except:
        pass
print('matched: %s / %s'%(matched, cnt))
gold_unmatched = gold.farid.str.contains('NOT').sum()
print('gold matched: %s / %s'%(cnt-gold_unmatched, cnt))
# # Try resolving the remainder with family name soundex segments
# In[ ]:
from libindic.soundex import Soundex
instance = Soundex()
fardf['ln_soundex'] = fardf[fardf['ln'].isna() == False]['ln'].apply(instance.soundex)
wradf['ln_soundex'] = wradf[wradf['ln'].isna() == False]['ln'].apply(instance.soundex)
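# A quick peek at what the soundex blocking key looks like. The names below are hypothetical; similar-sounding spellings of a surname should generally map to the same code.
# In[ ]:
for name in ['tanaka', 'tanaca', 'tanka']:
    print(name, instance.soundex(name))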
# In[ ]:
sndx_matchesdf = pd.DataFrame()
unmatched_lnsx = pd.DataFrame(columns=['dataset', 'lnsx'])
unmatched_far = fardf[fardf['wra row'].isna()]
for lnsx, frame in unmatched_far.groupby(['ln_soundex']):
    try:
        wra = wradf[wradf['ln_soundex'] == lnsx]
        if wra.empty:
            continue
        matches = resolveFamilyMembers(frame, wra, debug=False)
        sndx_matchesdf = pd.concat([sndx_matchesdf, matches])
        # display(fardf[fardf['famid']==fam][['famid','pid','fn','ofn','byear', 'wra row']])
    except KeyError as e:
        # display(e)
        unmatched_lnsx.loc[len(unmatched_lnsx.index)] = ['far', lnsx]
        continue
unmatched_lnsx.to_csv('./unmatched_lnsx.csv')
# In[ ]:
sndx_matchesdf.count()
# In[ ]:
fardf['wra row sndx'] = sndx_matchesdf.match
# In[ ]:
fardf.to_csv('./MATCHED FAR.csv')
# In[ ]:
fardf[~fardf['wra row'].isna() | ~fardf['wra row sndx'].isna()].count()
# In[ ]:
1- 131790 / 151728
# In[ ]:
151728 - 131790
# In[ ]:
cnt = 0
matched = 0
famid_errors = 0
soundex_errors = 0
for index, row in gold.iterrows():
    cnt += 1
    try:
        far_row = fardf[fardf['m_pseudoid'] == row['farid']]
        wraidx = wradf.index[wradf['m_pseudoid'] == row['wraid']].tolist()[0]
        if (far_row['wra row'].tolist()[0] == wraidx) | (far_row['wra row sndx'].tolist()[0] == wraidx):
            matched += 1
        if (far_row['wra row'].tolist()[0] != wraidx):  # & (~far_row['wra row'].isna()):
            famid_errors += 1
        if (far_row['wra row sndx'].tolist()[0] != wraidx):  # & (~far_row['wra row sndx'].isna()):
            soundex_errors += 1
    except:
        pass
print('matched: %s / %s'%(matched, cnt))
print('famid errors: %s / %s'%(famid_errors, cnt))
print('soundex errors: %s / %s'%(soundex_errors, cnt))
gold_unmatched = gold.farid.str.contains('NOT').sum()
print('gold matched: %s / %s'%(cnt-gold_unmatched, cnt))
# ## Family ID discrepancies
#
# We can see above that many family IDs were found in one dataset but not in the other. If you examine these IDs you can see that formatting differences are common, with hyphens and spaces used interchangeably. Also, the letter codes sometimes appear before and sometimes after the numeric portion of the ID.
#
# We are going to need to create consistency for the resolver to work on these families. Then we'll see how many unmatched family IDs are left.
#
# To create consistency we need to capture the letter and number portions of the IDs and reformat them in one consistent way. We'll choose xx-1111 as the target format. We're only going to reformat IDs that have a letter portion, leaving the plain integers alone.
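# As a first pass, here is a minimal sketch of what such a normalizer could look like. It only handles the two-part letter/number patterns and leaves everything else (including the three-part ids) alone; the regexes and the helper name are illustrative and have not been applied to the frames above.
# In[ ]:
import re

def normalize_famid(famid):
    # Sketch: rewrite ids with a letter portion to the 'xx-1111' target format; plain integers pass through.
    if not isinstance(famid, str):
        return famid
    m = re.match(r'^\s*([a-zA-Z]+)[\s-]*(\d+)\s*$', famid) or \
        re.match(r'^\s*(\d+)[\s-]*([a-zA-Z]+)\s*$', famid)
    if m:
        letters, digits = sorted(m.groups(), key=str.isdigit)  # letter group first, digit group second
        return letters.lower() + '-' + digits
    return famid.strip()

print(normalize_famid('XY 1234'), normalize_famid('1234-xy'), normalize_famid('5678'))  # xy-1234 xy-1234 5678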
# In[ ]:
alldf[ alldf['famid'] =='15170'][['fn', 'famid', 'ofn']]  # requires the combined alldf frame (see the commented-out concat above)
# In[ ]:
alldf[ alldf['fn'] =='harry'][['fn', 'famid', 'ofn', 'ln']]  # 'ln' is the last-name label defined earlier
# In[ ]: