Here we define a function that takes two data-frame-like objects with our chosen column names and attempts to identify matches between them. It returns a data frame of matches indexed by the source rows.
import pandas as pd
import numpy as np
from IPython.display import display, HTML
# from datetime import datetime, date
from fuzzywuzzy import fuzz
def resolveFamilyMembers( sourceDF, targetDF, debug=False ): # source, target
    result = pd.DataFrame(None, index=sourceDF.index, columns=['match'])
    threshold = float(.5)
    xdf = pd.DataFrame(float(0), index=targetDF.index, columns=sourceDF.index )
    if debug:
        display(HTML('<h3>grid of possible matches:</h3>'), xdf)
    # Visit every cell in the grid and plug in a similarity value between the FAR (cols) and WRA (index) axis..
    for (id1, row1) in targetDF.iterrows():
        for (id2, row2) in sourceDF.iterrows():
            xdf.at[id1, id2] = compare(row1, row2)
    if debug:
        display(HTML('<h3>grid with similarity scores:</h3>'), xdf)
    maxMatches = xdf.idxmax() # now we want to find the best matches for each FAR record
    if debug:
        display(HTML('<h3>The first, best scores for each column:</h3>'), maxMatches)
    for id2 in maxMatches.index:
        if xdf.loc[maxMatches[id2], id2] > threshold: # max could be zero, so check that we have acceptable score
            # sourceDF.at[id2, 'pid'] = targetDF.loc[maxMatches[id2]]['pid'] # assign the pid to the FAR row
            result.at[id2, 'match'] = maxMatches[id2]
            #xdf.drop([maxMatches[id2]], inplace=True) # drop the WRA row we used
            xdf.drop(columns=[id2], inplace=True) # drop the FAR column we used
    if debug:
        display(HTML('<h3>Possibilities remaining:</h3>'), xdf)
    if xdf.size == 1: # if only one possibility remains, choose it
        if debug:
            print('just one possibility left!')
        id1 = xdf.index[0]
        id2 = xdf.columns[0]
        # sourceDF.at[id2, 'pid'] = targetDF.loc[id1]['pid']
        result.at[id2, 'match'] = id1
    return result
# Scoring is weird when there are multiple values to match..
# Using fn/ofn, byear+-1 for matching so far..
def compare(a, b):
    result = float(max( # use whichever score is higher in this list..
        min(fn(a,b), by1(a,b)), # if either fn() or by1() returns zero then this term is zero..
        0.5*(by1(a,b)), # if we only match the byear, the score is just 0.5 (not enough to pass the 0.5 threshold on its own)
        #### another compare method
        ))
    return result
# returns the best fuzzy-match score among the fn/ofn combinations of the two rows
# the fuzzywuzzy ratio (0-100) is scaled to between 0 and 1
def fn(a, b):
    try:
        result = float(max(
            fuzz.ratio(str(a['fn']), str(b['fn'])),
            fuzz.ratio(str(a['fn']), str(b['ofn'])), # includes matching with other names
            fuzz.ratio(str(b['fn']), str(a['ofn'])),
            fuzz.ratio(str(b['ofn']), str(a['ofn'])),
        )/100)
        return result
    except TypeError as e:
        import traceback
        print(traceback.format_exc())
        display(a['fn'], a['ofn'], b['fn'], b['ofn'])
        return 0
# returns a score of 1 if byears are within 1 year
def by1(a, b):
    return ( abs( a['byear'] - b['byear'] ) <= 1)
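To make the scoring concrete, here is a minimal sketch of how the helpers defined above combine. The two rows are made-up, dictionary-shaped stand-ins for FAR and WRA records (using name values that appear later in this notebook), not real library output.
row_far = {'fn': 'tomee', 'ofn': 'tomoye', 'byear': 1935.0}   # hypothetical FAR-style row
row_wra = {'fn': 'tomoye', 'ofn': None, 'byear': 1935.0}      # hypothetical WRA-style row
print(fn(row_far, row_wra))       # best fuzzy ratio among the fn/ofn pairings, scaled to 0..1
print(by1(row_far, row_wra))      # True: birth years are within one year of each other
print(compare(row_far, row_wra))  # max(min(fn, by1), 0.5*by1) -- exceeds the 0.5 threshold here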
farfile = './far_cleanup-09-26-2021.xlsx'
wrafile = './wra_cleanup-09-26-2021.csv'
na = ['Unknown', 'unknown', 'Unk.', 'UNK', 'unk.', 'Unk', 'unk', ' ', ' ', ' ']
fardf = pd.read_excel(farfile,dtype=str,na_values=na,keep_default_na=True)
wradf = pd.read_csv(wrafile,dtype=str,na_values=na,keep_default_na=True)
We are going to do a lot of work with these particular columns, so let's use some short, easy labels and define the data types we want.
We are also going to reindex, starting with 1 instead of zero. This makes it easier to create our unique 'id' column: a globally unique URL for the line within the source file.
# first step, create uniform column names across dataframes
wradf.rename(columns = {'NEW-m_lastname':'ln', 'NEW-m_firstname':'fn', 'NEW-m_familyno_normal':'famid', 'NEW-m_birthyear':'byear'}, inplace = True)
wradf['row'] = wradf.index.map(lambda x: x+1)
wradf['id'] = 'http://densho.org/data/WRA.csv#' + wradf['row'].map(str)
wradf['ofn'] = None # not in WRA data
wradf.set_index('row', inplace=True, verify_integrity=True)
fardf.rename(columns = {'NEW-last_name_corrected':'ln', 'NEW-first_name_corrected':'fn', 'NEW-other_names':'ofn', 'NEW-family_number':'famid', 'NEW-year_of_birth':'byear'}, inplace = True)
fardf['row'] = fardf.index.map(lambda x: x+1)
fardf['id'] = 'http://densho.org/data/FAR.csv#' + fardf['row'].map(str)
fardf['m_pseudoid'] = fardf['FAR Exit Dataset'] + ":" + fardf['original_order'] + ":" + fardf['far_line_id']
fardf.set_index('row', inplace=True, verify_integrity=True)
# fardf.dropna(subset=['m_pseudoid'], inplace=True) # drop rows without pseudoid
# fardf.dropna(subset=['fn'], inplace=True) # drop rows without first name
# fardf.dropna(subset=['famid'], inplace=True) # no longer dropping empty famid rows b/c trying soundex
These matches were performed manually on a subset of FAR records, based on the list of internees that were transferred to North Dakota after the event described as a "riot". We take the Excel file and rearrange and simplify that data in order to have a simple list of expected matches. Each line in the Excel file contains two FAR ids that are expected to match one WRA id.
We will use this data as the "ground truth" to measure the accuracy of algorithmic entity resolution. The "gold" data frame holds only the id columns; the additional columns are kept in "golddf".
matchesfile = './Transfer_NDak_100-male_Jul-3-1945_CLEANED.xlsx'
matches_src_df = pd.read_excel(matchesfile,dtype=str,na_values=na,keep_default_na=True)
golddf = matches_src_df.loc[2:101, ['WRA_pseudoid', 'FAR_Exit_Dataset-original_order-far_line_id', 'FAR_Exit_Dataset-original_order-far_line_id.1', 'LastName', 'FirstName']]
golddf = golddf.merge(wradf, left_on=['WRA_pseudoid'], right_on=['m_pseudoid'], how='left')
golddf = golddf.loc[golddf.index.repeat(2),:].reset_index(drop=True)
idx_duplicate = golddf.duplicated(keep="first")
golddf.loc[idx_duplicate, 'farid'] = golddf['FAR_Exit_Dataset-original_order-far_line_id.1']
golddf.loc[~idx_duplicate, 'farid'] = golddf['FAR_Exit_Dataset-original_order-far_line_id']
golddf.rename(columns={'WRA_pseudoid': 'wraid'}, inplace=True)
gold = golddf.loc[:,['wraid', 'farid']]
gold
| | wraid | farid |
|---|---|---|
| 0 | 7-manzanar_ajisaka_1890_tatsuo | tulelake1:208:198 |
| 1 | 7-manzanar_ajisaka_1890_tatsuo | manzanar1:36:35 |
| 2 | 6-jerome_arichi_1894_saburo | tulelake1:628:602 |
| 3 | 6-jerome_arichi_1894_saburo | jerome1:293:289 |
| 4 | 2-poston_hagio_1889_takemats | tulelake1:2715:2575 |
| ... | ... | ... |
| 199 | 1-topaz_yamasaki_1914_satoru | topaz1:10959:10587 |
| 200 | 10-tulelake_yoshida_1911_riichi | tulelake1:29776:28342 |
| 201 | 10-tulelake_yoshida_1911_riichi | NOT IN FAR |
| 202 | 3-gilariver_yoshioka_1918_kiyozo | tulelake1:30146:28693 |
| 203 | 3-gilariver_yoshioka_1918_kiyozo | gilariver1:16851:16379 |

204 rows × 2 columns
gold[gold.farid.str.contains('NOT') == True]
| | wraid | farid |
|---|---|---|
| 51 | 10-tulelake_kiyama_1924_jimmie | NOT IN FAR |
| 53 | 10-tulelake_kiyama_1926_tommie | NOT IN FAR |
| 119 | NOT IN WRA | NOT IN FAR |
| 177 | 10-tulelake_toyota_1919_yoshimi | NOT IN FAR |
| 201 | 10-tulelake_yoshida_1911_riichi | NOT IN FAR |
# alldf = pd.concat([wradf, fardf], axis=0, keys=['wra', 'far'])
# alldf.head()
We want to treat most of our columns as case-insensitive strings. casefold() behaves more consistently across different languages than simply calling lower().
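As a quick illustration (a standalone Python snippet, not part of the cleaning pipeline), casefold() folds characters that lower() leaves alone, such as the German sharp s:
s = 'Straße'
print(s.lower())      # 'straße'
print(s.casefold())   # 'strasse'
print('STRASSE'.casefold() == s.casefold())   # True: casefolded forms agree
print('STRASSE'.lower() == s.lower())         # False: lower() keeps the sharp s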
# deal with empty and missing strings => None
def clean(alldf):
    alldf['famid'] = alldf['famid'].astype(str)
    alldf['fn'] = alldf['fn'].astype(str)
    alldf['ln'] = alldf['ln'].astype(str)
    alldf['ofn'] = alldf['ofn'].astype(str)
    # Casefold is like lowercase, but better for consistent case insensitive matching in more languages
    alldf['famid'] = alldf['famid'].str.casefold()
    alldf['fn'] = alldf['fn'].str.casefold()
    alldf['ln'] = alldf['ln'].str.casefold()
    alldf['ofn'] = alldf['ofn'].str.casefold()
    alldf.replace(r'^\s*$|^none$', value=np.nan, regex=True, inplace=True)
    alldf.loc[:, 'ofn'] = alldf.loc[:, 'ofn'].where(alldf.ofn.notna(), None)
    alldf.loc[:, 'fn'] = alldf.loc[:, 'fn'].where(alldf.fn.notna(), None)
clean(fardf)
clean(wradf)
/tmp/ipykernel_261915/2963965774.py:14: FutureWarning: Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  alldf.replace(r'^\s*$|^none$', value=np.nan, regex=True, inplace=True)
Some of the family ids in the source data may not make sense. Other family ids may have spaces at the start or the end that need to be trimmed. Sometimes the family id string is just a number. Others are a combination of number and letter code, with varying separator characters, such as hyphens and spaces. Others are two numbers separated by a slash. Perhaps this indicates that someone has a new family in addition to their original family ID.
Let's first use a regex to find all the rows that do not match our expectations.
weird_famids = fardf[ fardf['famid'].str.contains(r'^\d+$', na=True) == False ][['famid']]
display(weird_famids)
print(weird_famids.count())
| row | famid |
|---|---|
| 40450 | 1932-07-01 00:00:00 |
| 40451 | 1932-07-01 00:00:00 |
| 40452 | 1932-07-01 00:00:00 |
| 41944 | 1933-07-01 00:00:00 |
| 41945 | 1933-07-01 00:00:00 |
| ... | ... |
| 151724 | nan |
| 151725 | nan |
| 151726 | nan |
| 151727 | nan |
| 151728 | nan |

10606 rows × 1 columns

famid    10606
dtype: int64
So we have 10,606 weird Fam IDs.
We see a variety of letter and number patterns in the data. These patterns are not consistent between the WRA and the FAR data, so we'll have to make them consistent if we want them to match up. (The letter and number segments can be longer than two characters.)
Some of these weird values record that the family ID was unknown when the record was created; "unk", "unk.", or "unknown". Others may have been known at the time, but are now [illegible]. Only one row contains a note rather than a number.
For our purposes today, we are going to count all of the "unknown" FIDs as NaN or "not a number", which says that effectively there is no useful value available.
foofid = fardf.famid.str.replace(r'^([a-zA-Z]+)[\s-](\d+)$', lambda m: m.group(1).lower()+'-'+m.group(2), case=False, regex=True)
foofid = foofid.str.replace(r'^(\d+)[\s-]([a-zA-Z]+)$', lambda m: m.group(2).lower()+'-'+m.group(1), case=False, regex=True)
foofid = foofid.str.replace(r'^(\d+)[\s-]([a-zA-Z]+)[\s-](\d+)$', lambda m: m.group(1)+'-'+m.group(2).lower()+'-'+m.group(3), case=False, regex=True)
foofid = foofid.str.replace(r'^([a-zA-Z]+)[\s-](\d+)[\s-]([a-zA-Z]+)$', lambda m: m.group(1).lower()+'-'+m.group(2)+'-'+m.group(3).lower(), case=False, regex=True)
#foofid[ foofid.str.contains(' ', na=False) ]
display(fardf.dtypes)
FAR Exit Dataset               object
original_order                 object
far_line_id                    object
family_number                  object
famid                          object
last_name_corrected            object
ln                             object
last_name_original             object
first_name_corrected           object
first_name_original            object
fn                             object
other_names                    object
ofn                            object
date_of_birth                  object
NEW-date_of_birth              object
year_of_birth                  object
byear                          object
sex                            object
NEW_marital_status             object
citizenship                    object
alien_registration_no.         object
type_of_original_entry         object
NEW-type_of_original_entry     object
NEW-pre-evacuation_address     object
pre-evacuation_state           object
date_of_original_entry         object
type_of_final_departure        object
NEW-type_of_final_departure    object
date_of_final_departure        object
final_departure_state          object
camp_address_original          object
camp_address_block             object
camp_address_barracks          object
camp_address_room              object
reference                      object
notes                          object
id                             object
m_pseudoid                     object
dtype: object
fardf[ fardf['famid'].str.contains(r'^\d+$', na=True) == False ][['famid']] ## weird Fam IDs should be gone
| row | famid |
|---|---|
| 40450 | 1932-07-01 00:00:00 |
| 40451 | 1932-07-01 00:00:00 |
| 40452 | 1932-07-01 00:00:00 |
| 41944 | 1933-07-01 00:00:00 |
| 41945 | 1933-07-01 00:00:00 |
| ... | ... |
| 151724 | nan |
| 151725 | nan |
| 151726 | nan |
| 151727 | nan |
| 151728 | nan |

10606 rows × 1 columns
print('Fam ID NaN count: %s / %s' % (fardf['famid'].isna().sum(), fardf['famid'].count()))
Fam ID NaN count: 0 / 151728
Some of the birth years are considered to be strings by Pandas. This includes values such as "17 y". We can clean this up with a regular expression that extracts the numeric digits. Since some of the birth years are missing and therefore NaN or "not a number", we will set the data type of the series to "float", since we can do the necessary birth year arithmetic on a float and NaN is considered a float.
fardf['byear'] = fardf['byear'].str.replace(r'(\d+).*', lambda m: m.group(1), regex=True)
fardf['byear'] = fardf['byear'].astype(str).astype(float)
wradf['byear'] = wradf['byear'].str.replace(r'(\d+).*', lambda m: m.group(1), regex=True)
wradf['byear'] = wradf['byear'].astype(str).astype(float)
The resolveFamilyMembers() function on a single family

Here we are modifying the FAR dataframe using the matches returned by the function. Since the index of the matches is the same index as the original FAR dataframe, we can simply assign the match series to a new column and the indices will be aligned for us.
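Before applying that to the real frames, here is a minimal sketch (a toy frame and a toy match series, not the actual data) of the index alignment being relied on: pandas assigns by index label, not by position, so any source row without a match simply ends up as NaN.
toy = pd.DataFrame({'fn': ['agnes', 'hama', 'janice']}, index=[27510, 27511, 27512])  # toy stand-in for a FAR family
toy_matches = pd.Series({27512: 113, 27510: 80})  # out of order, and no match for 27511
toy['wra row'] = toy_matches                      # assignment aligns on the index labels
print(toy)                                        # 27511 gets NaN in the new column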
# Look at our person matching data problem w/in a family
fam_far = fardf[fardf['famid']=='7423'][['famid','ln','fn','ofn','byear']]
fam_wra = wradf[wradf['famid']=='7423'][['famid','ln','fn','ofn','byear']]
display(fam_far)
all_matchesdf2 = pd.DataFrame()
matches = resolveFamilyMembers(fam_far, fam_wra, debug=True)
display(matches)
all_matchesdf2 = pd.concat([all_matchesdf2, matches])
display(all_matchesdf2)
fardf['wra row'] = all_matchesdf2.match
fardf[fardf['famid']=='7423'][['famid','ln','fn','ofn','byear', 'wra row']]
| row | famid | ln | fn | ofn | byear |
|---|---|---|---|---|---|
| 27510 | 7423 | abe | agnes | harumi | 1920.0 |
| 27511 | 7423 | abe | hama | nan | 1900.0 |
| 27512 | 7423 | abe | janice | mikiye | 1923.0 |
| 27513 | 7423 | abe | shuji | nan | 1878.0 |
| 27514 | 7423 | abe | shuzo | nan | 1874.0 |
| 27515 | 7423 | abe | tami | nan | 1874.0 |
| 27516 | 7423 | abe | tomee | tomoye | 1935.0 |
| row | 27510 | 27511 | 27512 | 27513 | 27514 | 27515 | 27516 |
|---|---|---|---|---|---|---|---|
| 73 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 80 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 113 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 258 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 260 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 276 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 289 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| row | 27510 | 27511 | 27512 | 27513 | 27514 | 27515 | 27516 |
|---|---|---|---|---|---|---|---|
| 73 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 80 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 113 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 258 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 260 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 |
| 276 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 |
| 289 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
row
27510     80
27511     73
27512    113
27513    258
27514    260
27515    260
27516    289
dtype: int64
row |
---|
row |
73 |
80 |
113 |
258 |
260 |
276 |
289 |
| row | match |
|---|---|
| 27510 | 80 |
| 27511 | 73 |
| 27512 | 113 |
| 27513 | 258 |
| 27514 | 260 |
| 27515 | 260 |
| 27516 | 289 |
| row | match |
|---|---|
| 27510 | 80 |
| 27511 | 73 |
| 27512 | 113 |
| 27513 | 258 |
| 27514 | 260 |
| 27515 | 260 |
| 27516 | 289 |
| row | famid | ln | fn | ofn | byear | wra row |
|---|---|---|---|---|---|---|
| 27510 | 7423 | abe | agnes | harumi | 1920.0 | 80 |
| 27511 | 7423 | abe | hama | nan | 1900.0 | 73 |
| 27512 | 7423 | abe | janice | mikiye | 1923.0 | 113 |
| 27513 | 7423 | abe | shuji | nan | 1878.0 | 258 |
| 27514 | 7423 | abe | shuzo | nan | 1874.0 | 260 |
| 27515 | 7423 | abe | tami | nan | 1874.0 | 260 |
| 27516 | 7423 | abe | tomee | tomoye | 1935.0 | 289 |
Let's see if we can perform matching against the entire dataset. What could go wrong?
fams = 10
skip = 5000
count = 0
all_matchesdf = pd.DataFrame()
unmatched_famids = pd.DataFrame(columns=['dataset', 'famid'])
for famid, frame in fardf.groupby(['famid']): # , 'FAR Exit Dataset']):
    # count = count + 1
    # if count < skip:
    #     continue
    # if count >= skip + fams:
    #     break
    try:
        wra_fam = wradf[wradf['famid'] == famid]
        if wra_fam.empty:
            unmatched_famids.loc[len(unmatched_famids.index)] = ['far', famid]
            continue
        matches = resolveFamilyMembers(frame, wra_fam, debug=False)
        all_matchesdf = pd.concat([all_matchesdf, matches])
    except KeyError as e:
        display(e)
unmatched_famids.to_csv('./unmatched_famids.csv')
#fardf['wra row'] = all_matchesdf.match
all_matchesdf.head()
#
display(all_matchesdf.count())
#fardf['wra row'] = all_matchesdf[0]
match    119953
dtype: int64
unmatched_famids.groupby(['dataset']).count()
| dataset | famid |
|---|---|
| far | 1490 |
# fardf.dropna(subset=['famid'], inplace=True)
fardf.loc[7000:7010, :][['famid','ln','fn','ofn','byear', 'wra row']]
| row | famid | ln | fn | ofn | byear | wra row |
|---|---|---|---|---|---|---|
| 7000 | 2324 | yamada | mitsuzo | yugetsu | 1903.0 | 100780 |
| 7001 | 2324 | yamada | mitzi | mitsue | 1925.0 | 100782 |
| 7002 | 2324 | yamada | yuriko | nan | 1929.0 | 101035 |
| 7003 | 2325 | segawa | chiyoko | angie | 1918.0 | 76628 |
| 7004 | 2325 | segawa | chiyoko | nan | 1918.0 | 76628 |
| 7005 | 2325 | segawa | sharleen | chizuko | 1944.0 | NaN |
| 7006 | 2325 | segawa | ted | jr. | 1941.0 | 76647 |
| 7007 | 2325 | segawa | theodore | soichi | 1941.0 | 76647 |
| 7008 | 2325 | segawa | theodore | nan | 1914.0 | 76648 |
| 7009 | 2326 | osugi | harumi | nan | 1914.0 | 70154 |
| 7010 | 2326 | osugi | mikio | donald | 1934.0 | 70161 |
print( 'unmatched with famid: ' + str(fardf['wra row'].isna().sum()) + ' / '+ str(fardf['famid'].count()))
print( 'unmatched with famid percent: ' + str(fardf['wra row'].isna().sum()/fardf['famid'].count()*100))
unmatched with famid: 31775 / 151728
unmatched with famid percent: 20.94208056522198
199 of the 204 FAR records were matched by hand with their WRA records. Now we want to see how many of these same records were matched by the algorithm so far.
cnt = 0
matched = 0
for index, row in gold.iterrows():
    cnt += 1
    try:
        far_row = fardf[fardf['m_pseudoid'] == row['farid']]
        wra_row = wradf.index[wradf['m_pseudoid'] == row['wraid']].tolist()[0]
        if (wra_row != None) & (far_row['wra row'].tolist()[0] == wra_row):
            matched += 1
    except:
        pass
print('matched: %s / %s'%(matched, cnt))
gold_unmatched = gold.farid.str.contains('NOT').sum()
print('gold matched: %s / %s'%(cnt-gold_unmatched, cnt))
matched: 155 / 204
gold matched: 199 / 204
from libindic.soundex import Soundex
instance = Soundex()
fardf['ln_soundex'] = fardf[fardf['ln'].isna() == False]['ln'].apply(instance.soundex)
wradf['ln_soundex'] = wradf[wradf['ln'].isna() == False]['ln'].apply(instance.soundex)
sndx_matchesdf = pd.DataFrame()
unmatched_lnsx = pd.DataFrame(columns=['dataset', 'lnsx'])
unmatched_far = fardf[fardf['wra row'].isna()]
for lnsx, frame in unmatched_far.groupby(['ln_soundex']):
    try:
        wra = wradf[wradf['ln_soundex'] == lnsx]
        if wra.empty:
            continue
        matches = resolveFamilyMembers(frame, wra, debug=False)
        sndx_matchesdf = pd.concat([sndx_matchesdf, matches])
        # display(fardf[fardf['famid']==fam][['famid','pid','fn','ofn','byear', 'wra row']])
    except KeyError as e:
        # display(e)
        unmatched_lnsx.loc[len(unmatched_lnsx.index)] = ['far', lnsx]
        continue
unmatched_lnsx.to_csv('./unmatched_lnsx.csv')
sndx_matchesdf.count()
match    11837
dtype: int64
fardf['wra row sndx'] = sndx_matchesdf.match
fardf.to_csv('./MATCHED FAR.csv')
fardf[~fardf['wra row'].isna() | ~fardf['wra row sndx'].isna()].count()
FAR Exit Dataset               131790
original_order                 131790
far_line_id                    131130
family_number                  131669
famid                          131790
last_name_corrected            131789
ln                             131790
last_name_original             131694
first_name_corrected           131783
first_name_original            131783
fn                             131790
other_names                     48898
ofn                            131790
date_of_birth                  131200
NEW-date_of_birth              131200
year_of_birth                  131217
byear                          131223
sex                            131697
NEW_marital_status             131683
citizenship                    131707
alien_registration_no.          49199
type_of_original_entry         131718
NEW-type_of_original_entry     131710
NEW-pre-evacuation_address     129817
pre-evacuation_state           129503
date_of_original_entry         131720
type_of_final_departure        131177
NEW-type_of_final_departure    131176
date_of_final_departure        131182
final_departure_state          117194
camp_address_original           52351
camp_address_block              62573
camp_address_barracks           62137
camp_address_room               62105
reference                         565
notes                            2893
id                             131790
m_pseudoid                     131130
wra row                        131790
ln_soundex                     131790
wra row sndx                    11837
dtype: int64
1 - 131790 / 151728
0.13140620056944008
151728 - 131790
19938
cnt = 0
matched = 0
famid_errors = 0
soundex_errors = 0
for index, row in gold.iterrows():
    cnt += 1
    try:
        far_row = fardf[fardf['m_pseudoid'] == row['farid']]
        wraidx = wradf.index[wradf['m_pseudoid'] == row['wraid']].tolist()[0]
        if (far_row['wra row'].tolist()[0] == wraidx) | (far_row['wra row sndx'].tolist()[0] == wraidx):
            matched += 1
        if (far_row['wra row'].tolist()[0] != wraidx): # & (~far_row['wra row'].isna()):
            famid_errors += 1
        if (far_row['wra row sndx'].tolist()[0] != wraidx): # & (~far_row['wra row sndx'].isna()):
            famid_errors += 1
    except:
        pass
print('matched: %s / %s'%(matched, cnt))
print('famid errors: %s / %s'%(famid_errors, cnt))
print('soundex errors: %s / %s'%(soundex_errors, cnt))
gold_unmatched = gold.farid.str.contains('NOT').sum()
print('gold matched: %s / %s'%(cnt-gold_unmatched, cnt))
matched: 168 / 204
famid errors: 183 / 204
soundex errors: 0 / 204
gold matched: 199 / 204
We can see above that many family IDs were found in one dataset but not in the other. If you examine these IDs you can see that formatting differences are common, with hyphens and spaces used interchangeably. The letter codes also sometimes appear before and sometimes after the numeric portion of the ID.
We are going to need to create consistency for the resolver to work on these families. Then we'll see how many unmatched family IDs are left.
To create consistency we need to capture the letter and the number portions of the IDs and reformat them in one consistent way. We'll choose xx-1111 as the target format. We're only going to reformat IDs that have a letter portion, leaving the plain integers alone.
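Here is a minimal sketch of that reformatting as a hypothetical helper (normalize_famid is an illustration of the xx-1111 target described above, not the notebook's final cleanup code): IDs with a letter portion become lowercase letters, a hyphen, then the digits; anything else passes through untouched.
import re

def normalize_famid(famid):
    # hypothetical illustration of the xx-1111 target format
    if not isinstance(famid, str):
        return famid
    m = re.fullmatch(r'([a-zA-Z]+)[\s-]*(\d+)', famid.strip())
    if m:
        return m.group(1).lower() + '-' + m.group(2)
    m = re.fullmatch(r'(\d+)[\s-]*([a-zA-Z]+)', famid.strip())
    if m:
        return m.group(2).lower() + '-' + m.group(1)
    return famid  # plain integers and anything unexpected are left alone

print(normalize_famid('XY 1111'))   # 'xy-1111'
print(normalize_famid('1111-xy'))   # 'xy-1111'
print(normalize_famid('1111'))      # '1111', unchanged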
alldf[ alldf['famid'] =='15170'][['fn', 'famid', 'ofn']]
alldf[ alldf['fn'] =='harry'][['fn', 'famid', 'ofn', 'sn']]