#!/usr/bin/env python
# coding: utf-8

# # Parsing Text Data Patterns Line-by-Line
#
# When a text uses natural language, i.e. normal human speech or prose, then data is embedded in
# the particular syntax and lexicon of that language. Natural Language Processing (NLP) employs sophisticated
# models, trained on millions of documents, to parse natural language for meaningful information.
#
# In the case of the Charlotte directory, the text resembles a table with rows and columns more than it
# resembles prose, so there are no sentence structures for NLP to use as hints. Instead, NLP falls back on the
# English lexicon alone to establish parts of speech and relationships. The result is very little data, and what
# there is comes mostly from named entity recognition.
#
# In fact, the lines in our text file follow non-English rules that we can establish and use to extract data directly.
# For instance, each line starts with a description of the household "race" category. Then within households there
# are often members of another race and these are also labeled. After each mention of race there is usually a surname
# and then a given name, but not always. Our job is to describe and encode these rules in formal patterns.
# In[134]:

import json
import os
import re

cwd = os.getcwd()
data_loc = os.path.join(cwd, "data")
output_loc = os.path.join(cwd, "output")

# Find all the text files below the data directory.
allfiles = []
for root, dirs, files in os.walk(data_loc):
    for name in files:
        if name.endswith(".txt"):
            allfiles.append(os.path.join(root, name))
print('%d files' % len(allfiles))

# Read every line of every file into one flat list (line endings are kept).
alltext = []
for path in allfiles:
    with open(path, "r") as a_file:
        alltext.extend(a_file)
print('%d lines of text' % len(alltext))


# In[58]:

def parse(pattern, lines=None):
    """Apply *pattern* to every line and collect the named groups.

    Args:
        pattern: Compiled regular expression; only its ``search`` method
            is used.
        lines: Iterable of text lines to parse. Defaults to the
            module-level ``alltext``.

    Returns:
        A list with one entry per input line, in order: the match's
        ``groupdict()``, or ``None`` when the line does not match.
        Lines that do not match are also printed so they can be inspected.
    """
    if lines is None:
        lines = alltext
    records = []
    for line in lines:
        match = pattern.search(line)
        if match is None:
            print(line)
            records.append(None)
        else:
            records.append(match.groupdict())
    return records


def show_results(records, limit=None):
    """Print each line of ``alltext`` followed by the record parsed from it.

    Args:
        records: Output of ``parse`` for ``alltext`` (same order).
        limit: Maximum number of lines to show; ``None`` shows all of them.
            Slicing is used, so fewer available lines is not an error.
    """
    for line, record in zip(alltext[:limit], records[:limit]):
        print(line)
        print(record)


# In[45]:

# Match any line starting with Black or White. There are no named groups
# yet, so every matching record is an empty dict.
race_pattern = re.compile(r"^(Black|White).*$")
data = parse(race_pattern)
print(data[0:3])


# In[46]:

# Name the race group and capture the surname after a tab character.
race_pattern = re.compile(r"^(?P<race>Black|White)\t(?P<surname>\w*).*$")
data = parse(race_pattern)
print(data[0:3])


# In[49]:

# Match specific titles, separated from the surname by one or more tabs.
pattern = re.compile(
    r"^(?P<race>Black|White)\t(?P<surname>\w*)\t+(?P<title>Miss|Dr|Rev|Mrs).*$"
)
data = parse(pattern)
print(data[0:3])


# Now that we have captured all of the lines that have titles, we need to deal with the fact that
# many lines do not include a title at all. So we need to make the title pattern group optional,
# using a trailing question mark.

# In[51]:

# Same as before, but the title group is optional.
pattern = re.compile(
    r"^(?P<race>Black|White)\t(?P<surname>\w*)\t+(?P<title>Miss|Dr|Rev|Mrs)?.*$"
)
data = parse(pattern)
print(data[0:3])


# From the output above we learned that our new pattern with optional title matches every line except one.
# The reason is the '\t' or tab character we are expecting between the surname and the title. In the one
# exceptional line, printed above by our parse() function, the surname and title are separated by a space.
# Instead of using the tab character, let's use a generic "whitespace" detector, which is '\s'.

# In[53]:

# Accept any whitespace (tab or space) between surname and title.
pattern = re.compile(
    r"^(?P<race>Black|White)\t(?P<surname>\w*)\s+(?P<title>Miss|Dr|Rev|Mrs)?.*$"
)
data = parse(pattern)
print(data[0:3])


# Now all the text lines are matching again and we have the titles, when they are present.
#
# The next group is the head of household's given name. The separator after the surname is
# also loosened to '\W+', i.e. any run of non-word characters.

# In[56]:

# The following cells each append one more group to this expression;
# every version is closed with ".*$" so the rest of the line is accepted.
regex = (
    r"^(?P<race>Black|White)\t(?P<surname>\w*)"
    r"\W+(?P<title>Miss|Dr|Rev|Mrs)?(?P<hohgiven>\w+)"
)
pattern = re.compile(regex + r".*$")
data = parse(pattern)
print(data[0:3])


# Sometimes there is a middle initial for the head of the household. It appears that
# the initial is only ever one whitespace character away from the head of household given name.
# We are also going to change our display output so that we can look more closely at the results.

# In[60]:

# Optional middle initial: one non-word character, then one word character.
regex += r"(\W(?P<hohmi>\w))?"
pattern = re.compile(regex + r".*$")
data = parse(pattern)
show_results(data)


# Next is often the name of a second person in the household, but not always. Sometimes the next
# word is the start of the head of household occupation. These are distinct in that names start
# with an upper case character. So our second given name pattern will ask for an upper case first character.

# In[62]:

# Optional second given name, recognised by its leading capital letter.
regex += r"\W+(?P<given2>[A-Z]{1}\w*)?"
pattern = re.compile(regex + r".*$")
data = parse(pattern)
show_results(data, limit=5)


# The second person may also optionally have a middle initial.

# In[64]:

# Optional middle initial for the second person.
regex += r"(\W(?P<mi2>\w))?"
pattern = re.compile(regex + r".*$")
data = parse(pattern)
show_results(data, limit=5)


# Now we come to the occupation, which is a series of lower case words, separated by single spaces.

# In[72]:

# Optional occupation: lower case letters and spaces only.
regex += r"\W+(?P<occupation>[a-z ]+)?"
pattern = re.compile(regex + r".*$")
data = parse(pattern)
show_results(data, limit=5)


# The next bit seems to be either "House" or "Boards" or the capitalized, often multi-word, name of a business.
# Let's add a "House", "Boards" or "Rooms" group later in the pattern. Then the workplace pattern will
# be before that and optional. (In the pattern below the "House"/"Boards"/"Rooms" group is optional too.)

# In[96]:

# Each optional field now carries its own leading separator inside the
# optional group, and the pattern must consume the line up to its end.
pattern = re.compile(
    r"^(?P<race>Black|White)\t(?P<surname>\w+)"
    r"(\W+(?P<title>Miss|Dr|Rev|Mrs))?"
    r"(\W+(?P<hohgiven>\w+))"
    r"(\W+(?P<hohmi>[A-Z]{1}))?"
    r"(\W+(?P<given2>[A-Z]{1}\w*))?"
    r"(\W+(?P<mi2>[A-Z]{1}))?"
    r"(\W+(?P<occupation>[a-z ]+))?"
    r"(\W+(?P<workplace>[A-Z0-9]{1}[A-Za-z0-9 /&-]+))?"
    r"(\W+(?P<la>House|Boards|Rooms))?"
    r"(\W+(?P<address>[A-Za-z0-9 /]+))?"
    r"$"
)
data = parse(pattern)
show_results(data, limit=5)


# Now looking at the unmatched lines, we see a number of problems remaining, each a special case
# that we need to add to the overall pattern.
#
# * Widow pattern with deceased husband in parentheses
# ```White Adams wid (Geo O)```
#
# * Widower pattern with deceased wife in parentheses.
# ```White Adkins Walter D (Leona E)```
#
# The parentheses above are unexpected in our current pattern.

# In[138]:

pattern = re.compile(
    r"^(?P<race>Black|White)\t(?P<surname>\w+)"
    # Widow marker: the parenthesised name may span several characters
    # and words, e.g. "wid (Geo O)".
    r"(\W+wid \((?P<deadhusband>[\w ]+)\))?"
    r"(\W+(?P<title>Miss|Dr|Rev|Mrs))?"
    r"(\W+(?P<hohgiven>\w+))"
    # A middle initial must be followed by a non-word character, so the
    # first letter of a longer word is not taken as an initial.
    r"(\W+(?P<hohmi>[A-Z]{1})\W)?"
    # The negative lookahead keeps the living-arrangement keywords from
    # being captured as a given name (or, below, as a workplace).
    r"(\W*(?P<given2>(?!House|Boards|Rooms)[A-Z]{1}\w*))?"
    r"(\W+(?P<mi2>[A-Z]{1})\W)?"
    r"(\W*(?P<occupation>[a-z ]+))?"
    r"(\W+(?P<workplace>(?!House|Boards|Rooms)[A-Z0-9]{1}[A-Za-z0-9- /&\']+))?"
    r"(\W+(?P<la>House|Boards|Rooms))?"
    r"(\W+(?P<address>[A-Za-z0-9 ]+))?"
    r".*$"
)
data = parse(pattern)
show_results(data, limit=5)


# In[139]:

# Make sure the output directory exists before writing the parsed records;
# lines that did not match are stored as null.
os.makedirs(output_loc, exist_ok=True)
with open(os.path.join(output_loc, 'data.json'), 'w') as outfile:
    json.dump(data, outfile)