#!/usr/bin/env python
# coding: utf-8

# # Parsing Text Data Patterns Line-by-Line
# 
# When a text uses natural language, i.e. normal human speech or prose, then data is embedded in
# the particular syntax and lexicon of that language. Natural Language Processing employs sophisticated
# models, trained on millions of documents, to parse natural language for meaningful information.
# 
# In the case of the Charlotte directory, the text resembles a table with rows and columns more than it
# resembles prose. So there are no sentence structures for NLP to use as hints. Instead, NLP falls back on the
# English lexicon alone to establish parts of speech and relationships. The result, in the form of named
# entity recognition, is not much data.
# 
# In fact, the lines in our text file have non-English rules that we can establish and use to extract data directly.
# For instance, each line starts with a description of the household "race" category. Then within households there
# are often members of another race and these are also labeled. After each mention of race there is usually a surname
# and then a given name, but not always. Our job is to describe and encode these rules in formal patterns.

# In[134]:


import re
import os
# Input and output directories are resolved relative to the current working
# directory.
cwd = os.getcwd()
data_loc = cwd + "/data"
output_loc = cwd + "/output"
allfiles = []

# Find all the text files anywhere under the data directory.
for root, dirs, files in os.walk(data_loc):
    for file in files:
        if file.endswith(".txt"):
            allfiles.append(os.path.join(root, file))

# os.walk() yields entries in arbitrary, filesystem-dependent order. Sort the
# paths so that line indices (and therefore the printed samples and the
# exported data.json) are the same on every run and every machine.
allfiles.sort()

print('%d files' % len(allfiles))

# Read every line of every file into one flat list. Lines keep their
# trailing newline characters.
alltext = []
for file in allfiles:
    # Explicit encoding so the result does not depend on the platform's
    # default locale encoding.
    with open(file, "r", encoding="utf-8") as a_file:
        alltext.extend(a_file)

print('%d lines of text' % len(alltext))


# In[58]:


def parse(pattern, lines=None):
    """Apply a compiled regular expression to each line of text.

    Args:
        pattern: A compiled regular expression (any object whose ``search``
            method returns a match object or ``None``).
        lines: Optional iterable of strings to parse. When omitted, the
            module-level ``alltext`` list is used, which preserves the
            original one-argument behavior.

    Returns:
        A list with one entry per input line, in input order: the match's
        ``groupdict()`` (named groups only) when the pattern matched, or
        ``None`` when it did not.

    Side effects:
        Every line that fails to match is printed so it can be inspected.
    """
    if lines is None:
        lines = alltext

    data = []
    for line in lines:
        match = pattern.search(line)
        if match is None:
            # Keep a placeholder so results stay index-aligned with the input.
            data.append(None)
            print(line)
        else:
            data.append(match.groupdict())
    return data


# In[45]:


# First pass: accept any line that begins with a race label (Black or White).
# The group is unnamed, so every matching line contributes an empty dict.
race_pattern = re.compile("^(Black|White).*$")
data = parse(pattern=race_pattern)
print(data[:3])


# In[46]:


# Capture the race label and, after a single tab, the surname.
# A raw string is used because "\w" in an ordinary string literal is an
# invalid escape sequence that Python warns about; the regex engine itself
# interprets \t as a tab character.
race_pattern = re.compile(r"^(?P<race>Black|White)\t(?P<surname>\w*).*$")
data = parse(race_pattern)
print(data[0:3])


# In[49]:


# Require one of four specific titles after the surname, separated by one or
# more tabs. Lines without a title will not match and are printed by parse().
# Raw string: avoids the invalid "\w" escape in an ordinary string literal.
pattern = re.compile(r"^(?P<race>Black|White)\t(?P<surname>\w*)\t+(?P<title>Miss|Dr|Rev|Mrs).*$")
data = parse(pattern)
print(data[0:3])


# Now that we have captured all of the lines that have titles, we need to deal with the fact that
# many lines do not include a title at all. So we need to make the title pattern group optional, by adding a question mark after the group.

# In[51]:


# Same as the previous pattern, but the title group is now optional: the "?"
# after the group lets lines without a title match too.
# Raw string: avoids the invalid "\w" escape in an ordinary string literal.
pattern = re.compile(r"^(?P<race>Black|White)\t(?P<surname>\w*)\t+(?P<title>Miss|Dr|Rev|Mrs)?.*$")
data = parse(pattern)
print(data[0:3])


# From the output above we learned that our new pattern with optional title matches every line except one.
# The reason is the '\t' or tab character we are expecting between the surname and the title. In the one
# exceptional line, printed above by our parse() function, the surname and title are separated by a space.
# Instead of using the tab character, let's use a generic "whitespace" detector, which is '\s'.

# In[53]:


# \s+ matches one or more whitespace characters (tabs or spaces). Note that
# \w is the *word character* class: using it here would take letters away
# from the end of the surname and the title would never be captured.
pattern = re.compile(r"^(?P<race>Black|White)\t(?P<surname>\w*)\s+(?P<title>Miss|Dr|Rev|Mrs)?.*$")
data = parse(pattern)
print(data[0:3])


# Now all the text lines are matching again and we have the titles, when they are present.
# 
# The next group is the head of household's given name.

# In[56]:


# Add the head of household's given name after the optional title.
# The title is wrapped together with its trailing separator,
# (?:(?P<title>...)\W+)?, so that "Mrs Mary" yields title='Mrs' and
# hohgiven='Mary'. With the title and given-name groups directly adjacent,
# the space after the title cannot be consumed, so the title is dropped and
# lands in hohgiven, and a name such as "Drew" is split into 'Dr' + 'ew'.
# Raw string: avoids invalid "\w" / "\W" escapes in an ordinary string literal.
pattern = re.compile(r"^(?P<race>Black|White)\t(?P<surname>\w*)\W+(?:(?P<title>Miss|Dr|Rev|Mrs)\W+)?(?P<hohgiven>\w+).*$")
data = parse(pattern)
print(data[0:3])


# Sometimes there is a middle initial for the head of the household. It appears that
# the initial is only ever one whitespace character away from the head of household given name.
# We are also going to change our display output so that we can look more closely at the results.

# In[60]:


# Add an optional middle initial for the head of household: one separator
# character followed by exactly one word character.
# - The trailing \b makes sure the single character is a whole token, so the
#   first letter of a longer following word is not mistaken for an initial.
# - The title carries its own trailing separator so it is captured instead of
#   being swallowed by the given-name group.
# Raw string: avoids invalid "\w" / "\W" escapes in an ordinary string literal.
pattern = re.compile(r"^(?P<race>Black|White)\t(?P<surname>\w*)\W+(?:(?P<title>Miss|Dr|Rev|Mrs)\W+)?(?P<hohgiven>\w+)(\W(?P<hohmi>\w)\b)?.*$")
data = parse(pattern)

# Show every input line next to what was extracted from it. parse() returns
# one entry per line, so the two lists pair up exactly.
for line, record in zip(alltext, data):
    print(line)
    print(record)


# Next is often the name of a second person in the household, but not always. Sometimes the next 
# word is the start of the head of household occupation. These are distinct in that names start 
# with an upper case character. So our second given name pattern will ask for an upper case first character.

# In[62]:


# Add an optional second given name, recognized by its upper-case first
# letter (occupations start with a lower-case letter).
# The title carries its own trailing separator so it is captured instead of
# being swallowed by the given-name group.
# Raw string: avoids invalid "\w" / "\W" escapes in an ordinary string literal.
pattern = re.compile(r"^(?P<race>Black|White)\t(?P<surname>\w*)\W+(?:(?P<title>Miss|Dr|Rev|Mrs)\W+)?(?P<hohgiven>\w+)(\W(?P<hohmi>\w))?\W+(?P<given2>[A-Z]{1}\w*)?.*$")
data = parse(pattern)

# Preview the first five lines. Slicing (rather than indexing 0..4) keeps
# this from raising IndexError when fewer than five lines were loaded.
for line, record in zip(alltext[:5], data[:5]):
    print(line)
    print(record)


# The second person may also optionally have a middle initial.

# In[64]:


# Add an optional middle initial for the second person.
# - The trailing \b after mi2 makes sure the single character is a whole
#   token, so the first letter of a following word (for example the
#   occupation) is not mistaken for an initial.
# - The title carries its own trailing separator so it is captured instead of
#   being swallowed by the given-name group.
# Raw string: avoids invalid "\w" / "\W" escapes in an ordinary string literal.
pattern = re.compile(r"^(?P<race>Black|White)\t(?P<surname>\w*)\W+(?:(?P<title>Miss|Dr|Rev|Mrs)\W+)?(?P<hohgiven>\w+)(\W(?P<hohmi>\w))?\W+(?P<given2>[A-Z]{1}\w*)?(\W(?P<mi2>\w)\b)?.*$")
data = parse(pattern)

# Preview the first five lines. Slicing (rather than indexing 0..4) keeps
# this from raising IndexError when fewer than five lines were loaded.
for line, record in zip(alltext[:5], data[:5]):
    print(line)
    print(record)


# Now we come to the occupation, which is a series of lower case words, separated by single spaces.

# In[72]:


# Add the occupation: a run of lower-case words separated by single spaces.
# The title carries its own trailing separator so it is captured instead of
# being swallowed by the given-name group.
# Raw string: avoids invalid "\w" / "\W" escapes in an ordinary string literal.
# NOTE(review): the \W+ separators before given2 and before occupation are
# both mandatory here, so a line with an initial but no second name (e.g.
# "Walter D laborer") appears to shift the initial into given2. The next
# cell's pattern attaches each separator to its own optional group instead.
pattern = re.compile(r"^(?P<race>Black|White)\t(?P<surname>\w*)\W+(?:(?P<title>Miss|Dr|Rev|Mrs)\W+)?(?P<hohgiven>\w+)(\W(?P<hohmi>\w))?\W+(?P<given2>[A-Z]{1}\w*)?(\W(?P<mi2>\w))?\W+(?P<occupation>[a-z ]+)?.*$")
data = parse(pattern)

# Preview the first five lines. Slicing (rather than indexing 0..4) keeps
# this from raising IndexError when fewer than five lines were loaded.
for line, record in zip(alltext[:5], data[:5]):
    print(line)
    print(record)


# The next bit seems to be either "House" or "Boards" or the capitalized, often multi-word, name of a business.
# Let's add a "House", "Boards" or "Rooms" group later in the pattern. Then the workplace pattern will
# come before that group and be optional.

# In[96]:


# Full-line pattern: every optional field now carries its own leading
# separator inside its group, and the pattern is anchored at both ends, so a
# line only matches when the groups account for all of it.
# The regex is unchanged from a single-line spelling; it is written as
# adjacent raw-string pieces (one field per piece) for readability and to
# avoid invalid "\w" / "\W" escapes in an ordinary string literal.
pattern = re.compile(
    r"^(?P<race>Black|White)"
    r"\t(?P<surname>\w+)"
    r"(\W+(?P<title>Miss|Dr|Rev|Mrs))?"
    r"(\W+(?P<hohgiven>\w+))"                       # the only required name field
    r"(\W+(?P<hohmi>[A-Z]{1}))?"
    r"(\W+(?P<given2>[A-Z]{1}\w*))?"
    r"(\W+(?P<mi2>[A-Z]{1}))?"
    r"(\W+(?P<occupation>[a-z ]+))?"                # lower-case words
    r"(\W+(?P<workplace>[A-Z0-9]{1}[A-Za-z0-9 /&-]+))?"
    r"(\W+(?P<la>House|Boards|Rooms))?"             # optional in this pattern
    r"(\W+(?P<address>[A-Za-z0-9 /]+))?"
    r"$"
)
data = parse(pattern)

# Preview the first five lines. Slicing (rather than indexing 0..4) keeps
# this from raising IndexError when fewer than five lines were loaded.
for line, record in zip(alltext[:5], data[:5]):
    print(line)
    print(record)


# Now looking at the unmatched lines, we see a number of problems remaining, each a special case
# that we need to add to the overall pattern.
# 
# * Widow pattern with deceased husband in parentheses
# ```White	Adams		wid (Geo O)```
# 
# * Widower pattern with deceased wife in parentheses.
# ```White	Adkins Walter D (Leona E)```
# 
# The parentheses above are unexpected in our current pattern.

# In[138]:


# Final pattern, written as adjacent raw-string pieces (one field per piece).
# Changes relative to the previous cell's pattern:
# - an optional "wid (<name>)" group for widows; the name inside the
#   parentheses is one OR MORE word characters/spaces ("[\w ]+"), since a
#   single-character class can never match a name such as "Geo O";
# - initials must be followed by a non-word character;
# - given2 and workplace may not start with House/Boards/Rooms (negative
#   lookahead), so those keywords are left for the "la" group;
# - a trailing ".*" lets the line match even when the tail is not understood.
# NOTE(review): there is no dedicated group for the widower's "(Leona E)"
# parenthetical; confirm how those lines should be represented.
pattern = re.compile(
    r"^(?P<race>Black|White)"
    r"\t(?P<surname>\w+)"
    r"(\W+wid \((?P<deadhusband>[\w ]+)\))?"
    r"(\W+(?P<title>Miss|Dr|Rev|Mrs))?"
    r"(\W+(?P<hohgiven>\w+))"
    r"(\W+(?P<hohmi>[A-Z]{1})\W)?"
    r"(\W*(?P<given2>(?!House|Boards|Rooms)[A-Z]{1}\w*))?"
    r"(\W+(?P<mi2>[A-Z]{1})\W)?"
    r"(\W*(?P<occupation>[a-z ]+))?"
    r"(\W+(?P<workplace>(?!House|Boards|Rooms)[A-Z0-9]{1}[A-Za-z0-9- /&\']+))?"
    r"(\W+(?P<la>House|Boards|Rooms))?"
    r"(\W+(?P<address>[A-Za-z0-9 ]+))?"
    r".*$"
)
data = parse(pattern)

# Preview the first five lines. Slicing (rather than indexing 0..4) keeps
# this from raising IndexError when fewer than five lines were loaded.
for line, record in zip(alltext[:5], data[:5]):
    print(line)
    print(record)


# In[139]:


import json

# Write the parsed records (one dict or null per input line) as JSON.
# Create the output directory first: open() raises FileNotFoundError when
# the directory is missing.
os.makedirs(output_loc, exist_ok=True)
with open(os.path.join(output_loc, 'data.json'), 'w') as outfile:
    json.dump(data, outfile)


# In[ ]:


# In[ ]: