#!/usr/bin/env python
# coding: utf-8

# # Import the English Language Model
#
# If you have not already done so, you will need to run this code to download the language model.

# In[5]:


import sys
get_ipython().system('{sys.executable} -m spacy download en_core_web_sm')


# # Defining variables

# In[1]:


## define directory path and entity type
import os

cwd = os.getcwd()
data_loc = cwd + "/data"
output_loc = cwd + "/output/"
ent_type = "PERSON"
### entity type can be "PERSON", "NORP", "ORG", "GPE", etc.
### https://spacy.io/api/annotation#named-entities


# # Imports and setup

# In[2]:


import spacy
from spacy import displacy
import os
import codecs
import csv
from collections import Counter

nlp = spacy.load('en_core_web_sm')


# # Walk the directory and collect text files

# In[3]:


allfiles = []
for root, dirs, files in os.walk(data_loc):
    for file in files:
        if file.endswith(".txt"):
            allfiles.append(os.path.join(root, file))

print('files: %d' % len(allfiles))


# In[4]:


myfile = codecs.open(allfiles[0], 'r', encoding='utf-8')
pagetext = myfile.read()
myfile.close()


# # First pass: Parse the text and recognize entities
#
# Here we apply the plain, "out of the box" spaCy English model to our text document.
# We then display the first sentence as a dependency graph and the entire document
# with highlighted entities.

# In[5]:


def parse():
    doc = nlp(pagetext)
    sentence_spans = list(doc.sents)
    displacy.render(sentence_spans[0:1], options={'compact': True}, style="dep")
    displacy.render(doc, options={'compact': True}, style="ent")


# In[6]:


parse()


# # Student Exercise
#
# Analyze the results obtained above. How accurate are the recognized entities? Can you
# point out any reasons why the "out of the box" model made certain mistakes?

# # Create Line-by-Line Sentence Boundaries
#
# Our directory text files contain one group of related words per line, but those groups
# aren't exactly sentences. Let's see if we can improve the NLP output by telling the
# pipeline that each line is a sentence of related words. The code below creates a function,
# 'set_newline_sentences', which is added to our NLP pipeline.
#
# ## Newline and Escape Characters
#
# The newline character in text-encoded files is only indirectly visible: it causes the
# character after it to jump to the next line when the file is printed or displayed in an
# editor or viewer. In programming languages you often need to create a newline character
# within a string without typing a literal line break, so we use an "escape code" to add
# the invisible character. Newline's escape code is '\n'. String escape codes in most
# programming languages start with a backslash; for instance, a tab character is created
# by placing '\t' in a string.
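# Before we use '\n' in the pipeline code below, here is a quick standalone illustration
# (not part of the original pipeline) of how escape codes behave when a string is printed.

# In[ ]:


sample = "first line\nsecond\tline"
print(sample)        # the '\n' renders as a line break and the '\t' as a tab
print(repr(sample))  # repr() shows the invisible characters as their escape codes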
# In[7]:


def set_newline_sentences(doc):
    for token in doc[:-1]:
        if token.text == "\n":
            doc[token.i + 1].is_sent_start = True
        elif doc[token.i].is_sent_start is None:
            doc[token.i].is_sent_start = False
    return doc

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe(set_newline_sentences, before="parser")


# In[8]:


parse()


# In[13]:


from spacy.pipeline import EntityRuler

## reload the model first, so the ruler is built on the same pipeline it will join
nlp = spacy.load('en_core_web_sm')

race_entities = EntityRuler(nlp)
patterns = [{"label": "RACE", "pattern": [{"LOWER": "black"}]},
            {"label": "RACE", "pattern": [{"LOWER": "white"}]}]
race_entities.add_patterns(patterns)

nlp.entity.add_label('RACE')
nlp.add_pipe(set_newline_sentences, before="parser")
nlp.add_pipe(race_entities, before="ner")


# In[14]:


parse()


# In[28]:


from spacy.tokens import Span

def lastname_follows_race_entities(doc):
    new_ents = []
    for ent in doc.ents:
        new_ents.append(ent)
        ## doc[ent.end] is the token immediately after the RACE entity, and
        ## .nbor() steps one token further; guard so we never run off the doc
        if ent.label_ == "RACE" and ent.end + 1 < len(doc):
            next_token = doc[ent.end].nbor()
            new_ent = Span(doc, next_token.i, next_token.i + 1, label="PERSON")
            new_ents.append(new_ent)
    doc.ents = new_ents
    return doc

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe(set_newline_sentences, name="newline", before="parser")
nlp.entity.add_label('RACE')
nlp.add_pipe(race_entities, name="race", before="ner")
nlp.add_pipe(lastname_follows_race_entities, name="lastname", after='race')


# In[29]:


parse()


# # Return top entities

# In[26]:


os.makedirs(output_loc, exist_ok=True)
os.chdir(output_loc)

## NOTE: filter_entlist and filter_entlist2 were never defined earlier in this
## notebook; the cell that built them appears to be missing. The lines below
## are a plausible reconstruction (an assumption, not the original code):
## filter_entlist2 collects the full text of every entity of the chosen type,
## and filter_entlist collects the individual words of those names.
doc = nlp(pagetext)
filter_entlist2 = [ent.text for ent in doc.ents if ent.label_ == ent_type]
filter_entlist = [word for name in filter_entlist2 for word in name.split()]

namecount = Counter(filter_entlist)
fullnamecount = Counter(filter_entlist2)
commonnames = [x for x in fullnamecount.most_common() if x[1] > 5]
commonall = [x for x in namecount.most_common() if x[1] > 5]

entities_table = []
for name in commonnames:
    row = [name[0], name[1]]
    entities_table.append(row)

out_path = "entities_fullnames.csv"
header = ['Name', 'Frequency']
with open(out_path, 'w', newline='', encoding='utf-8') as fo:
    csv_writer = csv.writer(fo)
    csv_writer.writerow(header)
    csv_writer.writerows(entities_table)

entities_table2 = []
for name in commonall:
    row = [name[0], name[1]]
    entities_table2.append(row)

out_path = "names_all.csv"
header = ['Name', 'Frequency']
with open(out_path, 'w', newline='', encoding='utf-8') as fo:
    csv_writer = csv.writer(fo)
    csv_writer.writerow(header)
    csv_writer.writerows(entities_table2)
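# As a quick sanity check (not part of the original notebook), the cell below reads the
# two CSV files written above back in and prints their first few rows. It assumes the
# working directory is still output_loc, which os.chdir() set in the export cell.

# In[ ]:


import csv

for path in ("entities_fullnames.csv", "names_all.csv"):
    with open(path, newline='', encoding='utf-8') as fi:
        print(path)
        for i, row in enumerate(csv.reader(fi)):
            print('  ', row)
            if i >= 5:  # header row plus the five most frequent names
                break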