#!/usr/bin/env python
# coding: utf-8

# # Import the English Language Model
#
# If you have not already done so, you will need to run this code to download the language model.

# In[5]:


import sys
get_ipython().system('{sys.executable} -m spacy download en_core_web_sm')


# # Defining variables

# In[1]:


## define directory path and entity type
import os

cwd = os.getcwd()
data_loc = cwd + "/data"
output_loc = cwd + "/output/"
ent_type = "PERSON"
### entity type can be "PERSON", "NORP", "ORG", "GPE", etc.
### https://spacy.io/api/annotation#named-entities


# # Imports and setup

# In[2]:


import spacy
from spacy import displacy
import os
import codecs
import csv
from collections import Counter

nlp = spacy.load('en_core_web_sm')


# # Walk the directory and collect text files

# In[3]:


allfiles = []
for root, dirs, files in os.walk(data_loc):
    for file in files:
        if file.endswith(".txt"):
            allfiles.append(os.path.join(root, file))

print('files: %d' % len(allfiles))


# In[4]:


myfile = codecs.open(allfiles[0], 'r', encoding='utf-8')
pagetext = myfile.read()
myfile.close()


# # First pass: Parse the text and recognize entities
#
# Here we apply the plain, "out of the box" spaCy English model to our text document.
# We then display the first sentence as a dependency graph and the entire document
# with highlighted entities.

# In[5]:


def parse():
    doc = nlp(pagetext)
    sentence_spans = list(doc.sents)
    displacy.render(sentence_spans[0:1], options={'compact': True}, style="dep")
    displacy.render(doc, options={'compact': True}, style="ent")


# In[6]:


parse()


# # Student Exercise
#
# Analyze the results obtained above. How accurate are the recognized entities? Can you
# point out any reasons why the "out of the box" model made certain mistakes?

# # Create Line-by-Line Sentence Boundaries
#
# Our directory text files contain one group of related words per line, but those groups
# aren't exactly sentences. Let's see if we can improve the NLP output by telling the
# pipeline that each line is a sentence of related words. The code below creates a function,
# 'set_newline_sentences', which is added to our NLP pipeline.
#
# ## Newline and Escape Characters
#
# The newline character in text-encoded files is only indirectly visible: it causes the
# character after it to jump to the next line when the file is printed or displayed in an
# editor or viewer. In programming languages you often need to create a newline character
# within a string without typing a literal line break, so we use an "escape code" to add
# the invisible character. Newline's escape code is '\n'. String escape codes in most
# programming languages start with a backslash; for instance, a tab character is created
# by placing '\t' in a string.
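# Before we use '\n' in the pipeline code below, here is a quick standalone illustration
# (not part of the original pipeline) of how escape codes behave when a string is printed.

# In[ ]:


sample = "first line\nsecond\tline"
print(sample)        # the '\n' renders as a line break and the '\t' as a tab
print(repr(sample))  # repr() shows the invisible characters as their escape codes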
# In[7]:


def set_newline_sentences(doc):
    for token in doc[:-1]:
        if token.text == "\n":
            doc[token.i + 1].is_sent_start = True
        elif doc[token.i].is_sent_start is None:
            doc[token.i].is_sent_start = False
    return doc

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe(set_newline_sentences, before="parser")


# In[8]:


parse()


# In[13]:


from spacy.pipeline import EntityRuler

## reload the model first, so the ruler is built on the same pipeline it will join
nlp = spacy.load('en_core_web_sm')

race_entities = EntityRuler(nlp)
patterns = [{"label": "RACE", "pattern": [{"LOWER": "black"}]},
            {"label": "RACE", "pattern": [{"LOWER": "white"}]}]
race_entities.add_patterns(patterns)

nlp.entity.add_label('RACE')
nlp.add_pipe(set_newline_sentences, before="parser")
nlp.add_pipe(race_entities, before="ner")


# In[14]:


parse()


# In[28]:


from spacy.tokens import Span

def lastname_follows_race_entities(doc):
    new_ents = []
    for ent in doc.ents:
        new_ents.append(ent)
        ## doc[ent.end] is the token immediately after the RACE entity, and
        ## .nbor() steps one token further; guard so we never run off the doc
        if ent.label_ == "RACE" and ent.end + 1 < len(doc):
            next_token = doc[ent.end].nbor()
            new_ent = Span(doc, next_token.i, next_token.i + 1, label="PERSON")
            new_ents.append(new_ent)
    doc.ents = new_ents
    return doc

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe(set_newline_sentences, name="newline", before="parser")
nlp.entity.add_label('RACE')
nlp.add_pipe(race_entities, name="race", before="ner")
nlp.add_pipe(lastname_follows_race_entities, name="lastname", after='race')


# In[29]:


parse()


# # Return top entities

# In[26]:


os.makedirs(output_loc, exist_ok=True)
os.chdir(output_loc)

## NOTE: filter_entlist and filter_entlist2 were never defined earlier in this
## notebook; the cell that built them appears to be missing. The lines below
## are a plausible reconstruction (an assumption, not the original code):
## filter_entlist2 collects the full text of every entity of the chosen type,
## and filter_entlist collects the individual words of those names.
doc = nlp(pagetext)
filter_entlist2 = [ent.text for ent in doc.ents if ent.label_ == ent_type]
filter_entlist = [word for name in filter_entlist2 for word in name.split()]

namecount = Counter(filter_entlist)
fullnamecount = Counter(filter_entlist2)
commonnames = [x for x in fullnamecount.most_common() if x[1] > 5]
commonall = [x for x in namecount.most_common() if x[1] > 5]

entities_table = []
for name in commonnames:
    row = [name[0], name[1]]
    entities_table.append(row)

out_path = "entities_fullnames.csv"
header = ['Name', 'Frequency']
with open(out_path, 'w', newline='', encoding='utf-8') as fo:
    csv_writer = csv.writer(fo)
    csv_writer.writerow(header)
    csv_writer.writerows(entities_table)

entities_table2 = []
for name in commonall:
    row = [name[0], name[1]]
    entities_table2.append(row)

out_path = "names_all.csv"
header = ['Name', 'Frequency']
with open(out_path, 'w', newline='', encoding='utf-8') as fo:
    csv_writer = csv.writer(fo)
    csv_writer.writerow(header)
    csv_writer.writerows(entities_table2)
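# As a quick sanity check (not part of the original notebook), the cell below reads the
# two CSV files written above back in and prints their first few rows. It assumes the
# working directory is still output_loc, which os.chdir() set in the export cell.

# In[ ]:


import csv

for path in ("entities_fullnames.csv", "names_all.csv"):
    with open(path, newline='', encoding='utf-8') as fi:
        print(path)
        for i, row in enumerate(csv.reader(fi)):
            print('  ', row)
            if i >= 5:  # header row plus the five most frequent names
                break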