If you have not already done so, run the code below to download the spaCy English language model.
import sys
!{sys.executable} -m spacy download en_core_web_sm
Requirement already satisfied: en_core_web_sm==2.2.5 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz#egg=en_core_web_sm==2.2.5 in /home/jansen/.local/share/virtualenvs/bdarchives-nlp-worlW0cl/lib/python3.6/site-packages (2.2.5)
✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')
## define directory path and entity type
import os

cwd = os.getcwd()
data_loc = os.path.join(cwd, "data")      # directory containing the .txt files
output_loc = os.path.join(cwd, "output")  # directory for the CSV output
ent_type = "PERSON"
### entity type can be "PERSON", "NORP", "ORG", "GPE", etc.
### https://spacy.io/api/annotation#named-entities
import spacy
from spacy import displacy
import os
import string
import codecs
import csv          # needed below to write the entity counts to CSV files
import subprocess
from collections import Counter
nlp = spacy.load('en_core_web_sm')
# walk the data directory and collect every .txt file
allfiles = []
for root, dirs, files in os.walk(data_loc):
    for file in files:
        if file.endswith(".txt"):
            allfiles.append(os.path.join(root, file))
print('files: %d' % len(allfiles))
files: 4
# read the first file into a single string
with codecs.open(allfiles[0], 'r', encoding='utf-8') as myfile:
    pagetext = myfile.read()
Here we apply the plain, "out of the box" spaCy English model to our text document. We then display the first sentence as a dependency graph, and the entire document with its named entities highlighted.
def parse():
    doc = nlp(pagetext)
    sentence_spans = list(doc.sents)
    # dependency graph of the first sentence
    displacy.render(sentence_spans[0:1], options={'compact': True}, style="dep")
    # the whole document with entities highlighted
    displacy.render(doc, options={'compact': True}, style="ent")
parse()
Analyze the results obtained above. How accurate are the recognized entities? Can you point out any reasons why the "out of the box" model made certain mistakes?
Our directory text files contain one group of related words per line, but they aren't exactly sentences. Let's see if we can improve the NLP output by telling the pipeline that each line is a sentence of related words. The code below creates a function 'set_newline_sentences', which is added to our NLP pipeline.
The newline character in text-encoded files is only indirectly visible: it makes the character after it jump to the next line when the file is printed or displayed in an editor or viewer. In programming languages you often need to create a newline character within a string without typing a literal line break. Instead we use an "escape code" to add the invisible character; newline's escape code is '\n'. String escape codes in most programming languages start with a backslash ('\'); for instance, a tab character is created by placing '\t' in a string.
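As a quick illustration (hypothetical strings, just to show the escape codes in action):
# '\n' breaks the string across two lines when printed; '\t' inserts a tab
print("first line\nsecond line")
print("col1\tcol2")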
def set_newline_sentences(doc):
    for token in doc[:-1]:
        if token.text == "\n":
            # the token after a newline starts a new "sentence"
            doc[token.i+1].is_sent_start = True
        elif token.is_sent_start is None:
            # prevent the parser from adding any other sentence breaks
            token.is_sent_start = False
    return doc
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe(set_newline_sentences, before="parser")
parse()
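A quick way to check the custom boundaries is to run the pipeline on a made-up two-line string (your own files will differ); each line should come back as its own sentence:
# made-up example: two records separated by a newline
test_doc = nlp("Smith John laborer\nJones Ann cook")
print([sent.text for sent in test_doc.sents])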
Next we use spaCy's EntityRuler to add a rule-based component that labels the words "black" and "white" with a new RACE entity type.
from spacy.pipeline import EntityRuler

nlp = spacy.load('en_core_web_sm')
nlp.entity.add_label('RACE')
nlp.add_pipe(set_newline_sentences, before="parser")

# build the ruler against the freshly loaded pipeline
race_entities = EntityRuler(nlp)
patterns = [{"label": "RACE", "pattern": [{"LOWER": "black"}]},
            {"label": "RACE", "pattern": [{"LOWER": "white"}]}]
race_entities.add_patterns(patterns)
nlp.add_pipe(race_entities, before="ner")
parse()
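To sanity-check the ruler on its own, you can run the pipeline over a hypothetical one-line record; ('black', 'RACE') should appear among the entities:
# hypothetical record, just to confirm the ruler fires
test_doc = nlp("Johnson Mary black laundress")
print([(ent.text, ent.label_) for ent in test_doc.ents])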
Finally, we add one more custom component: on the assumption that a last name directly follows each race marker in these records, it promotes the token immediately after every RACE entity to a PERSON entity.
from spacy.tokens import Span

def lastname_follows_race_entities(doc):
    new_ents = []
    for ent in doc.ents:
        new_ents.append(ent)
        if ent.label_ == "RACE" and ent.end < len(doc):
            # doc[ent.end] is the token immediately after the RACE entity;
            # we assume it is the person's last name
            next_token = doc[ent.end]
            new_ent = Span(doc, next_token.i, next_token.i + 1, label="PERSON")
            new_ents.append(new_ent)
    doc.ents = new_ents
    return doc
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe(set_newline_sentences, name="newline", before="parser")
nlp.entity.add_label('RACE')
race_entities = EntityRuler(nlp)   # rebuild the ruler for the reloaded pipeline
race_entities.add_patterns(patterns)
nlp.add_pipe(race_entities, name="race", before="ner")
nlp.add_pipe(lastname_follows_race_entities, name="lastname", after="race")
parse()
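You can confirm where the custom components ended up in the pipeline:
# list the pipeline components in execution order
print(nlp.pipe_names)
# e.g. ['tagger', 'newline', 'parser', 'race', 'lastname', 'ner']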
os.makedirs(output_loc, exist_ok=True)   # don't fail if the directory already exists
os.chdir(output_loc)
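The lists filter_entlist and filter_entlist2 used below come from a filtering step that is not shown here. What follows is a minimal sketch of one way to build them, assuming filter_entlist holds a (text, label) tuple for every entity of the chosen type and filter_entlist2 only the multi-token (full-name) entities; adjust it to match your own filtering step.
# a minimal sketch, not the notebook's original filtering step
filter_entlist = []
filter_entlist2 = []
for path in allfiles:
    with codecs.open(path, 'r', encoding='utf-8') as f:
        doc = nlp(f.read())
    for ent in doc.ents:
        if ent.label_ == ent_type:
            filter_entlist.append((ent.text, ent.label_))
            if len(ent) > 1:   # more than one token: treat as a full name
                filter_entlist2.append((ent.text, ent.label_))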
namecount = Counter(filter_entlist)        # counts for all recognized names
fullnamecount = Counter(filter_entlist2)   # counts for multi-token full names
# keep only names that occur more than five times
commonnames = [x for x in fullnamecount.most_common() if x[1] > 5]
commonall = [x for x in namecount.most_common() if x[1] > 5]
entities_table = []
for name in commonnames:
    # name is a ((text, label), count) pair from most_common()
    entities_table.append([name[0][0], name[1]])
out_path = "entities_fullnames.csv"
header = ['Name', 'Frequency']
# open with an explicit UTF-8 encoding; Python 3's csv module expects str, not bytes
with open(out_path, 'w', encoding='utf-8', newline='') as fo:
    csv_writer = csv.writer(fo)
    csv_writer.writerow(header)
    csv_writer.writerows(entities_table)
entities_table2 = []
for name in commonall:
    entities_table2.append([name[0][0], name[1]])
out_path = "names_all.csv"
header = ['Name', 'Frequency']
with open(out_path, 'w', encoding='utf-8', newline='') as fo:
    csv_writer = csv.writer(fo)
    csv_writer.writerow(header)
    csv_writer.writerows(entities_table2)
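As an optional sanity check, read one of the files back and print the first few rows:
# optional: confirm the CSV was written as expected
with open("names_all.csv", 'r', encoding='utf-8', newline='') as fi:
    for i, row in enumerate(csv.reader(fi)):
        print(row)
        if i >= 3:
            break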