Processing texts using spaCy#
This section introduces you to basic tasks in natural language processing and how they can be performed using a Python library named spaCy.
After reading this section, you should:
know some of the key concepts and tasks in natural language processing
know how to perform simple natural language processing tasks using the spaCy library
Getting started#
To get started, we import spaCy, one of the many libraries available for natural language processing in Python.
# Import the spaCy library
import spacy
To perform natural language processing tasks for a given language, we must load a language model that has been trained to perform these tasks for the language in question.
spaCy supports many languages, but provides pre-trained language models for fewer languages.
These language models come in different sizes and flavours. We will explore these models and their differences later.
To get acquainted with basic tasks in natural language processing, we will start with a small language model for the English language.
Language models are loaded using spaCy’s load() function, which takes the name of the model as input.
# Load the small language model for English and assign it to the variable 'nlp'
nlp = spacy.load('en_core_web_sm')
# Call the variable to examine the object
nlp
Calling the variable nlp returns a spaCy Language object that contains a language model for the English language.
Essentially, spaCy’s Language object is a pipeline that uses the language model to perform a number of natural language processing tasks. We will return to these tasks shortly.
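To get a first impression of what this pipeline involves, we can list the names of its processing components. The following is a minimal sketch that relies on the pipe_names attribute of the Language object; the exact component names may vary between model versions.
# List the names of the processing components in the pipeline,
# e.g. the tagger, parser and named entity recogniser
nlp.pipe_names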
What is a language model?#
Most modern language models are based on statistics instead of human-defined rules.
Statistical language models are based on probabilities, e.g.:
What is the probability of a given sentence occurring in a language?
How likely is a given word to occur in a sequence of words?
Consider the following sentences from the news articles in the previous sections:
From financial exchanges in HIDDEN Manhattan to cloakrooms in Washington and homeless shelters in California, unfamiliar rituals were the order of the day.
Security precautions were being taken around the HIDDEN as the deadline for Iraq to withdraw from Kuwait neared.
You can probably make informed guesses about the HIDDEN words based on your knowledge of the English language and the world in general.
Similarly, creating a statistical language model involves observing the occurrence of words in large corpora and calculating their probabilities of occurrence in a given context. The language model can then be trained by making predictions and adjusting the model based on the errors made during prediction.
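As a rough illustration of this idea (and not of how spaCy’s models are actually trained), the sketch below counts which words follow the word 'the' in a tiny invented corpus and turns the counts into probabilities.
# A toy illustration of a statistical language model: estimate the
# probability of words that follow 'the' in a tiny invented corpus
from collections import Counter

toy_corpus = "the cat sat on the mat and the dog sat on the cat".split()

# Count the words that immediately follow 'the'
followers = Counter(next_word for word, next_word in zip(toy_corpus, toy_corpus[1:]) if word == 'the')

# Convert the counts into probabilities
total = sum(followers.values())

{word: count / total for word, count in followers.items()}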
How are language models trained?#
The small language model for English, for instance, is trained on a corpus called OntoNotes 5.0, which features texts from different genres such as newswire text, broadcast news, broadcast and telephone conversations and blogs.
This allows the corpus to cover linguistic variation in both written and spoken English.
The OntoNotes 5.0 corpus consists of more than just plain text: the annotations include part-of-speech tags, syntactic dependencies and co-references between words.
This allows modelling not just the occurrence of particular words or their sequences, but their grammatical features as well.
Performing basic NLP tasks using spaCy#
To process text using the Language object containing the language model for English, we simply call the Language object nlp on some text.
Let’s begin by defining a simple test sentence, a Python string object that is stored under the variable text.
As usual, we can print out the contents by calling the variable.
# Assign an example sentence to the variable 'text'
text = "The Federal Bureau of Investigation has been ordered to track down as many as 3,000 Iraqis in this country whose visas have expired, the Justice Department said yesterday."
# Call the variable to examine the output
text
Passing the variable text to the Language object nlp returns a spaCy Doc object, short for document.
In natural language processing, longer pieces of text are commonly referred to as documents, although in this case our document consists of a single sentence.
This object contains both the input text stored under text and the results of natural language processing using spaCy.
# Feed the string object under 'text' to the Language object under 'nlp'
# Store the result under the variable 'doc'
doc = nlp(text)
The Doc object is now stored under the variable doc.
# Call the variable to examine the object
doc
Calling the variable doc returns the contents of the object.
Although the output resembles that of a Python string, the Doc object contains a wealth of information about its linguistic structure, which spaCy generated by passing the text through the NLP pipeline.
We will now examine the tasks that were performed under the hood after we provided the input sentence to the language model.
Tokenization#
What takes place first is a task known as tokenization, which breaks the text down into analytical units in need of further processing.
In most cases, a token corresponds to a word separated by whitespace, but punctuation marks are also treated as independent tokens. Because computers treat words as sequences of characters, assigning punctuation marks to their own tokens prevents trailing punctuation from attaching to the words that precede them.
The diagram below outlines the tasks that spaCy can perform after a text has been tokenised, such as part-of-speech tagging, syntactic parsing and named entity recognition.
A spaCy Doc object consists of a sequence of Token objects, which store the results of various natural language processing tasks.
Let’s print out each Token object stored in the Doc object doc.
# Loop over items in the Doc object, using the variable 'token' to refer to items in the list
for token in doc:
# Print each token
print(token)
The output shows one Token per line. As expected, punctuation marks such as ‘.’ and ‘,’ constitute their own Tokens.
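To see why tokenization involves more than splitting on whitespace, we can compare the spaCy Tokens with the result of Python’s plain split() method, which leaves the final full stop attached to the word preceding it.
# Compare naive whitespace splitting with spaCy's tokenization:
# str.split() leaves the full stop attached to the final word ...
print(text.split()[-3:])

# ... whereas spaCy assigns the punctuation mark to a Token of its own
print([token.text for token in doc][-3:])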
Part-of-speech tagging#
Part-of-speech (POS) tagging is the task of determining the word class of a token. This is crucial for disambiguation, because different parts of speech may have similar forms.
Consider the example: The sailor dogs the hatch.
The present tense of the verb dog (to fasten or secure something) is precisely the same as the plural form of the noun dog: dogs.
To identify the correct word class, we must examine the context in which the word appears.
spaCy provides two types of part-of-speech tags, coarse and fine-grained, which are stored under the attributes pos_ and tag_, respectively.
We can access the attributes of a Python object by inserting the attribute after the object and separating them with a full stop, e.g. token.pos_.
To access the results of POS tagging, let’s loop over the Doc object doc and print each Token and its coarse and fine-grained part-of-speech tags.
# Loop over items in the Doc object, using the variable 'token' to refer to items in the list
for token in doc:
# Print the token and the POS tags
print(token, token.pos_, token.tag_)
The coarse part-of-speech tags available under the pos_ attribute are based on the Universal Dependencies tag set.
The fine-grained part-of-speech tags under tag_, in turn, are based on the OntoNotes 5.0 corpus introduced above.
In contrast to coarse part-of-speech tags, the fine-grained tags also encode grammatical information. The tags for verbs, for example, are distinguished by aspect and tense.
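Returning to the earlier example about the sailor, we can check how the model handles the ambiguous form dogs in practice. Note that the output depends on the model: a small model is not guaranteed to resolve the ambiguity correctly.
# Process the example sentence and print the coarse and fine-grained
# part-of-speech tags; in this context, 'dogs' should be tagged as a verb
for token in nlp("The sailor dogs the hatch."):
    print(token, token.pos_, token.tag_)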
Morphological analysis#
Morphemes constitute the smallest grammatical units that carry meaning. Two types of morphemes are generally recognised: free morphemes, which consist of words that can stand on their own, and bound morphemes, which inflect other morphemes. For the English language, bound morphemes include suffixes such as -s, which is used to indicate the plural form of a noun.
Put differently, morphemes shape the external form of a word, and these forms are associated with given grammatical functions.
spaCy performs morphological analysis automatically and stores the result under the morph attribute of a Token object.
# Loop over items in the Doc object, using the variable 'token' to refer to items in the list
for token in doc:
# Print the token and the results of morphological analysis
print(token, token.morph)
As the output shows, not all Tokens have morphological information, because some consist of a free morpheme alone.
To retrieve morphological information from a Token object, we must use the get() method of the morph attribute.
We can use brackets [] to access items in the Doc object.
The following line retrieves morphological information about aspect for the Token at index 22 in the Doc object.
# Retrieve morphological information on aspect for the Token at index 22 in the Doc object
doc[22].morph.get('Aspect')
This returns a list with a single string item Perf, which refers to the perfective aspect.
What if we attempt to retrieve a morphological feature that a Token does not have?
Let’s attempt to retrieve information on aspect for the Token at index 21 in the Doc object.
# Retrieve morphological information on aspect for the Token at index 21 in the Doc object
doc[21].morph.get('Aspect')
This returns an empty list, as indicated by the brackets [] with nothing between them.
To retrieve all the morphological information available for a given Token, the best solution is to use the to_dict() method of the morph attribute.
This returns a dictionary, a Python data structure consisting of key and value pairs.
# Retrieve morphological information for the Token at index 21 in the Doc object
# Use the to_dict() method to cast the result into a dictionary
doc[21].morph.to_dict()
A Python dictionary is marked by curly brackets { }. Each key/value pair is separated by a colon :. In this case, both keys and values consist of string objects.
The value stored under a key may be accessed by placing the key name in brackets [] right after the name of the dictionary, as shown below.
# Assign morphological information to the dictionary 'morph_dict'
morph_dict = doc[21].morph.to_dict()
# Get the value corresponding to the key 'Mood'
morph_dict['Mood']
Dictionaries are a powerful data structure in Python, which we will frequently use for storing information.
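Note that accessing a key that does not exist in the dictionary, such as Aspect for this Token, would raise a KeyError. The get() method of a dictionary provides a safer alternative, because it returns a default value instead.
# Accessing a missing key with brackets would raise a KeyError;
# the get() method returns a default value instead
morph_dict.get('Aspect', 'not available')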
Syntactic parsing#
Syntactic parsing (or dependency parsing) is the task of defining syntactic dependencies that hold between tokens.
The syntactic dependencies are available under the dep_ attribute of a Token object.
Let’s print out the syntactic dependencies for each Token in the Doc object.
# Loop over items in the Doc object, using the variable 'token' to refer to items in the list
for token in doc:
# Print the token and its dependency tag
print(token, token.dep_)
Unlike part-of-speech tags that are associated with a single Token, dependency tags indicate a relation that holds between two Tokens.
To better understand the syntactic relations captured by dependency parsing, let’s use some of the additional attributes available for each Token:
i: the position of the Token in the Doc
token: the Token itself
dep_: a tag for the syntactic relation
head and i: the Token that governs the current Token and its index
This illustrates how Python attributes can be used in a flexible manner: the attribute head points to another Token, which naturally has the attribute i that contains its index or position in the Doc. We can combine these two attributes to retrieve this information for any token by referring to .head.i.
# Loop over items in the Doc object, using the variable 'token' to refer to items in the list
for token in doc:
# Print the index of the current token, the token itself, its dependency tag, the index of its head and the head itself
print(token.i, token, token.dep_, token.head.i, token.head)
Although the output above helps to clarify the syntactic dependencies between tokens, they are generally much easier to perceive using diagrams.
spaCy provides a tool for visualising dependencies. This component of the spaCy library, displacy, can be imported using the following command.
from spacy import displacy
The displacy module has a function named render(), which takes a Doc object as input.
To draw a dependency tree, we provide the Doc object doc to the render() function with two arguments:
style: The value dep instructs displacy to draw a visualisation for syntactic dependencies.
options: This argument takes a Python dictionary as input. We provide a dictionary with the key compact and the Boolean value True to instruct displacy to draw a compact tree diagram. Additional options for formatting the visualisation can be found in the spaCy documentation.
displacy.render(doc, style='dep', options={'compact': True})
The syntactic dependencies are visualised using lines that lead from the head Token to the Token governed by that head.
The dependency tags are based on Universal Dependencies, a framework for describing morphological and syntactic features across languages (for a theoretical discussion of Universal Dependencies, see de Marneffe et al. 2021).
If you don’t know what a particular tag means, spaCy provides a function for explaining the tags, explain(), which takes a tag as input (note that the tags are case-sensitive).
spacy.explain('pobj')
Finally, if you wonder about the underscores _ in the attribute names: spaCy encodes all strings by mapping them to hash values (a numerical representation) for computational efficiency.
Let’s print out the first Token in the Doc, doc[0], and its dependencies to examine how this works.
print(doc[0], doc[0].dep, doc[0].dep_)
As you can see, the hash value 415 is reserved for the tag corresponding to a determiner (det).
If you want human-readable output for dependency parsing and spaCy returns sequences of numbers, then you most likely forgot to add the underscore to the attribute name.
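Conversely, a hash value can be mapped back to a human-readable string through the string store in the model’s vocabulary, which is available under nlp.vocab.strings.
# Map the hash value of the dependency tag back to a human-readable string
nlp.vocab.strings[doc[0].dep]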
Sentence segmentation#
spaCy also segments Doc objects into sentences. This task is known as sentence segmentation.
Sentence segmentation imposes additional structure on larger texts. By determining the boundaries of a sentence, we can constrain tasks such as dependency parsing to individual sentences.
spaCy provides access to the results of sentence segmentation via the sents attribute of a Doc object.
Let’s loop over the sentences contained in the Doc object doc and count them using Python’s enumerate() function.
Using the enumerate() function returns a count that increases with each item in the loop.
We assign this count to the variable number, whereas each sentence is stored under sent. We then print out both at the same time using the print() function.
# Loop over sentences in the Doc object and count them using enumerate()
for number, sent in enumerate(doc.sents):
# Print the sentence number and the sentence
print(number, sent)
This only returns a single sentence, but the Doc object could easily hold a longer text with multiple sentences, such as an entire newspaper article.
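To see sentence segmentation at work on a longer input, we can pass a text with several sentences through the pipeline. The two-sentence example below is made up for the purpose of illustration.
# Feed a made-up example with two sentences to the language model
doc_long = nlp("The deadline passed at midnight. Officials said no decision had been made.")

# Loop over the sentences and print each one together with its number
for number, sent in enumerate(doc_long.sents):
    print(number, sent)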
Lemmatization#
A lemma is the base form of a word. Keep in mind that unless explicitly instructed, computers cannot tell the difference between singular and plural forms of words, but treat them as distinct tokens, because their forms differ.
If one wants to count the occurrences of words, for instance, a process known as lemmatization is needed to group together the different forms of the same word.
Lemmas are available for each Token under the attribute lemma_.
# Loop over items in the Doc object, using the variable 'token' to refer to items in the list
for token in doc:
# Print the token and its lemma
print(token, token.lemma_)
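To illustrate how lemmatization supports counting words, the sketch below uses Python’s Counter to count lemmas rather than raw token forms, so that different inflected forms of the same word are grouped under a single entry.
# Import the Counter class for counting items
from collections import Counter

# Count the occurrences of each lemma in the Doc object
lemma_counts = Counter(token.lemma_ for token in doc)

# Print the five most common lemmas and their counts
lemma_counts.most_common(5)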
Named entity recognition (NER)#
Named entity recognition (NER) is the task of recognising and classifying entities named in a text.
spaCy can recognise the named entities annotated in the OntoNotes 5 corpus, such as persons, geographic locations and products, to name but a few examples.
We can use the Doc object’s .ents attribute to get the named entities.
doc.ents
This returns a tuple with the named entities.
Each item in the tuple is a spaCy Span object. Span objects can consist of multiple Token objects, as many named entities span multiple Tokens.
The named entities and their types are stored under the attributes .text and .label_ of each Span object.
Let’s loop over the Span objects in the tuple and print out both attributes.
# Loop over the named entities in the Doc object
for ent in doc.ents:
# Print the named entity and its label
print(ent.text, ent.label_)
As you can see, the majority of named entities identified in the Doc consist of multiple Tokens, which is why they are represented as Span objects.
We can verify this by accessing the first named entity under doc.ents, which can be found at position 0, because Python starts counting from zero, and feeding this object to Python’s type() function.
# Check the type of the object used to store named entities
type(doc.ents[0])
spaCy Span objects contain several useful attributes.
Most importantly, the attributes start and end return the indices of the Tokens that determine where the Span starts and ends in the Doc object.
We can examine this in greater detail by printing out the start and end attributes for the first named entity in the document.
# Print the named entity and indices of its start and end Tokens
print(doc.ents[0], doc.ents[0].start, doc.ents[0].end)
The named entity starts at index 0 and ends at index 5 in the Doc object.
If we retrieve the sixth Token in the Doc object (at index 5), we will see that this corresponds to the Token “has”.
doc[5]
This shows that the index returned by the end attribute does not correspond to the last Token in the Span that contains the named entity, but returns the index of the first Token following the Span.
Let’s examine this by looping over the slice of the Doc object that corresponds to the first named entity.
# Loop over a slice of the Doc object that covers the first named entity
for token in doc[doc.ents[0].start: doc.ents[0].end]:
# Print the Token and its index
print(token, token.i)
As you can see, the start attribute gives the index at which the Span begins, whereas the end attribute gives the index at which the Span has already ended.
We can also render the named entities using displacy, the spaCy module we used for visualising dependency parses above.
Note that we must pass the string ent to the style argument to indicate that we wish to visualise named entities.
displacy.render(doc, style='ent')
If you don’t recognise a particular tag used for a named entity, you can always ask spaCy for an explanation.
spacy.explain('NORP')
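We can also combine the loop over named entities with the explain() function to print an explanation next to each label found in the document.
# Print each named entity, its label and an explanation of the label
for ent in doc.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_))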
This section should have given you an idea of some basic natural language processing tasks, how they can be performed using spaCy and what kinds of linguistic annotations they produce.
The following section focuses on how to customise the tasks that spaCy performs on an input text.