Processing texts using spaCy#

This section introduces you to basic tasks in natural language processing and how they can be performed using a Python library named spaCy.

After reading this section, you should:

  • know some of the key concepts and tasks in natural language processing

  • know how to perform simple natural language processing tasks using the spaCy library

Getting started#

To get started, we import spaCy, one of the many libraries available for natural language processing in Python.

# Import the spaCy library
import spacy

To perform natural language processing tasks for a given language, we must load a language model that has been trained to perform these tasks for the language in question.

spaCy supports many languages, but provides pre-trained language models for only a subset of them.

These language models come in different sizes and flavours. We will explore these models and their differences later.

To get acquainted with basic tasks in natural language processing, we will start with a small language model for the English language.
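
Note that pre-trained models are distributed separately from spaCy itself and must be downloaded before they can be loaded. Assuming a standard spaCy installation and an internet connection, the small English model can typically be fetched on the command line with python -m spacy download en_core_web_sm, or from within Python as sketched below.

# Download the small language model for English
# This only needs to be done once per environment
spacy.cli.download('en_core_web_sm')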

Language models are loaded using spaCy’s load() function, which takes the name of the model as input.

# Load the small language model for English and assign it to the variable 'nlp'
nlp = spacy.load('en_core_web_sm')

# Call the variable to examine the object
nlp

Calling the variable nlp returns a spaCy Language object that contains a language model for the English language.

Essentially, spaCy’s Language object is a pipeline that uses the language model to perform a number of natural language processing tasks. We will return to these tasks shortly.
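
If you want to see which processing components the pipeline contains, you can inspect the pipe_names attribute of the Language object. The exact list depends on the model that was loaded, so treat the output as indicative rather than fixed.

# List the names of the components in the processing pipeline
nlp.pipe_names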

What is a language model?#

Most modern language models are based on statistics instead of human-defined rules.

Statistical language models are based on probabilities, e.g.:

  • What is the probability of a given sentence occurring in a language?

  • How likely is a given word to occur in a sequence of words?

Consider the following sentences from the news articles from the previous sections:

From financial exchanges in HIDDEN Manhattan to cloakrooms in Washington and homeless shelters in California, unfamiliar rituals were the order of the day.

Security precautions were being taken around the HIDDEN as the deadline for Iraq to withdraw from Kuwait neared.

You can probably make informed guesses on the HIDDEN words based on your knowledge of the English language and the world in general.

Similarly, creating a statistical language model involves observing the occurrence of words in large corpora and calculating their probabilities of occurrence in a given context. The language model can then be trained by making predictions and adjusting the model based on the errors made during prediction.
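
To make the idea of counting-based probabilities more concrete, the following sketch estimates simple word probabilities from a tiny made-up corpus. The corpus and the relative-frequency calculation are purely illustrative; real language models are trained on far larger corpora and use far richer representations.

from collections import Counter

# A tiny toy corpus, given here purely for illustration
corpus = "the cat sat on the mat and the dog sat on the rug".split()

# Count how often each word occurs in the toy corpus
counts = Counter(corpus)

# Estimate the probability of each word as its relative frequency
total = sum(counts.values())
probabilities = {word: count / total for word, count in counts.items()}

# The most frequent word receives the highest probability
probabilities['the']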

How are language models trained?#

The small language model for English, for instance, is trained on a corpus called OntoNotes 5.0, which features texts from different genres such as newswire text, broadcast news, broadcast and telephone conversations and blogs.

This allows the corpus to cover linguistic variation in both written and spoken English.

The OntoNotes 5.0 corpus consists of more than just plain text: the annotations include part-of-speech tags, syntactic dependencies and co-references between words.

This allows modelling not just the occurrence of particular words or their sequences, but their grammatical features as well.

Performing basic NLP tasks using spaCy#

To process text using the Language object containing the language model for English, we simply call the Language object nlp on some text.

Let’s begin by defining a simple test sentence, a Python string object that is stored under the variable text.

As usual, we can print out the contents by calling the variable.

# Assign an example sentence to the variable 'text'
text = "The Federal Bureau of Investigation has been ordered to track down as many as 3,000 Iraqis in this country whose visas have expired, the Justice Department said yesterday."

# Call the variable to examine the output
text

Passing the variable text to the Language object nlp returns a spaCy Doc object, short for document.

In natural language processing, longer pieces of text are commonly referred to as documents, although in this case our document consists of a single sentence.

This object contains both the input text stored under text and the results of natural language processing using spaCy.

# Feed the string object under 'text' to the Language object under 'nlp'
# Store the result under the variable 'doc'
doc = nlp(text)

The Doc object is now stored under the variable doc.

# Call the variable to examine the object
doc

Calling the variable doc returns the contents of the object.

Although the output resembles that of a Python string, the Doc object contains a wealth of information about its linguistic structure, which spaCy generated by passing the text through the NLP pipeline.

We will now examine the tasks that were performed under the hood after we provided the input sentence to the language model.

Tokenization#

What takes place first is a task known as tokenization, which breaks the text down into analytical units in need of further processing.

In most cases, tokens correspond to words separated by whitespace, but punctuation marks are also treated as independent tokens. Because computers treat words as sequences of characters, assigning punctuation marks to their own tokens prevents trailing punctuation from attaching to the words that precede them.

The diagram below outlines the tasks that spaCy can perform after a text has been tokenised, such as part-of-speech tagging, syntactic parsing and named entity recognition.

The spaCy pipeline from https://spacy.io/usage/linguistic-features#section-tokenization

A spaCy Doc object consists of a sequence of Token objects, which store the results of various natural language processing tasks.

Let’s print out each Token object stored in the Doc object doc.

# Loop over items in the Doc object, using the variable 'token' to refer to items in the list
for token in doc:  
    
    # Print each token
    print(token)  

The output shows one Token per line. As expected, punctuation marks such as ‘.’ and ‘,’ constitute their own Tokens.
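
If we need the tokens as plain Python strings rather than Token objects, we can collect the text attribute of each Token into a list, as sketched below.

# Collect the text of each Token into a list of Python strings
[token.text for token in doc]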

Part-of-speech tagging#

Part-of-speech (POS) tagging is the task of determining the word class of a token. This is crucial for disambiguation, because different parts of speech may have similar forms.

Consider the example: The sailor dogs the hatch.

The third-person present tense of the verb dog (to fasten something securely) is precisely the same as the plural form of the noun dog: dogs.

To identify the correct word class, we must examine the context in which the word appears.

spaCy provides two types of part-of-speech tags, coarse and fine-grained, which are stored under the attributes pos_ and tag_, respectively.

We can access the attributes of a Python object by inserting the attribute after the object and separating them with a full stop, e.g. token.pos_.

To access the results of POS tagging, let’s loop over the Doc object doc and print each Token and its coarse and fine-grained part-of-speech tags.

# Loop over items in the Doc object, using the variable 'token' to refer to items in the list
for token in doc:
    
    # Print the token and the POS tags
    print(token, token.pos_, token.tag_)

The coarse part-of-speech tags available under the pos_ attribute are based on the Universal Dependencies tag set.

The fine-grained part-of-speech tags under tag_, in turn, are based on the OntoNotes 5.0 corpus introduced above.

In contrast to coarse part-of-speech tags, the fine-grained tags also encode grammatical information. The tags for verbs, for example, are distinguished by aspect and tense.
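
If a fine-grained tag is unfamiliar, spaCy can provide a brief description of it through its explain() function, which is introduced in more detail below. As a quick sketch, the following lines look up two of the fine-grained tags used for verbs by the English models.

# Ask spaCy to explain two fine-grained tags for verbs
print(spacy.explain('VBZ'))
print(spacy.explain('VBD'))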

Morphological analysis#

Morphemes constitute the smallest grammatical units that carry meaning. Two types of morphemes are generally recognised: free morphemes, which consist of words that can stand on their own, and bound morphemes, which cannot stand alone but must be attached to other morphemes. In English, bound morphemes include suffixes such as -s, which is used to indicate the plural form of a noun.

Put differently, morphemes shape the external form of a word, and these forms are associated with given grammatical functions.

spaCy performs morphological analysis automatically and stores the result under the attribute morph of a Token object.

# Loop over items in the Doc object, using the variable 'token' to refer to items in the list
for token in doc:

    # Print the token and the results of morphological analysis
    print(token, token.morph)

As the output shows, not all Tokens have morphological information, because some consist of free morphemes only.

To retrieve morphological information from a Token object, we must use the get() method of the morph attribute.

We can use the brackets [] to access items in the Doc object.

The following line retrieves morphological information about aspect for the Token at index 22 in the Doc object.

# Retrieve morphological information on aspect for the Token at index 22 in the Doc object
doc[22].morph.get('Aspect')

This returns a list with a single string item Perf, which refers to the perfective aspect.

What if we attempt to retrieve a morphological feature that a Token does not have?

Let’s attempt to retrieve information on aspect for the Token at index 21 in the Doc object.

# Retrieve morphological information on aspect for the Token at index 21 in the Doc object
doc[21].morph.get('Aspect')

This returns an empty list, as indicated by the brackets [ ] with nothing between them.

To retrieve all the morphological information available for a given Token, the best solution is to use the to_dict() method of the morph attribute.

This returns a dictionary, a Python data structure consisting of key and value pairs.

# Retrieve morphological information for the Token at index 21 in the Doc object
# Use the to_dict() method to cast the result into a dictionary
doc[21].morph.to_dict()

A Python dictionary is marked by curly brackets { }. Each key/value pair is separated by a colon :. In this case, both keys and values consist of string objects.

The value stored under a key may be accessed by placing the key name in brackets [ ] and placing this right after the name of the dictionary, as shown below.

# Assign morphological information to the dictionary 'morph_dict' 
morph_dict = doc[21].morph.to_dict()

# Get the value corresponding to the key 'Mood'
morph_dict['Mood']

Dictionaries are a powerful data structure in Python, which we will frequently use for storing information.
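
As a small illustration of how a dictionary can be used to organise such results, the following sketch collects the morphological analysis of every Token in the Doc into a single dictionary keyed by Token index. This particular structure is just one possible way of arranging the information.

# Collect the morphological analysis of each Token into a dictionary,
# using the index of each Token as the key
morphology = {token.i: token.morph.to_dict() for token in doc}

# Examine the result for the Token at index 21
morphology[21]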

Syntactic parsing#

Syntactic parsing (or dependency parsing) is the task of defining syntactic dependencies that hold between tokens.

The syntactic dependencies are available under the dep_ attribute of a Token object.

Let’s print out the syntactic dependencies for each Token in the Doc object.

# Loop over items in the Doc object, using the variable 'token' to refer to items in the list
for token in doc:
    
    # Print the token and its dependency tag
    print(token, token.dep_)

Unlike part-of-speech tags that are associated with a single Token, dependency tags indicate a relation that holds between two Tokens.

To better understand the syntactic relations captured by dependency parsing, let’s use some of the additional attributes available for each Token:

  1. i: the position of the Token in the Doc

  2. token: the Token itself

  3. dep_: a tag for the syntactic relation

  4. head and its index i: the Token that governs the current Token, and the position of that Token in the Doc

This illustrates how Python attributes can be used in a flexible manner: the attribute head points to another Token, which naturally has the attribute i that contains its index or position in the Doc. We can combine these two attributes to retrieve this information for any token by referring to .head.i.

# Loop over items in the Doc object, using the variable 'token' to refer to items in the list
for token in doc:
    
    # Print the index of current token, the token itself, the dependency, the head and its index
    print(token.i, token, token.dep_, token.head.i, token.head)

Although the output above helps to clarify the syntactic dependencies between tokens, these relations are generally much easier to perceive in a diagram.

spaCy provides a tool for visualising dependencies. This component of the spaCy library, displacy, can be imported using the following command.

from spacy import displacy

The displacy module has a function named render(), which takes a Doc object as input.

To draw a dependency tree, we provide the Doc object doc to the render() function with two arguments:

  1. style: The value dep instructs displacy to draw a visualisation for syntactic dependencies.

  2. options: This argument takes a Python dictionary as input. We provide a dictionary with the key compact and Boolean value True to instruct displacy to draw a compact tree diagram. Additional options for formatting the visualisation can be found in spaCy documentation.

displacy.render(doc, style='dep', options={'compact': True})

The syntactic dependencies are visualised using lines that lead from the head Token to the Token governed by that head.

The dependency tags are based on Universal Dependencies, a framework for describing morphological and syntactic features across languages (for a theoretical discussion of Universal Dependencies, see de Marneffe et al. 2021).

If you don’t know what a particular tag means, spaCy provides a function for explaining the tags, explain(), which takes a tag as input (note that the tags are case-sensitive).

spacy.explain('pobj')

Finally, if you wonder about the underscores _ in the attribute names: spaCy encodes all strings by mapping them to hash values (a numerical representation) for computational efficiency.

Let’s print out the first Token in the Doc [0] and its dependencies to examine how this works.

print(doc[0], doc[0].dep, doc[0].dep_)

As you can see, the hash value 415 is reserved for the tag corresponding to a determiner (det).

If you want human-readable output for dependency parsing and spaCy returns sequences of numbers, then you most likely forgot to add the underscore to the attribute name.

Sentence segmentation#

spaCy also segments Doc objects into sentences. This task is known as sentence segmentation.

Sentence segmentation imposes additional structure on larger texts. By determining the boundaries of a sentence, we can constrain tasks such as dependency parsing to individual sentences.

spaCy provides access to the results of sentence segmentation via the attribute sents of a Doc object.

Let’s loop over the sentences contained in the Doc object doc and count them using Python’s enumerate() function.

Using the enumerate() function returns a count that increases with each item in the loop.

We assign this count to the variable number, while each sentence is stored under sent. We then print out both at the same time using the print() function.

# Loop over sentences in the Doc object and count them using enumerate()
for number, sent in enumerate(doc.sents):
    
    # Print the sentence number and the sentence
    print(number, sent)

This only returns a single sentence, but the Doc object could easily hold a longer text with multiple sentences, such as an entire newspaper article.
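
To see sentence segmentation applied to an input with several sentences, we can pass a longer text to the Language object. The example text below is made up purely for illustration.

# Feed a made-up text with several sentences to the Language object
doc_long = nlp("The deadline passed at midnight. Negotiations continued in Washington. No agreement was reached.")

# Loop over the sentences and count them using enumerate()
for number, sent in enumerate(doc_long.sents):

    # Print the sentence number and the sentence
    print(number, sent)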

Lemmatization#

A lemma is the base form of a word. Keep in mind that, unless explicitly instructed otherwise, computers cannot tell that singular and plural forms belong to the same word: because their forms differ, they are treated as distinct tokens.

If one wants to count the occurrences of words, for instance, a process known as lemmatization is needed to group together the different forms of the same word.

Lemmas are available for each Token under the attribute lemma_.

# Loop over items in the Doc object, using the variable 'token' to refer to items in the list
for token in doc:
    
    # Print the token and its lemma
    print(token, token.lemma_)
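
To sketch why lemmatization matters for counting words, the following example tallies the lemmas in the Doc using Python’s Counter class; under this scheme, different inflected forms of the same word are counted together.

from collections import Counter

# Count the lemmas of all Tokens in the Doc object
lemma_counts = Counter(token.lemma_ for token in doc)

# Print the five most common lemmas and their counts
print(lemma_counts.most_common(5))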

Named entity recognition (NER)#

Named entity recognition (NER) is the task of recognising and classifying entities named in a text.

spaCy can recognise the named entities annotated in the OntoNotes 5.0 corpus, such as persons, geographic locations and products, to name but a few examples.

We can use the Doc object’s .ents attribute to get the named entities.

doc.ents

This returns a tuple with the named entities.

Each item in the tuple is a spaCy Span object. Span objects can consist of multiple Token objects, as many named entities span multiple Tokens.

The named entities and their types are stored under the attributes .text and .label_ of each Span object.

Let’s loop over the Span objects in the tuple and print out both attributes.

# Loop over the named entities in the Doc object 
for ent in doc.ents:

    # Print the named entity and its label
    print(ent.text, ent.label_)

As you can see, the majority of named entities identified in the Doc consist of multiple Tokens, which is why they are represented as Span objects.

We can verify this by accessing the first named entity under doc.ents, which can be found at position 0, because Python starts counting from zero, and feeding this object to Python’s type() function.

# Check the type of the object used to store named entities
type(doc.ents[0])

spaCy Span objects contain several useful attributes.

Most importantly, the attributes start and end return the indices of Tokens, which determine where the Span starts and ends in the Doc object.

We can examine this in greater detail by printing out the start and end attributes for the first named entity in the document.

# Print the named entity and indices of its start and end Tokens
print(doc.ents[0], doc.ents[0].start, doc.ents[0].end)

The named entity starts at index 0 and ends at index 5 in the Doc object.

If we retrieve the sixth Token in the Doc object (at index 5), we will see that this corresponds to the Token “has”.

doc[5]

This shows that the index returned by the end attribute does not correspond to the last Token in the Span that contains the named entity, but returns the index of the first Token following the Span.

Let’s examine this by looping over the slice of the Doc object that corresponds to the first named entity.

# Loop over a slice of the Doc object that covers the first named entity
for token in doc[doc.ents[0].start: doc.ents[0].end]:
    
    # Print the Token and its index
    print(token, token.i)

As you can see, the start attribute gives the index of the first Token in the Span, whereas the end attribute gives the index of the first Token after the Span.
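
As a quick check, the following sketch verifies that slicing the Doc object using the start and end attributes reproduces exactly the text of the first named entity.

# Verify that slicing the Doc object using 'start' and 'end' reproduces the named entity
doc[doc.ents[0].start: doc.ents[0].end].text == doc.ents[0].text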

We can also render the named entities using displacy, the spaCy module we used for visualising dependency parses above.

Note that we must pass the string ent to the style argument to indicate that we wish to visualise named entities.

displacy.render(doc, style='ent')

If you don’t recognise a particular tag used for a named entity, you can always ask spaCy for an explanation.

spacy.explain('NORP')

This section should have given you an idea of some basic natural language processing tasks, how they can be performed using spaCy and what kinds of linguistic annotations they produce.

The following section focuses on how to customise the tasks that spaCy performs on an input text.