```python
# 1. Import spaCy
import spacy

# 2. Create a blank English nlp object
nlp = spacy.blank("en")

# 3. A Doc is created by processing a string of text with the nlp object
doc = nlp("Hello world!")

# 4. Index into the Doc to get a single Token
token = doc[1]

# Get the token text via the .text attribute
print(token.text)

# Iterate over tokens in a Doc
for token in doc:
    print(token.text)

# 5. Create another document with text
doc = nlp("Hello NLP class!")

# A slice from the Doc is a Span object
span = doc[1:3]

# Get the span text via the .text attribute
print(span.text)
```
Demonstrations of the lexical attributes is_alpha, is_punct, and like_num:
doc = nlp("It costs $5.") print("Index: ", [token.i for token in doc]) print("Text: ", [token.text for token in doc])
print("is_alpha:", [token.is_alpha for token in doc]) print("is_punct:", [token.is_punct for token in doc]) print("like_num:", [token.like_num for token in doc])
```python
# Load the small English pipeline
nlp = spacy.load("en_core_web_sm")

# Process a text
doc = nlp("She ate the pizza")

# Iterate over the tokens
for token in doc:
    # Print the text, the predicted part-of-speech tag, and its integer ID
    print(token.text, token.pos_, token.pos)

# Result:
# She PRON 95
# ate VERB 100
# the DET 90
# pizza NOUN 92
```

token.head returns the syntactic head (parent) token:

```python
for token in doc:
    print(token.text, token.pos_, token.dep_, token.head.text)

# She PRON nsubj ate
# ate VERB ROOT ate
# the DET det pizza
# pizza NOUN dobj ate
```
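If a tag or dependency label is unfamiliar, spacy.explain returns a short description from spaCy's built-in glossary:

```python
# Look up quick definitions of tags and labels in spaCy's glossary
print(spacy.explain("PRON"))   # pronoun
print(spacy.explain("nsubj"))  # nominal subject
print(spacy.explain("dobj"))   # direct object
```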
Named Entity Recognition
```python
# Process a text
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Iterate over the predicted entities
for ent in doc.ents:
    # Print the entity text and its label
    print(ent.text, ent.label_)
```
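Entity labels such as "GPE" are terse. As a quick inspection sketch, ent.start_char and ent.end_char give the entity's character offsets in the original text, and spacy.explain glosses the label:

```python
for ent in doc.ents:
    # Character offsets in the original text, plus a gloss for the label
    print(ent.text, ent.start_char, ent.end_char, spacy.explain(ent.label_))
```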
```python
# Import the Matcher
from spacy.matcher import Matcher

# Load a pipeline and create the nlp object
nlp = spacy.load("en_core_web_sm")

# Initialize the matcher with the shared vocab
matcher = Matcher(nlp.vocab)

# Add the pattern to the matcher
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]
matcher.add("IPHONE_PATTERN", [pattern])

# Process some text
doc = nlp("iPhone X news! Upcoming iPhone X release date leaked")

# Call the matcher on the doc
matches = matcher(doc)

# Iterate over the matches
for pattern_id, start, end in matches:
    # Get the matched span
    matched_span = doc[start:end]
    print('"{}" - match for pattern {} in span ({}, {})'.format(
        matched_span.text, pattern_id, start, end))

# "iPhone X" - match for pattern 9528407286733565721 in span (0, 2)
# "iPhone X" - match for pattern 9528407286733565721 in span (5, 7)
```
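The pattern ID printed above is a hash of the pattern name. The original string can be looked up in the shared vocab's StringStore:

```python
# Recover the pattern name from its hash via the shared StringStore
for pattern_id, start, end in matches:
    print(nlp.vocab.strings[pattern_id], "->", doc[start:end].text)
# IPHONE_PATTERN -> iPhone X
# IPHONE_PATTERN -> iPhone X
```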
In a match pattern, the "LEMMA" attribute matches every inflected form of a word. The pattern below is reconstructed to match the output shown: any form of "love" followed by a noun.
doc = nlp("I loved vanilla but now I love chocolate more.")
matches = matcher(doc) for pattern_id, start, end in matches: matched_span = doc[start:end] print('"{}" - match for pattern {} in span ({}, {})'.format(matched_span.text, pattern_id, start, end))
# "loved vanilla" - match for pattern 4358456325055851256 in span (1, 3) # "love chocolate" - match for pattern 4358456325055851256 in span (6, 8)
Operators and quantifiers let you define how often a token should be matched. They can be added using the "OP" key, which can take one of four values (a short example follows the list):
- An "!" negates the token, so it's matched 0 times. - A "?" makes the token optional, and matches it 0 or 1 times. - A "+" matches a token 1 or more times. - And finally, an "*" matches 0 or more times.
Word Vectors
spaCy ships word vectors in its medium and large pipelines (e.g. en_core_web_md).
With vectors loaded, you can inspect a token's vector and compute the similarity of tokens, spans, and documents.
```python
# Load a larger pipeline with vectors
nlp = spacy.load("en_core_web_md")

# Look at one word vector
doc = nlp("I love chocolate")
print(doc[2].vector)

# Compare two documents
doc1 = nlp("I like fast food")
doc2 = nlp("I like pizza")
print("Comparing sentences:", doc1.similarity(doc2))

# Compare two tokens
doc = nlp("I like pizza and pasta")
token1 = doc[2]
token2 = doc[4]
print("Comparing 'pizza' and 'pasta':", token1.similarity(token2))
```
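Similarity is not limited to same-type comparisons; Doc, Span, and Token objects can be compared with each other. A short sketch with invented sentences:

```python
# Compare a document with a token
doc = nlp("I like pizza")
token = nlp("soap")[0]
print("Doc vs. token:", doc.similarity(token))

# Compare a span with a document
span = nlp("I like pizza and pasta")[2:5]  # "pizza and pasta"
doc = nlp("McDonalds sells burgers")
print("Span vs. doc:", span.similarity(doc))
```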