Material from http://www.pieriandata.com.
This commit is contained in:
parent
d87ee2b630
commit
6669ae958f
@ -0,0 +1,226 @@
%%writefile 1.txt
This is a story about cats
our feline pets
Cats are furry animals

%%writefile 2.txt
This story is about surfing
Catching waves is fun
Surfing is a popular water sport
# Build a vocabulary that maps each unique word to an index number:
vocab = {}
i = 1

with open('1.txt') as f:
    x = f.read().lower().split()

for word in x:
    if word in vocab:
        continue
    else:
        vocab[word] = i
        i += 1

print(vocab)

with open('2.txt') as f:
    x = f.read().lower().split()

for word in x:
    if word in vocab:
        continue
    else:
        vocab[word] = i
        i += 1

print(vocab)
# Create an empty vector with space for each word in the vocabulary:
one = ['1.txt'] + [0]*len(vocab)
one

# map the frequencies of each word in 1.txt to our vector:
with open('1.txt') as f:
    x = f.read().lower().split()

for word in x:
    one[vocab[word]] += 1

one

# Do the same for the second document:
two = ['2.txt'] + [0]*len(vocab)

with open('2.txt') as f:
    x = f.read().lower().split()

for word in x:
    two[vocab[word]] += 1

# Compare the two vectors:
print(f'{one}\n{two}')
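The hand-built vectors above are a bag-of-words representation. As a sketch of the same idea with scikit-learn (CountVectorizer is introduced formally further down; get_feature_names_out assumes a recent scikit-learn, older versions use get_feature_names):

from sklearn.feature_extraction.text import CountVectorizer

# Read the two small documents created above
with open('1.txt') as f1, open('2.txt') as f2:
    docs = [f1.read(), f2.read()]

cv = CountVectorizer()
dtm = cv.fit_transform(docs)        # sparse document-term matrix: one row per file
print(cv.get_feature_names_out())   # the learned vocabulary
print(dtm.toarray())                # word counts for 1.txt and 2.txt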
# Perform imports and load the dataset:
import numpy as np
import pandas as pd

df = pd.read_csv('../TextFiles/smsspamcollection.tsv', sep='\t')
df.head()

# Check for missing values:
df.isnull().sum()

# Check the balance of ham vs. spam labels:
df['label'].value_counts()

# Split the data into train and test sets:
from sklearn.model_selection import train_test_split

X = df['message']  # this time we want to look at the text
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Build a count matrix from the training text:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape
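As an optional sanity check (a quick sketch; the word looked up below is arbitrary), the fitted CountVectorizer keeps its vocabulary as a dict mapping each term to a column index, so the second number in the shape above equals the number of unique terms:

print(len(count_vect.vocabulary_))          # number of learned terms == X_train_counts.shape[1]
print(count_vect.vocabulary_.get('text'))   # column index assigned to the word 'text' (None if absent)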
# Transform the raw counts into tf-idf weights:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()

X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

# TfidfVectorizer combines CountVectorizer and TfidfTransformer in one step:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

X_train_tfidf = vectorizer.fit_transform(X_train)  # remember to use the original X_train set
X_train_tfidf.shape
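For intuition, here is a minimal sketch of what the transformer computes with scikit-learn's default settings (smoothed idf followed by L2 row normalisation), checked against TfidfTransformer on a tiny hand-made count matrix:

import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer

counts = np.array([[3, 0, 1],
                   [2, 1, 0]])
n_docs = counts.shape[0]
doc_freq = (counts > 0).sum(axis=0)                    # in how many documents each term appears
idf = np.log((1 + n_docs) / (1 + doc_freq)) + 1        # smoothed idf (sklearn default)
tfidf = counts * idf
tfidf = tfidf / np.linalg.norm(tfidf, axis=1, keepdims=True)   # L2-normalise each row

print(np.allclose(tfidf, TfidfTransformer().fit_transform(counts).toarray()))  # True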
# Train a Linear Support Vector Classifier on the tf-idf features:
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(X_train_tfidf, y_train)

# Combine the vectorizer and the classifier into a single pipeline:
from sklearn.pipeline import Pipeline
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.svm import LinearSVC

text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC()),
])

# Feed the training data through the pipeline
text_clf.fit(X_train, y_train)

# Form a prediction set
predictions = text_clf.predict(X_test)

# Report the confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_test, predictions))

# Print a classification report
print(metrics.classification_report(y_test, predictions))

# Print the overall accuracy
print(metrics.accuracy_score(y_test, predictions))
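Because the pipeline bundles the vectorizer with the classifier, it can score raw strings directly; a quick sketch (the two messages below are made-up examples, not taken from the dataset):

print(text_clf.predict(["Congratulations! You have won a free prize, text WIN to claim now"]))
print(text_clf.predict(["Hey, are we still meeting for lunch today?"]))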
@ -0,0 +1,296 @@
# Perform imports and load the movie reviews dataset:
import numpy as np
import pandas as pd

df = pd.read_csv('../TextFiles/moviereviews.tsv', sep='\t')
df.head()

len(df)

# Render the first review as Markdown:
from IPython.display import Markdown, display
display(Markdown('> '+df['review'][0]))

# Check for the existence of NaN values in a cell:
df.isnull().sum()

# Remove rows with missing reviews:
df.dropna(inplace=True)

len(df)

# Detect reviews that contain only whitespace:
blanks = []  # start with an empty list

for i,lb,rv in df.itertuples():  # iterate over the DataFrame
    if type(rv)==str:            # avoid NaN values
        if rv.isspace():         # test 'review' for whitespace
            blanks.append(i)     # add matching index numbers to the list

print(len(blanks), 'blanks: ', blanks)

# Drop the whitespace-only reviews:
df.drop(blanks, inplace=True)

len(df)

# Check the balance of positive vs. negative labels:
df['label'].value_counts()
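The itertuples loop above works; the same whitespace check can also be written in vectorized pandas (a sketch over the same df), which is convenient on larger datasets:

blank_mask = df['review'].str.isspace()
print(blank_mask.sum(), 'whitespace-only reviews')
df = df[~blank_mask]   # equivalent to collecting the index values and calling df.drop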
# Split the data into train and test sets:
from sklearn.model_selection import train_test_split

X = df['review']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Build two pipelines, one per classifier:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Naïve Bayes:
text_clf_nb = Pipeline([('tfidf', TfidfVectorizer()),
                        ('clf', MultinomialNB()),
])

# Linear SVC:
text_clf_lsvc = Pipeline([('tfidf', TfidfVectorizer()),
                          ('clf', LinearSVC()),
])
# Train the Naïve Bayes pipeline:
text_clf_nb.fit(X_train, y_train)

# Form a prediction set
predictions = text_clf_nb.predict(X_test)

# Report the confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_test, predictions))

# Print a classification report
print(metrics.classification_report(y_test, predictions))

# Print the overall accuracy
print(metrics.accuracy_score(y_test, predictions))
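Accuracy alone hides which reviews the model gets wrong. A small sketch for error analysis, assuming the Naïve Bayes predictions computed just above are still in scope:

import pandas as pd

results = pd.DataFrame({'review': X_test, 'true': y_test, 'predicted': predictions})
print(results[results['true'] != results['predicted']].head())   # a few misclassified reviews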
# Train the Linear SVC pipeline:
text_clf_lsvc.fit(X_train, y_train)

# Form a prediction set
predictions = text_clf_lsvc.predict(X_test)

# Report the confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_test, predictions))

# Print a classification report
print(metrics.classification_report(y_test, predictions))

# Print the overall accuracy
print(metrics.accuracy_score(y_test, predictions))
# Define a custom list of stopwords to ignore:
stopwords = ['a', 'about', 'an', 'and', 'are', 'as', 'at', 'be', 'been', 'but', 'by', 'can', \
             'even', 'ever', 'for', 'from', 'get', 'had', 'has', 'have', 'he', 'her', 'hers', 'his', \
             'how', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'just', 'me', 'my', 'of', 'on', 'or', \
             'see', 'seen', 'she', 'so', 'than', 'that', 'the', 'their', 'there', 'they', 'this', \
             'to', 'was', 'we', 'were', 'what', 'when', 'which', 'who', 'will', 'with', 'you']
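As a quick check of what this list actually does (a sketch, assuming X_train from the split above is still in memory), compare the vocabulary sizes learned with and without it:

from sklearn.feature_extraction.text import TfidfVectorizer

full_vocab = TfidfVectorizer().fit(X_train)
reduced_vocab = TfidfVectorizer(stop_words=stopwords).fit(X_train)
print(len(full_vocab.vocabulary_), 'terms without stopword removal')
print(len(reduced_vocab.vocabulary_), 'terms with the custom stopword list')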
# YOU DO NOT NEED TO RUN THIS CELL UNLESS YOU HAVE
# RECENTLY OPENED THIS NOTEBOOK OR RESTARTED THE KERNEL:

import numpy as np
import pandas as pd

df = pd.read_csv('../TextFiles/moviereviews.tsv', sep='\t')
df.dropna(inplace=True)
blanks = []
for i,lb,rv in df.itertuples():
    if type(rv)==str:
        if rv.isspace():
            blanks.append(i)
df.drop(blanks, inplace=True)
from sklearn.model_selection import train_test_split
X = df['review']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn import metrics

# RUN THIS CELL TO ADD STOPWORDS TO THE LINEAR SVC PIPELINE:
text_clf_lsvc2 = Pipeline([('tfidf', TfidfVectorizer(stop_words=stopwords)),
                           ('clf', LinearSVC()),
])
text_clf_lsvc2.fit(X_train, y_train)

# Evaluate the stopword-filtered model:
predictions = text_clf_lsvc2.predict(X_test)
print(metrics.confusion_matrix(y_test, predictions))

print(metrics.classification_report(y_test, predictions))

print(metrics.accuracy_score(y_test, predictions))
# YOU DO NOT NEED TO RUN THIS CELL UNLESS YOU HAVE
# RECENTLY OPENED THIS NOTEBOOK OR RESTARTED THE KERNEL:

import numpy as np
import pandas as pd

df = pd.read_csv('../TextFiles/moviereviews.tsv', sep='\t')
df.dropna(inplace=True)
blanks = []
for i,lb,rv in df.itertuples():
    if type(rv)==str:
        if rv.isspace():
            blanks.append(i)
df.drop(blanks, inplace=True)
from sklearn.model_selection import train_test_split
X = df['review']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn import metrics

# Naïve Bayes Model:
text_clf_nb = Pipeline([('tfidf', TfidfVectorizer()),
                        ('clf', MultinomialNB()),
])

# Linear SVC Model:
text_clf_lsvc = Pipeline([('tfidf', TfidfVectorizer()),
                          ('clf', LinearSVC()),
])

# Train both models on the moviereviews.tsv training set:
text_clf_nb.fit(X_train, y_train)
text_clf_lsvc.fit(X_train, y_train)

# Classify a review written outside the dataset:
myreview = "A movie I really wanted to love was terrible. \
I'm sure the producers had the best intentions, but the execution was lacking."

# Use this space to write your own review. Experiment with different lengths and writing styles.
myreview =

print(text_clf_nb.predict([myreview]))   # be sure to put "myreview" inside square brackets

print(text_clf_lsvc.predict([myreview]))
@ -11,7 +11,7 @@
 },
 {
  "cell_type": "code",
- "execution_count": 12,
+ "execution_count": 18,
  "id": "53a214ae-c9cf-4d46-925d-068f1685537b",
  "metadata": {},
  "outputs": [
@ -57,7 +57,7 @@
 "data = {\n",
 " \"text\": [\n",
 " \"Saya suka produk ini, luar biasa\",\n",
- " \"Layanannya buruk, sangat kecewa\",\n",
+ " \"Layanannya buruk, saya sangat kecewa\",\n",
 " \"Pembelian terbaik yang pernah saya lakukan\",\n",
 " \"Saya benci produk ini, buang-buang uang\",\n",
 " \"Kualitasnya sangat bagus, direkomendasikan\",\n",
@ -109,7 +109,7 @@
 "# -----------------------------------------\n",
 "# 6. Prediksi Teks Baru\n",
 "# -----------------------------------------\n",
- "sample_text = [\"barang bagus luar biasa\"]\n",
+ "#sample_text = [\"barang bagus luar biasa\"]\n",
 "sample_text = [\"barang buruk, saya kecewa\"]\n",
 "sample_vec = tfidf.transform(sample_text)\n",
 "prediction = model.predict(sample_vec)\n",
@ -120,9 +120,34 @@
 },
 {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 20,
  "id": "9f7d90fe-4af4-446c-9547-c9312bfa6fc7",
  "metadata": {},
+ "outputs": [
+  {
+   "name": "stdout",
+   "output_type": "stream",
+   "text": [
+    "\n",
+    "Prediksi untuk: saya benci barang ini\n",
+    "Hasil: negative\n"
+   ]
+  }
+ ],
+ "source": [
+  "#sample_text = [\"barang bagus luar biasa\"]\n",
+  "sample_text = [\"saya benci barang ini\"]\n",
+  "sample_vec = tfidf.transform(sample_text)\n",
+  "prediction = model.predict(sample_vec)\n",
+  "print(\"\\nPrediksi untuk:\", sample_text[0])\n",
+  "print(\"Hasil:\", prediction[0])\n"
+ ]
+},
+{
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d4b9a7c2-0f08-43fd-8da8-018d839a4917",
+ "metadata": {},
 "outputs": [],
 "source": []
 }
Binary file not shown. (After: 5.8 KiB)
Binary file not shown. (After: 19 KiB)
Binary file not shown. (After: 7.2 KiB)
Binary file not shown. (After: 7.7 KiB)
Binary file not shown. (After: 38 KiB)
@ -0,0 +1,814 @@

# Working with Text Files
In this section we'll cover
 * Working with f-strings (formatted string literals) to format printed text
 * Working with Files - opening, reading, writing and appending text files

## Formatted String Literals (f-strings)

Introduced in Python 3.6, f-strings offer several benefits over the older `.format()` string method. For one, you can bring outside variables immediately into the string rather than pass them through as keyword arguments:

name = 'Fred'

# Using the old .format() method:
print('His name is {var}.'.format(var=name))

# Using f-strings:
print(f'His name is {name}.')

Pass `!r` to get the string representation:

print(f'His name is {name!r}')

Be careful not to let quotation marks in the replacement fields conflict with the quoting used in the outer string:

d = {'a':123,'b':456}

print(f'Address: {d['a']} Main Street')   # SyntaxError: invalid syntax

Instead, use different styles of quotation marks:

d = {'a':123,'b':456}

print(f"Address: {d['a']} Main Street")   # Address: 123 Main Street

### Minimum Widths, Alignment and Padding
You can pass arguments inside a nested set of curly braces to set a minimum width for the field, the alignment and even padding characters.

library = [('Author', 'Topic', 'Pages'), ('Twain', 'Rafting', 601), ('Feynman', 'Physics', 95), ('Hamilton', 'Mythology', 144)]

for book in library:
    print(f'{book[0]:{10}} {book[1]:{8}} {book[2]:{7}}')

Here the first three lines align, except `Pages` follows a default left-alignment while numbers are right-aligned. Also, the fourth line's page number is pushed to the right as `Mythology` exceeds the minimum field width of `8`. When setting minimum field widths make sure to take the longest item into account.

To set the alignment, use the character `<` for left-align, `^` for center, `>` for right.
To set padding, precede the alignment character with the padding character (`-` and `.` are common choices).

Let's make some adjustments:

for book in library:
    print(f'{book[0]:{10}} {book[1]:{10}} {book[2]:.>{7}}') # here .> was added

### Date Formatting

from datetime import datetime

today = datetime(year=2018, month=1, day=27)

print(f'{today:%B %d, %Y}')   # January 27, 2018

For more info on formatted string literals visit https://docs.python.org/3/reference/lexical_analysis.html#f-strings

# Files

Python uses file objects to interact with external files on your computer. These file objects can be any sort of file you have on your computer, whether it be an audio file, a text file, emails, Excel documents, etc. Note: You will probably need to install certain libraries or modules to interact with those various file types, but they are easily available. (We will cover downloading modules later on in the course).

Python has a built-in open function that allows us to open and play with basic file types. First we will need a file though. We're going to use some IPython magic to create a text file!

## Creating a File with IPython
#### This function is specific to jupyter notebooks! Alternatively, quickly create a simple .txt file with Sublime text editor.

%%writefile test.txt
Hello, this is a quick test file.
This is the second line of the file.

## Python Opening a File

### Know Your File's Location

It's easy to get an error on this step:

myfile = open('whoops.txt')
# FileNotFoundError: [Errno 2] No such file or directory: 'whoops.txt'

To avoid this error, make sure your .txt file is saved in the same location as your notebook. To check your notebook location, use **pwd**:

pwd
# 'C:\\Users\\Mike\\NLP-Bootcamp\\00-Python-Text-Basics'

**Alternatively, to grab files from any location on your computer, simply pass in the entire file path.**

For Windows you need to use double \ so python doesn't treat the second \ as an escape character; a file path takes the form:

    myfile = open("C:\\Users\\YourUserName\\Home\\Folder\\myfile.txt")

For MacOS and Linux you use slashes in the opposite direction:

    myfile = open("/Users/YourUserName/Folder/myfile.txt")

# Open the test.txt file we created earlier
my_file = open('test.txt')

my_file
# <_io.TextIOWrapper name='test.txt' mode='r' encoding='cp1252'>

`my_file` is now an open file object held in memory. We'll perform some reading and writing exercises, and then we have to close the file to free up memory.

### .read() and .seek()

# We can now read the file
my_file.read()
# 'Hello, this is a quick test file.\nThis is the second line of the file.'

# But what happens if we try to read it again?
my_file.read()
# ''

This happens because you can imagine the reading "cursor" is at the end of the file after having read it. So there is nothing left to read. We can reset the "cursor" like this:

# Seek to the start of file (index 0)
my_file.seek(0)

# Now read again
my_file.read()

### .readlines()
You can read a file line by line using the readlines method. Use caution with large files, since everything will be held in memory. We will learn how to iterate over large files later in the course.

# Readlines returns a list of the lines in the file
my_file.seek(0)
my_file.readlines()
# ['Hello, this is a quick test file.\n', 'This is the second line of the file.']

When you have finished using a file, it is always good practice to close it.

my_file.close()

## Writing to a File

By default, the `open()` function will only allow us to read the file. We need to pass the argument `'w'` to write over the file. For example:

# Add a second argument to the function, 'w' which stands for write.
# Passing 'w+' lets us read and write to the file

my_file = open('test.txt','w+')

**Use caution!** Opening a file with 'w' or 'w+' *truncates the original*, meaning that anything that was in the original file **is deleted**!

# Write to the file
my_file.write('This is a new first line')

# Read the file
my_file.seek(0)
my_file.read()
# 'This is a new first line'

my_file.close()  # always do this when you're done with a file

## Appending to a File
Passing the argument `'a'` opens the file and puts the pointer at the end, so anything written is appended. Like `'w+'`, `'a+'` lets us read and write to a file. If the file does not exist, one will be created.

my_file = open('test.txt','a+')
my_file.write('\nThis line is being appended to test.txt')
my_file.write('\nAnd another line here.')

my_file.seek(0)
print(my_file.read())

my_file.close()

### Appending with `%%writefile`
Jupyter notebook users can do the same thing using IPython cell magic:

%%writefile -a test.txt

This is more text being appended to test.txt
And another line here.

Add a blank space if you want the first line to begin on its own line, as Jupyter won't recognize escape sequences like `\n`

## Aliases and Context Managers
You can assign temporary variable names as aliases, and manage the opening and closing of files automatically using a context manager:

with open('test.txt','r') as txt:
    first_line = txt.readlines()[0]

print(first_line)

Note that the `with ... as ...:` context manager automatically closed `test.txt` after assigning the first line of text to first_line:

txt.read()
# ValueError: I/O operation on closed file.

## Iterating through a File

with open('test.txt','r') as txt:
    for line in txt:
        print(line, end='')  # the end='' argument removes extra linebreaks

Great! Now you should be familiar with formatted string literals and working with text files.
## Next up: Working with PDF Text
@ -0,0 +1,653 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Working with Text Files\n",
|
||||||
|
"In this section we'll cover\n",
|
||||||
|
" * Working with f-strings (formatted string literals) to format printed text\n",
|
||||||
|
" * Working with Files - opening, reading, writing and appending text files"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Formatted String Literals (f-strings)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Introduced in Python 3.6, <strong>f-strings</strong> offer several benefits over the older `.format()` string method. <br>For one, you can bring outside variables immediately into to the string rather than pass them through as keyword arguments:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"His name is Fred.\n",
|
||||||
|
"His name is Fred.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"name = 'Fred'\n",
|
||||||
|
"\n",
|
||||||
|
"# Using the old .format() method:\n",
|
||||||
|
"print('His name is {var}.'.format(var=name))\n",
|
||||||
|
"\n",
|
||||||
|
"# Using f-strings:\n",
|
||||||
|
"print(f'His name is {name}.')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Pass `!r` to get the <strong>string representation</strong>:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"His name is 'Fred'\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(f'His name is {name!r}')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Be careful not to let quotation marks in the replacement fields conflict with the quoting used in the outer string:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Address: 123 Main Street\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"d = {'a':123,'b':456}\n",
|
||||||
|
"\n",
|
||||||
|
"print(f'Address: {d['a']} Main Street')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Instead, use different styles of quotation marks:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Address: 123 Main Street\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"d = {'a':123,'b':456}\n",
|
||||||
|
"\n",
|
||||||
|
"print(f\"Address: {d['a']} Main Street\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Minimum Widths, Alignment and Padding\n",
|
||||||
|
"You can pass arguments inside a nested set of curly braces to set a minimum width for the field, the alignment and even padding characters."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Author Topic Pages \n",
|
||||||
|
"Twain Rafting 601\n",
|
||||||
|
"Feynman Physics 95\n",
|
||||||
|
"Hamilton Mythology 144\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"library = [('Author', 'Topic', 'Pages'), ('Twain', 'Rafting', 601), ('Feynman', 'Physics', 95), ('Hamilton', 'Mythology', 144)]\n",
|
||||||
|
"\n",
|
||||||
|
"for book in library:\n",
|
||||||
|
" print(f'{book[0]:{10}} {book[1]:{8}} {book[2]:{7}}')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Here the first three lines align, except `Pages` follows a default left-alignment while numbers are right-aligned. Also, the fourth line's page number is pushed to the right as `Mythology` exceeds the minimum field width of `8`. When setting minimum field widths make sure to take the longest item into account.\n",
|
||||||
|
"\n",
|
||||||
|
"To set the alignment, use the character `<` for left-align, `^` for center, `>` for right.<br>\n",
|
||||||
|
"To set padding, precede the alignment character with the padding character (`-` and `.` are common choices).\n",
|
||||||
|
"\n",
|
||||||
|
"Let's make some adjustments:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Author Topic ..Pages\n",
|
||||||
|
"Twain Rafting ....601\n",
|
||||||
|
"Feynman Physics .....95\n",
|
||||||
|
"Hamilton Mythology ....144\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for book in library:\n",
|
||||||
|
" print(f'{book[0]:{10}} {book[1]:{10}} {book[2]:.>{7}}') # here .> was added"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Date Formatting"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"January 27, 2018\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from datetime import datetime\n",
|
||||||
|
"\n",
|
||||||
|
"today = datetime(year=2018, month=1, day=27)\n",
|
||||||
|
"\n",
|
||||||
|
"print(f'{today:%B %d, %Y}')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"For more info on formatted string literals visit https://docs.python.org/3/reference/lexical_analysis.html#f-strings\n",
|
||||||
|
"\n",
|
||||||
|
"***"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Files\n",
|
||||||
|
"\n",
|
||||||
|
"Python uses file objects to interact with external files on your computer. These file objects can be any sort of file you have on your computer, whether it be an audio file, a text file, emails, Excel documents, etc. Note: You will probably need to install certain libraries or modules to interact with those various file types, but they are easily available. (We will cover downloading modules later on in the course).\n",
|
||||||
|
"\n",
|
||||||
|
"Python has a built-in open function that allows us to open and play with basic file types. First we will need a file though. We're going to use some IPython magic to create a text file!\n",
|
||||||
|
"\n",
|
||||||
|
"## Creating a File with IPython\n",
|
||||||
|
"#### This function is specific to jupyter notebooks! Alternatively, quickly create a simple .txt file with Sublime text editor."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Overwriting test.txt\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"%%writefile test.txt\n",
|
||||||
|
"Hello, this is a quick test file.\n",
|
||||||
|
"This is the second line of the file."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Python Opening a File\n",
|
||||||
|
"\n",
|
||||||
|
"### Know Your File's Location\n",
|
||||||
|
"\n",
|
||||||
|
"It's easy to get an error on this step:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"ename": "FileNotFoundError",
|
||||||
|
"evalue": "[Errno 2] No such file or directory: 'whoops.txt'",
|
||||||
|
"output_type": "error",
|
||||||
|
"traceback": [
|
||||||
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||||
|
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
|
||||||
|
"Cell \u001b[0;32mIn[9], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m myfile \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mwhoops.txt\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
|
||||||
|
"File \u001b[0;32m/opt/conda/lib/python3.12/site-packages/IPython/core/interactiveshell.py:324\u001b[0m, in \u001b[0;36m_modified_open\u001b[0;34m(file, *args, **kwargs)\u001b[0m\n\u001b[1;32m 317\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m {\u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m}:\n\u001b[1;32m 318\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 319\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIPython won\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt let you open fd=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m by default \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 320\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mas it is likely to crash IPython. If you know what you are doing, \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 321\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124myou can use builtins\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m open.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 322\u001b[0m )\n\u001b[0;32m--> 324\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m io_open(file, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
|
||||||
|
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'whoops.txt'"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"myfile = open('whoops.txt')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"To avoid this error, make sure your .txt file is saved in the same location as your notebook. To check your notebook location, use **pwd**:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"pwd"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**Alternatively, to grab files from any location on your computer, simply pass in the entire file path. **\n",
|
||||||
|
"\n",
|
||||||
|
"For Windows you need to use double \\ so python doesn't treat the second \\ as an escape character, a file path is in the form:\n",
|
||||||
|
"\n",
|
||||||
|
" myfile = open(\"C:\\\\Users\\\\YourUserName\\\\Home\\\\Folder\\\\myfile.txt\")\n",
|
||||||
|
"\n",
|
||||||
|
"For MacOS and Linux you use slashes in the opposite direction:\n",
|
||||||
|
"\n",
|
||||||
|
" myfile = open(\"/Users/YourUserName/Folder/myfile.txt\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Open the text.txt file we created earlier\n",
|
||||||
|
"my_file = open('test.txt')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"my_file"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"`my_file` is now an open file object held in memory. We'll perform some reading and writing exercises, and then we have to close the file to free up memory.\n",
|
||||||
|
"\n",
|
||||||
|
"### .read() and .seek()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# We can now read the file\n",
|
||||||
|
"my_file.read()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# But what happens if we try to read it again?\n",
|
||||||
|
"my_file.read()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"This happens because you can imagine the reading \"cursor\" is at the end of the file after having read it. So there is nothing left to read. We can reset the \"cursor\" like this:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Seek to the start of file (index 0)\n",
|
||||||
|
"my_file.seek(0)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Now read again\n",
|
||||||
|
"my_file.read()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### .readlines()\n",
|
||||||
|
"You can read a file line by line using the readlines method. Use caution with large files, since everything will be held in memory. We will learn how to iterate over large files later in the course."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Readlines returns a list of the lines in the file\n",
|
||||||
|
"my_file.seek(0)\n",
|
||||||
|
"my_file.readlines()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"When you have finished using a file, it is always good practice to close it."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"my_file.close()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Writing to a File\n",
|
||||||
|
"\n",
|
||||||
|
"By default, the `open()` function will only allow us to read the file. We need to pass the argument `'w'` to write over the file. For example:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Add a second argument to the function, 'w' which stands for write.\n",
|
||||||
|
"# Passing 'w+' lets us read and write to the file\n",
|
||||||
|
"\n",
|
||||||
|
"my_file = open('test.txt','w+')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<div class=\"alert alert-danger\" style=\"margin: 20px\">**Use caution!**<br>\n",
|
||||||
|
"Opening a file with 'w' or 'w+' *truncates the original*, meaning that anything that was in the original file **is deleted**!</div>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Write to the file\n",
|
||||||
|
"my_file.write('This is a new first line')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Read the file\n",
|
||||||
|
"my_file.seek(0)\n",
|
||||||
|
"my_file.read()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"my_file.close() # always do this when you're done with a file"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Appending to a File\n",
|
||||||
|
"Passing the argument `'a'` opens the file and puts the pointer at the end, so anything written is appended. Like `'w+'`, `'a+'` lets us read and write to a file. If the file does not exist, one will be created."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"my_file = open('test.txt','a+')\n",
|
||||||
|
"my_file.write('\\nThis line is being appended to test.txt')\n",
|
||||||
|
"my_file.write('\\nAnd another line here.')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"my_file.seek(0)\n",
|
||||||
|
"print(my_file.read())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"my_file.close()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Appending with `%%writefile`\n",
|
||||||
|
"Jupyter notebook users can do the same thing using IPython cell magic:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"%%writefile -a test.txt\n",
|
||||||
|
"\n",
|
||||||
|
"This is more text being appended to test.txt\n",
|
||||||
|
"And another line here."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Add a blank space if you want the first line to begin on its own line, as Jupyter won't recognize escape sequences like `\\n`"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Aliases and Context Managers\n",
|
||||||
|
"You can assign temporary variable names as aliases, and manage the opening and closing of files automatically using a context manager:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"with open('test.txt','r') as txt:\n",
|
||||||
|
" first_line = txt.readlines()[0]\n",
|
||||||
|
" \n",
|
||||||
|
"print(first_line)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Note that the `with ... as ...:` context manager automatically closed `test.txt` after assigning the first line of text to first_line:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
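"# The file was closed when the with-block ended, so this raises ValueError: I/O operation on closed file\n",
|
||||||
|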
"txt.read()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Iterating through a File"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"with open('test.txt','r') as txt:\n",
|
||||||
|
" for line in txt:\n",
|
||||||
|
" print(line, end='') # the end='' argument removes extra linebreaks"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Great! Now you should be familiar with formatted string literals and working with text files.\n",
|
||||||
|
"## Next up: Working with PDF Text"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.12.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 4
|
||||||
|
}
|
||||||
@ -0,0 +1,340 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Working with PDF Files\n",
|
||||||
|
"\n",
|
||||||
|
"Often you will have to deal with PDF files. There are [many libraries in Python for working with PDFs](https://reachtim.com/articles/PDF-Manipulation.html), each with their pros and cons, the most common one being **PyPDF2**. You can install it with (note the case-sensitivity, you need to make sure your capitilization matches):\n",
|
||||||
|
"\n",
|
||||||
|
" pip install PyPDF2\n",
|
||||||
|
" \n",
|
||||||
|
"Keep in mind that not every PDF file can be read with this library. PDFs that are too blurry, have a special encoding, encrypted, or maybe just created with a particular program that doesn't work well with PyPDF2 won't be able to be read. If you find yourself in this situation, try using the libraries linked above, but keep in mind, these may also not work. The reason for this is because of the many different parameters for a PDF and how non-standard the settings can be, text could be shown as an image instead of a utf-8 encoding. There are many parameters to consider in this aspect.\n",
|
||||||
|
"\n",
|
||||||
|
"As far as PyPDF2 is concerned, it can only read the text from a PDF document, it won't be able to grab images or other media files from a PDF.\n",
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"## Working with PyPDF2\n",
|
||||||
|
"\n",
|
||||||
|
"Let's begin by showing the basics of the PyPDF2 library."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# note the capitalization\n",
|
||||||
|
"import PyPDF2"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Reading PDFs\n",
|
||||||
|
"\n",
|
||||||
|
"First we open a pdf, then create a reader object for it. Notice how we use the binary method of reading , 'rb', instead of just 'r'."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Notice we read it as a binary with 'rb'\n",
|
||||||
|
"f = open('US_Declaration.pdf','rb')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
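"# Create a PdfFileReader object from the open file handle\n",
|
||||||
|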
"pdf_reader = PyPDF2.PdfFileReader(f)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"5"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"pdf_reader.numPages"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
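"# Grab the first page (page numbering starts at zero)\n",
|
||||||
|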
"page_one = pdf_reader.getPage(0)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"We can then extract the text:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"page_one_text = page_one.extractText()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"\"Declaration of IndependenceIN CONGRESS, July 4, 1776. The unanimous Declaration of the thirteen united States of America, When in the Course of human events, it becomes necessary for one people to dissolve the\\npolitical bands which have connected them with another, and to assume among the powers of the\\nearth, the separate and equal station to which the Laws of Nature and of Nature's God entitle\\n\\nthem, a decent respect to the opinions of mankind requires that they should declare the causes\\n\\nwhich impel them to the separation. \\nWe hold these truths to be self-evident, that all men are created equal, that they are endowed by\\n\\ntheir Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit\\nof Happiness.ŠThat to secure these rights, Governments are instituted among Men, deriving\\n\\ntheir just powers from the consent of the governed,ŠThat whenever any Form of Government\\nbecomes destructive of these ends, it is the Right of the People to alter or to abolish it, and to\\ninstitute new Government, laying its foundation on such principles and organizing its powers in\\nsuch form, as to them shall seem most likely to effect their Safety and Happiness. Prudence,\\n\\nindeed, will dictate that Governments long established should not be changed for light and\\ntransient causes; and accordingly all experience hath shewn, that mankind are more disposed to\\nsuffer, while evils are sufferable, than to right themselves by abolishing the forms to which they\\n\\nare accustomed. But when a long train of abuses and usurpations, pursuing invariably the same\\nObject evinces a design to reduce them under absolute Despotism, it is their right, it is their duty,\\nto throw off such Government, and to provide new Guards for their future security.ŠSuch has\\nbeen the patient sufferance of these Colonies; and such is now the necessity which constrains\\n\\nthem to alter their former Systems of Government. The history of the present King of Great\\n\\nBritain is a history of repeated injuries and usurpations, all having in direct object the\\nestablishment of an absolute Tyranny over these States. To prove this, let Facts be submitted to a\\ncandid world. He has refused his Assent to Laws, the most wholesome and necessary for the\\npublic good.\\nHe has forbidden his Governors to pass Laws of immediate and pressing\\nimportance, unless suspended in their operation till his Assent should be obtained;\\nand when so suspended, he has utterly neglected to attend to them.\\n\\nHe has refused to pass other Laws for the accommodation of large districts of\\npeople, unless those people would relinquish the right of Representation in the\\nLegislature, a right inestimable to them and formidable to tyrants only. \\n\\nHe has called together legislative bodies at places unusual, uncomfortable, and distant\\nfrom the depository of their public Records, for the sole purpose of fatiguing them into\\ncompliance with his measures.\\n\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"page_one_text"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"f.close()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Adding to PDFs\n",
|
||||||
|
"\n",
|
||||||
|
"We can not write to PDFs using Python because of the differences between the single string type of Python, and the variety of fonts, placements, and other parameters that a PDF could have.\n",
|
||||||
|
"\n",
|
||||||
|
"What we *can* do is copy pages and append pages to the end."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"f = open('US_Declaration.pdf','rb')\n",
|
||||||
|
"pdf_reader = PyPDF2.PdfFileReader(f)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"first_page = pdf_reader.getPage(0)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
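"# Create a writer object that will hold the copied pages\n",
|
||||||
|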
"pdf_writer = PyPDF2.PdfFileWriter()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
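"# Add the page grabbed above to the writer object\n",
|
||||||
|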
"pdf_writer.addPage(first_page)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
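"# Open a brand new file in write-binary ('wb') mode\n",
|
||||||
|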
"pdf_output = open(\"Some_New_Doc.pdf\",\"wb\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
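"# Write the page(s) held by the writer object out to the new file\n",
|
||||||
|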
"pdf_writer.write(pdf_output)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 15,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"pdf_output.close()\n",
|
||||||
|
"f.close()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Now we have copied a page and added it to another new document!"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Simple Example\n",
|
||||||
|
"\n",
|
||||||
|
"Let's try to grab all the text from this PDF file:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 16,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"f = open('US_Declaration.pdf','rb')\n",
|
||||||
|
"\n",
|
||||||
|
"# List of every page's text.\n",
|
||||||
|
"# The index will correspond to the page number.\n",
|
||||||
|
"pdf_text = [0] # zero is a placehoder to make page 1 = index 1\n",
|
||||||
|
"\n",
|
||||||
|
"pdf_reader = PyPDF2.PdfFileReader(f)\n",
|
||||||
|
"\n",
|
||||||
|
"for p in range(pdf_reader.numPages):\n",
|
||||||
|
" \n",
|
||||||
|
" page = pdf_reader.getPage(p)\n",
|
||||||
|
" \n",
|
||||||
|
" pdf_text.append(page.extractText())\n",
|
||||||
|
"\n",
|
||||||
|
"f.close()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 17,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[0,\n",
|
||||||
|
" \"Declaration of IndependenceIN CONGRESS, July 4, 1776. The unanimous Declaration of the thirteen united States of America, When in the Course of human events, it becomes necessary for one people to dissolve the\\npolitical bands which have connected them with another, and to assume among the powers of the\\nearth, the separate and equal station to which the Laws of Nature and of Nature's God entitle\\n\\nthem, a decent respect to the opinions of mankind requires that they should declare the causes\\n\\nwhich impel them to the separation. \\nWe hold these truths to be self-evident, that all men are created equal, that they are endowed by\\n\\ntheir Creator with certain unalienable Rights, that among these are Life, Liberty and the pursuit\\nof Happiness.ŠThat to secure these rights, Governments are instituted among Men, deriving\\n\\ntheir just powers from the consent of the governed,ŠThat whenever any Form of Government\\nbecomes destructive of these ends, it is the Right of the People to alter or to abolish it, and to\\ninstitute new Government, laying its foundation on such principles and organizing its powers in\\nsuch form, as to them shall seem most likely to effect their Safety and Happiness. Prudence,\\n\\nindeed, will dictate that Governments long established should not be changed for light and\\ntransient causes; and accordingly all experience hath shewn, that mankind are more disposed to\\nsuffer, while evils are sufferable, than to right themselves by abolishing the forms to which they\\n\\nare accustomed. But when a long train of abuses and usurpations, pursuing invariably the same\\nObject evinces a design to reduce them under absolute Despotism, it is their right, it is their duty,\\nto throw off such Government, and to provide new Guards for their future security.ŠSuch has\\nbeen the patient sufferance of these Colonies; and such is now the necessity which constrains\\n\\nthem to alter their former Systems of Government. The history of the present King of Great\\n\\nBritain is a history of repeated injuries and usurpations, all having in direct object the\\nestablishment of an absolute Tyranny over these States. To prove this, let Facts be submitted to a\\ncandid world. He has refused his Assent to Laws, the most wholesome and necessary for the\\npublic good.\\nHe has forbidden his Governors to pass Laws of immediate and pressing\\nimportance, unless suspended in their operation till his Assent should be obtained;\\nand when so suspended, he has utterly neglected to attend to them.\\n\\nHe has refused to pass other Laws for the accommodation of large districts of\\npeople, unless those people would relinquish the right of Representation in the\\nLegislature, a right inestimable to them and formidable to tyrants only. \\n\\nHe has called together legislative bodies at places unusual, uncomfortable, and distant\\nfrom the depository of their public Records, for the sole purpose of fatiguing them into\\ncompliance with his measures.\\n\",\n",
|
||||||
|
" 'He has dissolved Representative Houses repeatedly, for opposing with manlyfirmness his invasions on the rights of the people.He has refused for a long time, after such dissolutions, to cause others to beelected; whereby the Legislative powers, incapable of Annihilation, have returnedto the People at large for their exercise; the State remaining in the mean timeexposed to all the dangers of invasion from without, and convulsions within.He has endeavoured to prevent the population of these States; for that purposeobstructing the Laws for Naturalization of Foreigners; refusing to pass others toencourage their migrations hither, and raising the conditions of newAppropriations of Lands.He has obstructed the Administration of Justice, by refusing his Assent to Lawsfor establishing Judiciary powers.He has made Judges dependent on his Will alone, for the tenure of their offices,and the amount and payment of their salaries.He has erected a multitude of New Offices, and sent hither swarms of Officers toharrass our people, and eat out their substance.He has kept among us, in times of peace, Standing Armies without the Consent ofour legislatures.He has affected to render the Military independent of and superior to the Civil power.He has combined with others to subject us to a jurisdiction foreign to ourconstitution, and unacknowledged by our laws; giving his Assent to their Acts ofpretended Legislation:For Quartering large bodies of armed troops among us:For protecting them, by a mock Trial, from punishment for any Murders whichthey should commit on the Inhabitants of these States:For cutting off our Trade with all parts of the world:For imposing Taxes on us without our Consent: For depriving us in many cases,of the benefits of Trial by Jury:For transporting us beyond Seas to be tried for pretended offencesFor abolishing the free System of English Laws in a neighbouring Province,establishing therein an Arbitrary government, and enlarging its Boundaries so as',\n",
|
||||||
|
" 'to render it at once an example and fit instrument for introducing the sameabsolute rule into these Colonies:For taking away our Charters, abolishing our most valuable Laws, and alteringfundamentally the Forms of our Governments:For suspending our own Legislatures, and declaring themselves invested withpower to legislate for us in all cases whatsoever.He has abdicated Government here, by declaring us out of his Protection andwaging War against us.He has plundered our seas, ravaged our Coasts, burnt our towns, and destroyed thelives of our people.He is at this time transporting large Armies of foreign Mercenaries to compleatthe works of death, desolation and tyranny, already begun with circumstances ofCruelty & perfidy scarcely paralleled in the most barbarous ages, and totallyunworthy of the Head of a civilized nation.He has constrained our fellow Citizens taken Captive on the high Seas to bearArms against their Country, to become the executioners of their friends and\\nBrethren, or to fall themselves by their Hands.He has excited domestic insurrections amongst us, and has endeavoured to bringon the inhabitants of our frontiers, the merciless Indian Savages, whose known\\nrule of warfare, is an undistinguished destruction of all ages, sexes and conditions. In every stage of these Oppressions We have Petitioned for Redress in the most humble terms:Our repeated Petitions have been answered only by repeated injury. A Prince whose character isthus marked by every act which may define a Tyrant, is unfit to be the ruler of a free people. Nor have We been wanting in attentions to our Brittish brethren. We have warned them fromtime to time of attempts by their legislature to extend an unwarrantable jurisdiction over us. Wehave reminded them of the circumstances of our emigration and settlement here. We haveappealed to their native justice and magnanimity, and we have conjured them by the ties of ourcommon kindred to disavow these usurpations, which, would inevitably interrupt ourconnections and correspondence. They too have been deaf to the voice of justice and ofconsanguinity. We must, therefore, acquiesce in the necessity, which denounces our Separation,and hold them, as we hold the rest of mankind, Enemies in War, in Peace Friends. We, therefore, the Representatives of the united States of America, in General Congress,Assembled, appealing to the Supreme Judge of the world for the rectitude of our intentions, do,in the Name, and by Authority of the good People of these Colonies, solemnly publish anddeclare, That these United Colonies are, and of Right ought to be Free and Independent States;that they are Absolved from all Allegiance to the British Crown, and that all political connection',\n",
|
||||||
|
" 'between them and the State of Great Britain, is and ought to be totally dissolved; and that as Free\\n\\nand Independent States, they have full Power to levy War, conclude Peace, contract Alliances,\\nestablish Commerce, and to do all other Acts and Things which Independent States may of right\\n\\ndo. And for the support of this Declaration, with a firm reliance on the protection of divine\\nProvidence, we mutually pledge to each other our Lives, our Fortunes and our sacred Honor.\\n\\n[The 56 signatures on the Declaration were arranged in six columns:\\n] [Column 1]\\n Georgia: Button Gwinnett\\n Lyman \\nHall George Walton \\n[Column 2]\\n North Carolina: William Hooper\\n Joseph Hewes\\n John Penn\\n South Carolina: Edward Ru\\ntledge Thomas Heyward, Jr.\\n Thomas Lynch, Jr.\\n Arthur Middleton \\n[Column 3]\\n Massachusetts: John Hancock\\n\\n Maryland: Samuel Chase\\n\\n William Paca\\n\\n Thomas Stone\\n\\n Charles Carroll of Carrollton\\n\\n Virginia: George Wythe\\n\\n Richard Henry Lee\\n\\n Thomas Jefferson\\n\\n Benjamin Harrison\\n\\n Thomas Nelson, Jr.\\n\\n Francis Lightfoot Lee\\n\\n Carter Braxton \\n\\n[Column 4]\\n Pennsylvania: Robert Morris\\n\\n Benjamin Rush\\n Benjamin Fran\\nklin John Morton\\n',\n",
|
||||||
|
" ' George Clymer\\n James Smith\\n George Taylor\\n James Wilson\\n George Ross\\n Delaware: Caesar Rodney\\n George Read\\n Thomas McKean \\n[Column 5]\\n New York: Wi\\nlliam Floyd Philip Livingston\\n Francis L\\newis Lewis Morris\\n New Jersey: Richard Stockton\\n John Witherspoon\\n Francis Hopkinson\\n John Hart\\n Abraham Clark \\n[Column 6]\\n New Hampshire: Josiah Bartlett\\n William Whipple\\n Massachusetts: Samuel Adams\\n John Adams\\n Robert Treat Paine\\n Elbridge Gerry\\n Rhode Island: Stephen Hopkins\\n William Ellery\\n Connecticut: Roger Sherman\\n Samuel Huntington\\n William Williams\\n Oliver Wolcott\\n New Hampshire: Matthew Thornton\\n ']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 17,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"pdf_text"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 21,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"He has dissolved Representative Houses repeatedly, for opposing with manlyfirmness his invasions on the rights of the people.He has refused for a long time, after such dissolutions, to cause others to beelected; whereby the Legislative powers, incapable of Annihilation, have returnedto the People at large for their exercise; the State remaining in the mean timeexposed to all the dangers of invasion from without, and convulsions within.He has endeavoured to prevent the population of these States; for that purposeobstructing the Laws for Naturalization of Foreigners; refusing to pass others toencourage their migrations hither, and raising the conditions of newAppropriations of Lands.He has obstructed the Administration of Justice, by refusing his Assent to Lawsfor establishing Judiciary powers.He has made Judges dependent on his Will alone, for the tenure of their offices,and the amount and payment of their salaries.He has erected a multitude of New Offices, and sent hither swarms of Officers toharrass our people, and eat out their substance.He has kept among us, in times of peace, Standing Armies without the Consent ofour legislatures.He has affected to render the Military independent of and superior to the Civil power.He has combined with others to subject us to a jurisdiction foreign to ourconstitution, and unacknowledged by our laws; giving his Assent to their Acts ofpretended Legislation:For Quartering large bodies of armed troops among us:For protecting them, by a mock Trial, from punishment for any Murders whichthey should commit on the Inhabitants of these States:For cutting off our Trade with all parts of the world:For imposing Taxes on us without our Consent: For depriving us in many cases,of the benefits of Trial by Jury:For transporting us beyond Seas to be tried for pretended offencesFor abolishing the free System of English Laws in a neighbouring Province,establishing therein an Arbitrary government, and enlarging its Boundaries so as\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(pdf_text[2])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Excellent work! \n",
|
||||||
|
"That is all for PyPDF2 for now, remember that this won't work with every PDF file and is limited in its scope to only the text of PDFs.\n",
|
||||||
|
"## Next up: Regular Expressions"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
File diff suppressed because it is too large
@ -0,0 +1,293 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"# Python Text Basics Assessment\n",
|
||||||
|
"\n",
|
||||||
|
"Welcome to your assessment! Complete the tasks described in bold below by typing the relevant code in the cells.<br>\n",
|
||||||
|
"You can compare your answers to the Solutions notebook provided in this folder."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## f-Strings\n",
|
||||||
|
"#### 1. Print an f-string that displays `NLP stands for Natural Language Processing` using the variables provided."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"NLP stands for Natural Language Processing\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"abbr = 'NLP'\n",
|
||||||
|
"full_text = 'Natural Language Processing'\n",
|
||||||
|
"\n",
|
||||||
|
"# Enter your code here:\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Files\n",
|
||||||
|
"#### 2. Create a file in the current working directory called `contacts.txt` by running the cell below:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Overwriting contacts.txt\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"%%writefile contacts.txt\n",
|
||||||
|
"First_Name Last_Name, Title, Extension, Email"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"#### 3. Open the file and use .read() to save the contents of the file to a string called `fields`. Make sure the file is closed at the end."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'First_Name Last_Name, Title, Extension, Email'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Write your code here:\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
" \n",
|
||||||
|
"# Run fields to see the contents of contacts.txt:\n",
|
||||||
|
"fields"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Working with PDF Files\n",
|
||||||
|
"#### 4. Use PyPDF2 to open the file `Business_Proposal.pdf`. Extract the text of page 2."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"AUTHORS:\n",
|
||||||
|
" \n",
|
||||||
|
"Amy Baker, Finance Chair, x345, abaker@ourcompany.com\n",
|
||||||
|
" \n",
|
||||||
|
"Chris Donaldson, Accounting Dir., x621, cdonaldson@ourcompany.com\n",
|
||||||
|
" \n",
|
||||||
|
"Erin Freeman, Sr. VP, x879, efreeman@ourcompany.com\n",
|
||||||
|
" \n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Perform import\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"# Open the file as a binary object\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"# Use PyPDF2 to read the text of the file\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"# Get the text from page 2 (CHALLENGE: Do this in one step!)\n",
|
||||||
|
"page_two_text = \n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"# Close the file\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"# Print the contents of page_two_text\n",
|
||||||
|
"print(page_two_text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"#### 5. Open the file `contacts.txt` in append mode. Add the text of page 2 from above to `contacts.txt`.\n",
|
||||||
|
"\n",
|
||||||
|
"#### CHALLENGE: See if you can remove the word \"AUTHORS:\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"First_Name Last_Name, Title, Extension, EmailAUTHORS:\n",
|
||||||
|
" \n",
|
||||||
|
"Amy Baker, Finance Chair, x345, abaker@ourcompany.com\n",
|
||||||
|
" \n",
|
||||||
|
"Chris Donaldson, Accounting Dir., x621, cdonaldson@ourcompany.com\n",
|
||||||
|
" \n",
|
||||||
|
"Erin Freeman, Sr. VP, x879, efreeman@ourcompany.com\n",
|
||||||
|
" \n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Simple Solution:\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"First_Name Last_Name, Title, Extension, Email\n",
|
||||||
|
" \n",
|
||||||
|
"Amy Baker, Finance Chair, x345, abaker@ourcompany.com\n",
|
||||||
|
" \n",
|
||||||
|
"Chris Donaldson, Accounting Dir., x621, cdonaldson@ourcompany.com\n",
|
||||||
|
" \n",
|
||||||
|
"Erin Freeman, Sr. VP, x879, efreeman@ourcompany.com\n",
|
||||||
|
" \n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# CHALLENGE Solution (re-run the %%writefile cell above to obtain an unmodified contacts.txt file):\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Regular Expressions\n",
|
||||||
|
"#### 6. Using the `page_two_text` variable created above, extract any email addresses that were contained in the file `Business_Proposal.pdf`."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['abaker@ourcompany.com',\n",
|
||||||
|
" 'cdonaldson@ourcompany.com',\n",
|
||||||
|
" 'efreeman@ourcompany.com']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import re\n",
|
||||||
|
"\n",
|
||||||
|
"# Enter your regex pattern here. This may take several tries!\n",
|
||||||
|
"pattern = \n",
|
||||||
|
"\n",
|
||||||
|
"re.findall(pattern, page_two_text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Great job!"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
@ -0,0 +1,295 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"# Python Text Basics Assessment - Solutions\n",
|
||||||
|
"\n",
|
||||||
|
"Welcome to your assessment! Complete the tasks described in bold below by typing the relevant code in the cells."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## f-Strings\n",
|
||||||
|
"#### 1. Print an f-string that displays `NLP stands for Natural Language Processing` using the variables provided."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"NLP stands for Natural Language Processing\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"abbr = 'NLP'\n",
|
||||||
|
"full_text = 'Natural Language Processing'\n",
|
||||||
|
"\n",
|
||||||
|
"# Enter your code here:\n",
|
||||||
|
"print(f'{abbr} stands for {full_text}')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Files\n",
|
||||||
|
"#### 2. Create a file in the current working directory called `contacts.txt` by running the cell below:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Overwriting contacts.txt\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"%%writefile contacts.txt\n",
|
||||||
|
"First_Name Last_Name, Title, Extension, Email"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"#### 3. Open the file and use .read() to save the contents of the file to a string called `fields`. Make sure the file is closed at the end."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'First_Name Last_Name, Title, Extension, Email'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Write your code here:\n",
|
||||||
|
"with open('contacts.txt') as c:\n",
|
||||||
|
" fields = c.read()\n",
|
||||||
|
"\n",
|
||||||
|
" \n",
|
||||||
|
"# Run fields to see the contents of contacts.txt:\n",
|
||||||
|
"fields"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Working with PDF Files\n",
|
||||||
|
"#### 4. Use PyPDF2 to open the file `Business_Proposal.pdf`. Extract the text of page 2."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"AUTHORS:\n",
|
||||||
|
" \n",
|
||||||
|
"Amy Baker, Finance Chair, x345, abaker@ourcompany.com\n",
|
||||||
|
" \n",
|
||||||
|
"Chris Donaldson, Accounting Dir., x621, cdonaldson@ourcompany.com\n",
|
||||||
|
" \n",
|
||||||
|
"Erin Freeman, Sr. VP, x879, efreeman@ourcompany.com\n",
|
||||||
|
" \n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Perform import\n",
|
||||||
|
"import PyPDF2\n",
|
||||||
|
"\n",
|
||||||
|
"# Open the file as a binary object\n",
|
||||||
|
"f = open('Business_Proposal.pdf','rb')\n",
|
||||||
|
"\n",
|
||||||
|
"# Use PyPDF2 to read the text of the file\n",
|
||||||
|
"pdf_reader = PyPDF2.PdfFileReader(f)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"# Get the text from page 2 (CHALLENGE: Do this in one step!)\n",
|
||||||
|
"page_two_text = pdf_reader.getPage(1).extractText()\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"# Close the file\n",
|
||||||
|
"f.close()\n",
|
||||||
|
"\n",
|
||||||
|
"# Print the contents of page_two_text\n",
|
||||||
|
"print(page_two_text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"#### 5. Open the file `contacts.txt` in append mode. Add the text of page 2 from above to `contacts.txt`.\n",
|
||||||
|
"\n",
|
||||||
|
"#### CHALLENGE: See if you can remove the word \"AUTHORS:\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"First_Name Last_Name, Title, Extension, EmailAUTHORS:\n",
|
||||||
|
" \n",
|
||||||
|
"Amy Baker, Finance Chair, x345, abaker@ourcompany.com\n",
|
||||||
|
" \n",
|
||||||
|
"Chris Donaldson, Accounting Dir., x621, cdonaldson@ourcompany.com\n",
|
||||||
|
" \n",
|
||||||
|
"Erin Freeman, Sr. VP, x879, efreeman@ourcompany.com\n",
|
||||||
|
" \n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Simple Solution:\n",
|
||||||
|
"with open('contacts.txt','a+') as c:\n",
|
||||||
|
" c.write(page_two_text)\n",
|
||||||
|
" c.seek(0)\n",
|
||||||
|
" print(c.read())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"First_Name Last_Name, Title, Extension, Email\n",
|
||||||
|
" \n",
|
||||||
|
"Amy Baker, Finance Chair, x345, abaker@ourcompany.com\n",
|
||||||
|
" \n",
|
||||||
|
"Chris Donaldson, Accounting Dir., x621, cdonaldson@ourcompany.com\n",
|
||||||
|
" \n",
|
||||||
|
"Erin Freeman, Sr. VP, x879, efreeman@ourcompany.com\n",
|
||||||
|
" \n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# CHALLENGE Solution (re-run the %%writefile cell above to obtain an unmodified contacts.txt file):\n",
|
||||||
|
"with open('contacts.txt','a+') as c:\n",
|
||||||
|
" c.write(page_two_text[8:])\n",
|
||||||
|
" c.seek(0)\n",
|
||||||
|
" print(c.read())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Regular Expressions\n",
|
||||||
|
"#### 6. Using the `page_two_text` variable created above, extract any email addresses that were contained in the file `Business_Proposal.pdf`."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['abaker@ourcompany.com',\n",
|
||||||
|
" 'cdonaldson@ourcompany.com',\n",
|
||||||
|
" 'efreeman@ourcompany.com']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import re\n",
|
||||||
|
"\n",
|
||||||
|
"# Enter your regex pattern here. This may take several tries!\n",
|
||||||
|
"pattern = r'\\w+@\\w+.\\w{3}'\n",
|
||||||
|
"\n",
|
||||||
|
"re.findall(pattern, page_two_text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Great job!"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
Binary file not shown.
1876
Praktikum Python Code/00-Python-Text-Basics/Some_New_Doc.pdf
Normal file
File diff suppressed because it is too large
4596
Praktikum Python Code/00-Python-Text-Basics/US_Declaration.pdf
Normal file
File diff suppressed because it is too large
2
Praktikum Python Code/00-Python-Text-Basics/test.txt
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
Hello, this is a quick test file.
|
||||||
|
This is the second line of the file.
|
||||||
@ -0,0 +1,656 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# spaCy Basics\n",
|
||||||
|
"\n",
|
||||||
|
"**spaCy** (https://spacy.io/) is an open-source Python library that parses and \"understands\" large volumes of text. Separate models are available that cater to specific languages (English, French, German, etc.).\n",
|
||||||
|
"\n",
|
||||||
|
"In this section we'll install and setup spaCy to work with Python, and then introduce some concepts related to Natural Language Processing."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Installation and Setup\n",
|
||||||
|
"\n",
|
||||||
|
"Installation is a two-step process. First, install spaCy using either conda or pip. Next, download the specific model you want, based on language.<br> For more info visit https://spacy.io/usage/\n",
|
||||||
|
"\n",
|
||||||
|
"### 1. From the command line or terminal:\n",
|
||||||
|
"> `conda install -c conda-forge spacy`\n",
|
||||||
|
"> <br>*or*<br>\n",
|
||||||
|
"> `pip install -U spacy`\n",
|
||||||
|
"\n",
|
||||||
|
"> ### Alternatively you can create a virtual environment:\n",
|
||||||
|
"> `conda create -n spacyenv python=3 spacy=2`\n",
|
||||||
|
"\n",
|
||||||
|
"### 2. Next, also from the command line (you must run this as admin or use sudo):\n",
|
||||||
|
"\n",
|
||||||
|
"> `python -m spacy download en`\n",
|
||||||
|
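"> *(On spaCy 3.x the `en` shortcut has been removed; download the full model name instead, e.g. `python -m spacy download en_core_web_sm`, then load it with `spacy.load('en_core_web_sm')` as the cells below do.)*\n",
|
||||||
|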
"\n",
|
||||||
|
"> ### If successful, you should see a message like:\n",
|
||||||
|
"\n",
|
||||||
|
"> **`Linking successful`**<br>\n",
|
||||||
|
"> ` C:\\Anaconda3\\envs\\spacyenv\\lib\\site-packages\\en_core_web_sm -->`<br>\n",
|
||||||
|
"> ` C:\\Anaconda3\\envs\\spacyenv\\lib\\site-packages\\spacy\\data\\en`<br>\n",
|
||||||
|
"> ` `<br>\n",
|
||||||
|
"> ` You can now load the model via spacy.load('en')`\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Working with spaCy in Python\n",
|
||||||
|
"\n",
|
||||||
|
"This is a typical set of instructions for importing and working with spaCy. Don't be surprised if this takes awhile - spaCy has a fairly large library to load:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Tesla PROPN nsubj\n",
|
||||||
|
"is VERB aux\n",
|
||||||
|
"looking VERB ROOT\n",
|
||||||
|
"at ADP prep\n",
|
||||||
|
"buying VERB pcomp\n",
|
||||||
|
"U.S. PROPN compound\n",
|
||||||
|
"startup NOUN dobj\n",
|
||||||
|
"for ADP prep\n",
|
||||||
|
"$ SYM quantmod\n",
|
||||||
|
"6 NUM compound\n",
|
||||||
|
"million NUM pobj\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Import spaCy and load the language library\n",
|
||||||
|
"import spacy\n",
|
||||||
|
"nlp = spacy.load('en_core_web_sm')\n",
|
||||||
|
"\n",
|
||||||
|
"# Create a Doc object\n",
|
||||||
|
"doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')\n",
|
||||||
|
"\n",
|
||||||
|
"# Print each token separately\n",
|
||||||
|
"for token in doc:\n",
|
||||||
|
" print(token.text, token.pos_, token.dep_)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"This doesn't look very user-friendly, but right away we see some interesting things happen:\n",
|
||||||
|
"1. Tesla is recognized to be a Proper Noun, not just a word at the start of a sentence\n",
|
||||||
|
"2. U.S. is kept together as one entity (we call this a 'token')\n",
|
||||||
|
"\n",
|
||||||
|
"As we dive deeper into spaCy we'll see what each of these abbreviations mean and how they're derived. We'll also see how spaCy can interpret the last three tokens combined `$6 million` as referring to ***money***."
|
||||||
|
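"\n",
|
||||||
|
"As a quick preview (named entities are covered properly in a later section), the entities spaCy recognizes are available in `doc.ents`:\n",
|
||||||
|
"\n",
|
||||||
|
"    # minimal sketch using the doc created above; exact labels depend on the model\n",
|
||||||
|
"    for ent in doc.ents:\n",
|
||||||
|
"        print(ent.text, ent.label_)   # e.g. Tesla ORG, U.S. GPE, $6 million MONEY\n"
|
||||||
|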
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"# spaCy Objects\n",
|
||||||
|
"\n",
|
||||||
|
"After importing the spacy module in the cell above we loaded a **model** and named it `nlp`.<br>Next we created a **Doc** object by applying the model to our text, and named it `doc`.<br>spaCy also builds a companion **Vocab** object that we'll cover in later sections.<br>The **Doc** object that holds the processed text is our focus here."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"# Pipeline\n",
|
||||||
|
"When we run `nlp`, our text enters a *processing pipeline* that first breaks down the text and then performs a series of operations to tag, parse and describe the data. Image source: https://spacy.io/usage/spacy-101#pipelines"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<img src=\"../pipeline1.png\" width=\"600\">"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"We can check to see what components currently live in the pipeline. In later sections we'll learn how to disable components and add new ones as needed."
|
||||||
|
]
|
||||||
|
},
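{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick illustrative sketch of disabling components (covered properly later): `spacy.load()` accepts a `disable` list that skips those pipeline components entirely. The component names left over depend on the model and spaCy version."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Minimal sketch: load the model without the parser and NER components,\n",
"# which speeds things up when only tokenization/tagging is needed.\n",
"nlp_light = spacy.load('en_core_web_sm', disable=['parser', 'ner'])\n",
"nlp_light.pipe_names   # only the components that were kept, e.g. ['tagger']"
]
},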
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[('tagger', <spacy.pipeline.Tagger at 0x237cb1e8f98>),\n",
|
||||||
|
" ('parser', <spacy.pipeline.DependencyParser at 0x237cb2852b0>),\n",
|
||||||
|
" ('ner', <spacy.pipeline.EntityRecognizer at 0x237cb285360>)]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"nlp.pipeline"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['tagger', 'parser', 'ner']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"nlp.pipe_names"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"## Tokenization\n",
|
||||||
|
"The first step in processing text is to split up all the component parts (words & punctuation) into \"tokens\". These tokens are annotated inside the Doc object to contain descriptive information. We'll go into much more detail on tokenization in an upcoming lecture. For now, let's look at another example:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Tesla PROPN nsubj\n",
|
||||||
|
"is VERB aux\n",
|
||||||
|
"n't ADV neg\n",
|
||||||
|
" SPACE \n",
|
||||||
|
"looking VERB ROOT\n",
|
||||||
|
"into ADP prep\n",
|
||||||
|
"startups NOUN pobj\n",
|
||||||
|
"anymore ADV advmod\n",
|
||||||
|
". PUNCT punct\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc2 = nlp(u\"Tesla isn't looking into startups anymore.\")\n",
|
||||||
|
"\n",
|
||||||
|
"for token in doc2:\n",
|
||||||
|
" print(token.text, token.pos_, token.dep_)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Notice how `isn't` has been split into two tokens. spaCy recognizes both the root verb `is` and the negation attached to it. Notice also that both the extended whitespace and the period at the end of the sentence are assigned their own tokens.\n",
|
||||||
|
"\n",
|
||||||
|
"It's important to note that even though `doc2` contains processed information about each token, it also retains the original text:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"Tesla isn't looking into startups anymore."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc2"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"Tesla"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc2[0]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"spacy.tokens.doc.Doc"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"type(doc2)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"## Part-of-Speech Tagging (POS)\n",
|
||||||
|
"The next step after splitting the text up into tokens is to assign parts of speech. In the above example, `Tesla` was recognized to be a ***proper noun***. Here some statistical modeling is required. For example, words that follow \"the\" are typically nouns.\n",
|
||||||
|
"\n",
|
||||||
|
"For a full list of POS Tags visit https://spacy.io/api/annotation#pos-tagging"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'PROPN'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc2[0].pos_"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"## Dependencies\n",
|
||||||
|
"We also looked at the syntactic dependencies assigned to each token. `Tesla` is identified as an `nsubj` or the ***nominal subject*** of the sentence.\n",
|
||||||
|
"\n",
|
||||||
|
"For a full list of Syntactic Dependencies visit https://spacy.io/api/annotation#dependency-parsing\n",
|
||||||
|
"<br>A good explanation of typed dependencies can be found [here](https://nlp.stanford.edu/software/dependencies_manual.pdf)"
|
||||||
|
]
|
||||||
|
},
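{
"cell_type": "markdown",
"metadata": {},
"source": [
"To make the dependency labels more concrete, every token also exposes its syntactic head through `token.head`; a small sketch printing it next to the label shows what each dependency attaches to."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Show what each dependency label attaches to by printing the token's head:\n",
"for token in doc2:\n",
"    print(f'{token.text:10} {token.dep_:10} --> {token.head.text}')"
]
},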
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'nsubj'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc2[0].dep_"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"To see the full name of a tag use `spacy.explain(tag)`"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'proper noun'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"spacy.explain('PROPN')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'nominal subject'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"spacy.explain('nsubj')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"## Additional Token Attributes\n",
|
||||||
|
"We'll see these again in upcoming lectures. For now we just want to illustrate some of the other information that spaCy assigns to tokens:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"|Tag|Description|doc2[0].tag|\n",
|
||||||
|
"|:------|:------:|:------|\n",
|
||||||
|
"|`.text`|The original word text<!-- .element: style=\"text-align:left;\" -->|`Tesla`|\n",
|
||||||
|
"|`.lemma_`|The base form of the word|`tesla`|\n",
|
||||||
|
"|`.pos_`|The simple part-of-speech tag|`PROPN`/`proper noun`|\n",
|
||||||
|
"|`.tag_`|The detailed part-of-speech tag|`NNP`/`noun, proper singular`|\n",
|
||||||
|
"|`.shape_`|The word shape – capitalization, punctuation, digits|`Xxxxx`|\n",
|
||||||
|
"|`.is_alpha`|Is the token an alpha character?|`True`|\n",
|
||||||
|
"|`.is_stop`|Is the token part of a stop list, i.e. the most common words of the language?|`False`|"
|
||||||
|
]
|
||||||
|
},
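{
"cell_type": "markdown",
"metadata": {},
"source": [
"A compact sketch pulling all of the attributes in the table above for every token in one loop:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Print the table's attributes for each token in doc2:\n",
"for token in doc2:\n",
"    print(token.text, token.lemma_, token.pos_, token.tag_, token.shape_, token.is_alpha, token.is_stop)"
]
},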
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"looking\n",
|
||||||
|
"look\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Lemmas (the base form of the word):\n",
|
||||||
|
"print(doc2[4].text)\n",
|
||||||
|
"print(doc2[4].lemma_)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"VERB\n",
|
||||||
|
"VBG / verb, gerund or present participle\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Simple Parts-of-Speech & Detailed Tags:\n",
|
||||||
|
"print(doc2[4].pos_)\n",
|
||||||
|
"print(doc2[4].tag_ + ' / ' + spacy.explain(doc2[4].tag_))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Tesla: Xxxxx\n",
|
||||||
|
"U.S. : X.X.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Word Shapes:\n",
|
||||||
|
"print(doc2[0].text+': '+doc2[0].shape_)\n",
|
||||||
|
"print(doc[5].text+' : '+doc[5].shape_)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 15,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"True\n",
|
||||||
|
"False\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Boolean Values:\n",
|
||||||
|
"print(doc2[0].is_alpha)\n",
|
||||||
|
"print(doc2[0].is_stop)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"## Spans\n",
|
||||||
|
"Large Doc objects can be hard to work with at times. A **span** is a slice of Doc object in the form `Doc[start:stop]`."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 16,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"doc3 = nlp(u'Although commmonly attributed to John Lennon from his song \"Beautiful Boy\", \\\n",
|
||||||
|
"the phrase \"Life is what happens to us while we are making other plans\" was written by \\\n",
|
||||||
|
"cartoonist Allen Saunders and published in Reader\\'s Digest in 1957, when Lennon was 17.')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 17,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\"Life is what happens to us while we are making other plans\"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"life_quote = doc3[16:30]\n",
|
||||||
|
"print(life_quote)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 18,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"spacy.tokens.span.Span"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 18,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"type(life_quote)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"In upcoming lectures we'll see how to create Span objects using `Span()`. This will allow us to assign additional information to the Span."
|
||||||
|
]
|
||||||
|
},
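{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a brief preview of that later lecture, a `Span` can also be built directly with `Span(doc, start, end)`, where `start` and `end` are token indices; the example span below is purely illustrative."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from spacy.tokens import Span\n",
"\n",
"# Tokens 4 and 5 of doc3 are 'John' and 'Lennon':\n",
"new_span = Span(doc3, 4, 6)\n",
"print(new_span.text, type(new_span))"
]
},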
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"## Sentences\n",
|
||||||
|
"Certain tokens inside a Doc object may also receive a \"start of sentence\" tag. While this doesn't immediately build a list of sentences, these tags enable the generation of sentence segments through `Doc.sents`. Later we'll write our own segmentation rules."
|
||||||
|
]
|
||||||
|
},
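{
"cell_type": "markdown",
"metadata": {},
"source": [
"One practical note: `Doc.sents` is a generator, so it cannot be indexed directly; build a list first if you need random access to the sentences. A minimal sketch (the short text is illustrative):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"doc_tmp = nlp(u'One sentence here. Another sentence here.')\n",
"\n",
"# doc_tmp.sents[0] would raise a TypeError - convert to a list first:\n",
"sentences = list(doc_tmp.sents)\n",
"print(len(sentences), '|', sentences[0].text)"
]
},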
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 19,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"doc4 = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 20,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"This is the first sentence.\n",
|
||||||
|
"This is another sentence.\n",
|
||||||
|
"This is the last sentence.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for sent in doc4.sents:\n",
|
||||||
|
" print(sent)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 21,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"True"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 21,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc4[6].is_sent_start"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Next up: Tokenization"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.7"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
@ -0,0 +1,888 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Tokenization\n",
|
||||||
|
"The first step in creating a `Doc` object is to break down the incoming text into component pieces or \"tokens\"."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Import spaCy and load the language library\n",
|
||||||
|
"import spacy\n",
|
||||||
|
"nlp = spacy.load('en_core_web_sm')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\"We're moving to L.A.!\"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Create a string that includes opening and closing quotation marks\n",
|
||||||
|
"mystring = '\"We\\'re moving to L.A.!\"'\n",
|
||||||
|
"print(mystring)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\" | We | 're | moving | to | L.A. | ! | \" | "
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Create a Doc object and explore tokens\n",
|
||||||
|
"doc = nlp(mystring)\n",
|
||||||
|
"\n",
|
||||||
|
"for token in doc:\n",
|
||||||
|
" print(token.text, end=' | ')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<img src=\"../tokenization.png\" width=\"600\">"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"- **Prefix**:\tCharacter(s) at the beginning ▸ `$ ( “ ¿`\n",
|
||||||
|
"- **Suffix**:\tCharacter(s) at the end ▸ `km ) , . ! ”`\n",
|
||||||
|
"- **Infix**:\tCharacter(s) in between ▸ `- -- / ...`\n",
|
||||||
|
"- **Exception**: Special-case rule to split a string into several tokens or prevent a token from being split when punctuation rules are applied ▸ `St. U.S.`"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Notice that tokens are pieces of the original text. That is, we don't see any conversion to word stems or lemmas (base forms of words) and we haven't seen anything about organizations/places/money etc. Tokens are the basic building blocks of a Doc object - everything that helps us understand the meaning of the text is derived from tokens and their relationship to one another."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Prefixes, Suffixes and Infixes\n",
|
||||||
|
"spaCy will isolate punctuation that does *not* form an integral part of a word. Quotation marks, commas, and punctuation at the end of a sentence will be assigned their own token. However, punctuation that exists as part of an email address, website or numerical value will be kept as part of the token."
|
||||||
|
]
|
||||||
|
},
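{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch, assuming spaCy v2.3 or newer (possibly newer than the version used in this course): `nlp.tokenizer.explain()` reports which prefix, suffix, infix or special-case rule produced each token, which is handy for debugging tokenization."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Each item is a (rule_name, token_text) pair:\n",
"for rule, text in nlp.tokenizer.explain(u'A 5km ride to St. Louis!'):\n",
"    print(rule, text)"
]
},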
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"We\n",
|
||||||
|
"'re\n",
|
||||||
|
"here\n",
|
||||||
|
"to\n",
|
||||||
|
"help\n",
|
||||||
|
"!\n",
|
||||||
|
"Send\n",
|
||||||
|
"snail\n",
|
||||||
|
"-\n",
|
||||||
|
"mail\n",
|
||||||
|
",\n",
|
||||||
|
"email\n",
|
||||||
|
"support@oursite.com\n",
|
||||||
|
"or\n",
|
||||||
|
"visit\n",
|
||||||
|
"us\n",
|
||||||
|
"at\n",
|
||||||
|
"http://www.oursite.com\n",
|
||||||
|
"!\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc2 = nlp(u\"We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com!\")\n",
|
||||||
|
"\n",
|
||||||
|
"for t in doc2:\n",
|
||||||
|
" print(t)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>Note that the exclamation points, comma, and the hyphen in 'snail-mail' are assigned their own tokens, yet both the email address and website are preserved.</font>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"A\n",
|
||||||
|
"5\n",
|
||||||
|
"km\n",
|
||||||
|
"NYC\n",
|
||||||
|
"cab\n",
|
||||||
|
"ride\n",
|
||||||
|
"costs\n",
|
||||||
|
"$\n",
|
||||||
|
"10.30\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc3 = nlp(u'A 5km NYC cab ride costs $10.30')\n",
|
||||||
|
"\n",
|
||||||
|
"for t in doc3:\n",
|
||||||
|
" print(t)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>Here the distance unit and dollar sign are assigned their own tokens, yet the dollar amount is preserved.</font>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Exceptions\n",
|
||||||
|
"Punctuation that exists as part of a known abbreviation will be kept as part of the token."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Let\n",
|
||||||
|
"'s\n",
|
||||||
|
"visit\n",
|
||||||
|
"St.\n",
|
||||||
|
"Louis\n",
|
||||||
|
"in\n",
|
||||||
|
"the\n",
|
||||||
|
"U.S.\n",
|
||||||
|
"next\n",
|
||||||
|
"year\n",
|
||||||
|
".\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc4 = nlp(u\"Let's visit St. Louis in the U.S. next year.\")\n",
|
||||||
|
"\n",
|
||||||
|
"for t in doc4:\n",
|
||||||
|
" print(t)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>Here the abbreviations for \"Saint\" and \"United States\" are both preserved.</font>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Counting Tokens\n",
|
||||||
|
"`Doc` objects have a set number of tokens:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"8"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"len(doc)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Counting Vocab Entries\n",
|
||||||
|
"`Vocab` objects contain a full library of items!"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"57852"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"len(doc.vocab)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>NOTE: This number changes based on the language library loaded at the start, and any new lexemes introduced to the `vocab` when the `Doc` was created.</font>"
|
||||||
|
]
|
||||||
|
},
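{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small sketch of the growth described above: process a word the model has not stored yet and the lexeme count goes up. The exact numbers vary by model version."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"before = len(nlp.vocab)\n",
"nlp(u'flibbertigibbet')   # an unusual word introduces a new lexeme\n",
"print(before, '->', len(nlp.vocab))"
]
},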
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Tokens can be retrieved by index position and slice\n",
|
||||||
|
"`Doc` objects can be thought of as lists of `token` objects. As such, individual tokens can be retrieved by index position, and spans of tokens can be retrieved through slicing:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"better"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc5 = nlp(u'It is better to give than to receive.')\n",
|
||||||
|
"\n",
|
||||||
|
"# Retrieve the third token:\n",
|
||||||
|
"doc5[2]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"better to give"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Retrieve three tokens from the middle:\n",
|
||||||
|
"doc5[2:5]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"than to receive."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Retrieve the last four tokens:\n",
|
||||||
|
"doc5[-4:]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Tokens cannot be reassigned\n",
|
||||||
|
"Although `Doc` objects can be considered lists of tokens, they do *not* support item reassignment:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"doc6 = nlp(u'My dinner was horrible.')\n",
|
||||||
|
"doc7 = nlp(u'Your dinner was delicious.')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"ename": "TypeError",
|
||||||
|
"evalue": "'spacy.tokens.doc.Doc' object does not support item assignment",
|
||||||
|
"output_type": "error",
|
||||||
|
"traceback": [
|
||||||
|
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||||
|
"\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
|
||||||
|
"\u001b[1;32m<ipython-input-13-d4fb8c39c40b>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m# Try to change \"My dinner was horrible\" to \"My dinner was delicious\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mdoc6\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m3\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdoc7\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m3\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
|
||||||
|
"\u001b[1;31mTypeError\u001b[0m: 'spacy.tokens.doc.Doc' object does not support item assignment"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Try to change \"My dinner was horrible\" to \"My dinner was delicious\"\n",
|
||||||
|
"doc6[3] = doc7[3]"
|
||||||
|
]
|
||||||
|
},
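{
"cell_type": "markdown",
"metadata": {},
"source": [
"Since tokens cannot be swapped in place, a common workaround (a sketch, not an official spaCy recipe) is to edit the underlying text and build a new `Doc`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Rebuild the text and re-run the pipeline to get a modified Doc:\n",
"new_text = doc6.text.replace('horrible', 'delicious')\n",
"doc6_fixed = nlp(new_text)\n",
"print(doc6_fixed)"
]
},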
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"# Named Entities\n",
|
||||||
|
"Going a step beyond tokens, *named entities* add another layer of context. The language model recognizes that certain words are organizational names while others are locations, and still other combinations relate to money, dates, etc. Named entities are accessible through the `ents` property of a `Doc` object."
|
||||||
|
]
|
||||||
|
},
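{
"cell_type": "markdown",
"metadata": {},
"source": [
"Each named entity is a `Span`, so besides `.text` and `.label_` it also carries token and character offsets - a quick sketch:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"doc_ner = nlp(u'Apple to build a Hong Kong factory for $6 million')\n",
"\n",
"for ent in doc_ner.ents:\n",
"    print(ent.text, ent.label_, ent.start, ent.end, ent.start_char, ent.end_char)"
]
},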
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | \n",
|
||||||
|
"----\n",
|
||||||
|
"Apple - ORG - Companies, agencies, institutions, etc.\n",
|
||||||
|
"Hong Kong - GPE - Countries, cities, states\n",
|
||||||
|
"$6 million - MONEY - Monetary values, including unit\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc8 = nlp(u'Apple to build a Hong Kong factory for $6 million')\n",
|
||||||
|
"\n",
|
||||||
|
"for token in doc8:\n",
|
||||||
|
" print(token.text, end=' | ')\n",
|
||||||
|
"\n",
|
||||||
|
"print('\\n----')\n",
|
||||||
|
"\n",
|
||||||
|
"for ent in doc8.ents:\n",
|
||||||
|
" print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_)))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>Note how two tokens combine to form the entity `Hong Kong`, and three tokens combine to form the monetary entity: `$6 million`</font>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 15,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"3"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 15,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"len(doc8.ents)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Named Entity Recognition (NER) is an important machine learning tool applied to Natural Language Processing.<br>We'll do a lot more with it in an upcoming section. For more info on **named entities** visit https://spacy.io/usage/linguistic-features#named-entities"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"---\n",
|
||||||
|
"# Noun Chunks\n",
|
||||||
|
"Similar to `Doc.ents`, `Doc.noun_chunks` are another object property. *Noun chunks* are \"base noun phrases\" – flat phrases that have a noun as their head. You can think of noun chunks as a noun plus the words describing the noun – for example, in [Sheb Wooley's 1958 song](https://en.wikipedia.org/wiki/The_Purple_People_Eater), a *\"one-eyed, one-horned, flying, purple people-eater\"* would be one long noun chunk."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 16,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Autonomous cars\n",
|
||||||
|
"insurance liability\n",
|
||||||
|
"manufacturers\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc9 = nlp(u\"Autonomous cars shift insurance liability toward manufacturers.\")\n",
|
||||||
|
"\n",
|
||||||
|
"for chunk in doc9.noun_chunks:\n",
|
||||||
|
" print(chunk.text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 17,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Red cars\n",
|
||||||
|
"higher insurance rates\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc10 = nlp(u\"Red cars do not carry higher insurance rates.\")\n",
|
||||||
|
"\n",
|
||||||
|
"for chunk in doc10.noun_chunks:\n",
|
||||||
|
" print(chunk.text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 18,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"He\n",
|
||||||
|
"a one-eyed, one-horned, flying, purple people-eater\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc11 = nlp(u\"He was a one-eyed, one-horned, flying, purple people-eater.\")\n",
|
||||||
|
"\n",
|
||||||
|
"for chunk in doc11.noun_chunks:\n",
|
||||||
|
" print(chunk.text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"We'll look at additional noun_chunks components besides `.text` in an upcoming section.<br>For more info on **noun_chunks** visit https://spacy.io/usage/linguistic-features#noun-chunks"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"# Built-in Visualizers\n",
|
||||||
|
"\n",
|
||||||
|
"spaCy includes a built-in visualization tool called **displaCy**. displaCy is able to detect whether you're working in a Jupyter notebook, and will return markup that can be rendered in a cell right away. When you export your notebook, the visualizations will be included as HTML.\n",
|
||||||
|
"\n",
|
||||||
|
"For more info visit https://spacy.io/usage/visualizers"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Visualizing the dependency parse\n",
|
||||||
|
"Run the cell below to import displacy and display the dependency graphic"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 19,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" id=\"0\" class=\"displacy\" width=\"1370\" height=\"357.0\" style=\"max-width: none; height: 357.0px; color: #000000; background: #ffffff; font-family: Arial\">\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"50\">Apple</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"50\">PROPN</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"160\">is</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"160\">VERB</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"270\">going</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"270\">VERB</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"380\">to</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"380\">PART</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"490\">build</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"490\">VERB</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"600\">a</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"600\">DET</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"710\">U.K.</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"710\">PROPN</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"820\">factory</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"820\">NOUN</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"930\">for</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"930\">ADP</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1040\">$</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1040\">SYM</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1150\">6</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1150\">NUM</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1260\">million.</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1260\">NUM</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-0\" stroke-width=\"2px\" d=\"M70,222.0 C70,112.0 260.0,112.0 260.0,222.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-0\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">nsubj</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M70,224.0 L62,212.0 78,212.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-1\" stroke-width=\"2px\" d=\"M180,222.0 C180,167.0 255.0,167.0 255.0,222.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-1\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">aux</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M180,224.0 L172,212.0 188,212.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-2\" stroke-width=\"2px\" d=\"M400,222.0 C400,167.0 475.0,167.0 475.0,222.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-2\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">aux</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M400,224.0 L392,212.0 408,212.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-3\" stroke-width=\"2px\" d=\"M290,222.0 C290,112.0 480.0,112.0 480.0,222.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-3\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">xcomp</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M480.0,224.0 L488.0,212.0 472.0,212.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-4\" stroke-width=\"2px\" d=\"M620,222.0 C620,112.0 810.0,112.0 810.0,222.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-4\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">det</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M620,224.0 L612,212.0 628,212.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-5\" stroke-width=\"2px\" d=\"M730,222.0 C730,167.0 805.0,167.0 805.0,222.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-5\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">compound</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M730,224.0 L722,212.0 738,212.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-6\" stroke-width=\"2px\" d=\"M510,222.0 C510,57.0 815.0,57.0 815.0,222.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-6\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">dobj</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M815.0,224.0 L823.0,212.0 807.0,212.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-7\" stroke-width=\"2px\" d=\"M510,222.0 C510,2.0 930.0,2.0 930.0,222.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-7\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">prep</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M930.0,224.0 L938.0,212.0 922.0,212.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-8\" stroke-width=\"2px\" d=\"M1060,222.0 C1060,112.0 1250.0,112.0 1250.0,222.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-8\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">quantmod</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M1060,224.0 L1052,212.0 1068,212.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-9\" stroke-width=\"2px\" d=\"M1170,222.0 C1170,167.0 1245.0,167.0 1245.0,222.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-9\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">compound</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M1170,224.0 L1162,212.0 1178,212.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-10\" stroke-width=\"2px\" d=\"M950,222.0 C950,57.0 1255.0,57.0 1255.0,222.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-10\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">pobj</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M1255.0,224.0 L1263.0,212.0 1247.0,212.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"</svg>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.HTML object>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from spacy import displacy\n",
|
||||||
|
"\n",
|
||||||
|
"doc = nlp(u'Apple is going to build a U.K. factory for $6 million.')\n",
|
||||||
|
"displacy.render(doc, style='dep', jupyter=True, options={'distance': 110})"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"The optional `'distance'` argument sets the distance between tokens. If the distance is made too small, text that appears beneath short arrows may become too compressed to read."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Visualizing the entity recognizer"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 20,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div class=\"entities\" style=\"line-height: 2.5\">Over \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #bfe1d9; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" the last quarter\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">DATE</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" Apple\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">ORG</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" sold \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" nearly 20 thousand\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">CARDINAL</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #bfeeb7; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" iPods\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PRODUCT</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" for a profit of \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" $6 million\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">MONEY</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
".</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.HTML object>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.')\n",
|
||||||
|
"displacy.render(doc, style='ent', jupyter=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"## Creating Visualizations Outside of Jupyter\n",
|
||||||
|
"If you're using another Python IDE or writing a script, you can choose to have spaCy serve up html separately:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 21,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\n",
|
||||||
|
" Serving on port 5000...\n",
|
||||||
|
" Using the 'dep' visualizer\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
" Shutting down server on port 5000.\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc = nlp(u'This is a sentence.')\n",
|
||||||
|
"displacy.serve(doc, style='dep')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=blue>**After running the cell above, click the link below to view the dependency parse**:</font>\n",
|
||||||
|
"\n",
|
||||||
|
"http://127.0.0.1:5000\n",
|
||||||
|
"<br><br>\n",
|
||||||
|
"<font color=red>**To shut down the server and return to jupyter**, interrupt the kernel either through the **Kernel** menu above, by hitting the black square on the toolbar, or by typing the keyboard shortcut `Esc`, `I`, `I`</font>"
|
||||||
|
]
|
||||||
|
},
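{
"cell_type": "markdown",
"metadata": {},
"source": [
"Instead of running a local server, `displacy.render()` can return the markup as a string (pass `jupyter=False`, and `page=True` for a full HTML page), which can be saved to a file and opened in a browser; the filename below is arbitrary:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"html = displacy.render(doc, style='dep', jupyter=False, page=True)\n",
"\n",
"with open('dep_parse.html', 'w', encoding='utf-8') as f:\n",
"    f.write(html)"
]
},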
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Great! Now you should have an understanding of how tokenization divides text up into individual elements, how named entities provide context, and how certain tools help to visualize grammar rules and entity labels.\n",
|
||||||
|
"## Next up: Stemming"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
@ -0,0 +1,435 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# NLP Basics Assessment - Solutions"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"For this assessment we'll be using the short story [_An Occurrence at Owl Creek Bridge_](https://en.wikipedia.org/wiki/An_Occurrence_at_Owl_Creek_Bridge) by Ambrose Bierce (1890). <br>The story is in the public domain; the text file was obtained from [Project Gutenberg](https://www.gutenberg.org/ebooks/375.txt.utf-8)."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# RUN THIS CELL to perform standard imports:\n",
|
||||||
|
"import spacy\n",
|
||||||
|
"nlp = spacy.load('en_core_web_sm')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**1. Create a Doc object from the file `owlcreek.txt`**<br>\n",
|
||||||
|
"> HINT: Use `with open('../TextFiles/owlcreek.txt') as f:`"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Enter your code here:\n",
|
||||||
|
"\n",
|
||||||
|
"with open('../TextFiles/owlcreek.txt') as f:\n",
|
||||||
|
" doc = nlp(f.read())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"AN OCCURRENCE AT OWL CREEK BRIDGE\n",
|
||||||
|
"\n",
|
||||||
|
"by Ambrose Bierce\n",
|
||||||
|
"\n",
|
||||||
|
"I\n",
|
||||||
|
"\n",
|
||||||
|
"A man stood upon a railroad bridge in northern Alabama, looking down\n",
|
||||||
|
"into the swift water twenty feet below. "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Run this cell to verify it worked:\n",
|
||||||
|
"\n",
|
||||||
|
"doc[:36]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**2. How many tokens are contained in the file?**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"4833"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"len(doc)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**3. How many sentences are contained in the file?**<br>HINT: You'll want to build a list first!"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"211"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"sents = [sent for sent in doc.sents]\n",
|
||||||
|
"len(sents)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**4. Print the second sentence in the document**<br> HINT: Indexing starts at zero, and the title counts as the first sentence."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"A man stood upon a railroad bridge in northern Alabama, looking down\n",
|
||||||
|
"into the swift water twenty feet below. \n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(sents[1].text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**5. For each token in the sentence above, print its `text`, `POS` tag, `dep` tag and `lemma`<br>\n",
|
||||||
|
"CHALLENGE: Have values line up in columns in the print output.**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"A DET det a\n",
|
||||||
|
"man NOUN nsubj man\n",
|
||||||
|
"stood VERB ROOT stand\n",
|
||||||
|
"upon ADP prep upon\n",
|
||||||
|
"a DET det a\n",
|
||||||
|
"railroad NOUN compound railroad\n",
|
||||||
|
"bridge NOUN pobj bridge\n",
|
||||||
|
"in ADP prep in\n",
|
||||||
|
"northern ADJ amod northern\n",
|
||||||
|
"Alabama PROPN pobj alabama\n",
|
||||||
|
", PUNCT punct ,\n",
|
||||||
|
"looking VERB advcl look\n",
|
||||||
|
"down PART prt down\n",
|
||||||
|
"\n",
|
||||||
|
" SPACE \n",
|
||||||
|
"\n",
|
||||||
|
"into ADP prep into\n",
|
||||||
|
"the DET det the\n",
|
||||||
|
"swift ADJ amod swift\n",
|
||||||
|
"water NOUN pobj water\n",
|
||||||
|
"twenty NUM nummod twenty\n",
|
||||||
|
"feet NOUN npadvmod foot\n",
|
||||||
|
"below ADV advmod below\n",
|
||||||
|
". PUNCT punct .\n",
|
||||||
|
" SPACE \n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# NORMAL SOLUTION:\n",
|
||||||
|
"for token in sents[1]:\n",
|
||||||
|
" print(token.text, token.pos_, token.dep_, token.lemma_)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"A DET det a \n",
|
||||||
|
"man NOUN nsubj man \n",
|
||||||
|
"stood VERB ROOT stand \n",
|
||||||
|
"upon ADP prep upon \n",
|
||||||
|
"a DET det a \n",
|
||||||
|
"railroad NOUN compound railroad \n",
|
||||||
|
"bridge NOUN pobj bridge \n",
|
||||||
|
"in ADP prep in \n",
|
||||||
|
"northern ADJ amod northern \n",
|
||||||
|
"Alabama PROPN pobj alabama \n",
|
||||||
|
", PUNCT punct , \n",
|
||||||
|
"looking VERB advcl look \n",
|
||||||
|
"down PART prt down \n",
|
||||||
|
"\n",
|
||||||
|
" SPACE \n",
|
||||||
|
" \n",
|
||||||
|
"into ADP prep into \n",
|
||||||
|
"the DET det the \n",
|
||||||
|
"swift ADJ amod swift \n",
|
||||||
|
"water NOUN pobj water \n",
|
||||||
|
"twenty NUM nummod twenty \n",
|
||||||
|
"feet NOUN npadvmod foot \n",
|
||||||
|
"below ADV advmod below \n",
|
||||||
|
". PUNCT punct . \n",
|
||||||
|
" SPACE \n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# CHALLENGE SOLUTION:\n",
|
||||||
|
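"# The f-string width specifiers ({token.text:{15}} etc.) pad each field to a fixed column width:\n",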
"for token in sents[1]:\n",
|
||||||
|
" print(f'{token.text:{15}} {token.pos_:{5}} {token.dep_:{10}} {token.lemma_:{15}}')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**6. Write a matcher called 'Swimming' that finds both occurrences of the phrase \"swimming vigorously\" in the text**<br>\n",
|
||||||
|
"HINT: You should include an `'IS_SPACE': True` pattern between the two words!"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Import the Matcher library:\n",
|
||||||
|
"\n",
|
||||||
|
"from spacy.matcher import Matcher\n",
|
||||||
|
"matcher = Matcher(nlp.vocab)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Create a pattern and add it to matcher:\n",
|
||||||
|
"\n",
|
||||||
|
"pattern = [{'LOWER': 'swimming'}, {'IS_SPACE': True, 'OP':'*'}, {'LOWER': 'vigorously'}]\n",
|
||||||
|
"\n",
|
||||||
|
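"# Note: this is the spaCy v2 call signature; in spaCy v3 it would be matcher.add('Swimming', [pattern])\n",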
"matcher.add('Swimming', None, pattern)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[(12881893835109366681, 1274, 1277), (12881893835109366681, 3607, 3610)]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Create a list of matches called \"found_matches\" and print the list:\n",
|
||||||
|
"\n",
|
||||||
|
"found_matches = matcher(doc)\n",
|
||||||
|
"print(found_matches)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**7. Print the text surrounding each found match**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"By diving I could evade the bullets and, swimming\n",
|
||||||
|
"vigorously, reach the bank, take to the woods and get away home\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
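"# Each match tuple is (match_id, start, end); slicing a window of tokens around the start shows the surrounding text:\n",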
"print(doc[1265:1290])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"over his shoulder; he was now swimming\n",
|
||||||
|
"vigorously with the current. \n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(doc[3600:3615])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**EXTRA CREDIT:<br>Print the *sentence* that contains each found match**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 18,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"By diving I could evade the bullets and, swimming\n",
|
||||||
|
"vigorously, reach the bank, take to the woods and get away home. \n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
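"# The first sentence whose end index is past the match's start token contains that match:\n",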
"for sent in sents:\n",
|
||||||
|
" if found_matches[0][1] < sent.end:\n",
|
||||||
|
" print(sent)\n",
|
||||||
|
" break"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 19,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"The hunted man saw all this over his shoulder; he was now swimming\n",
|
||||||
|
"vigorously with the current. \n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for sent in sents:\n",
|
||||||
|
" if found_matches[1][1] < sent.end:\n",
|
||||||
|
" print(sent)\n",
|
||||||
|
" break"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Great Job!"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
459
Praktikum Python Code/01-NLP-Python-Basics/00-Spacy-Basics.ipynb
Normal file
@ -0,0 +1,459 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# spaCy Basics\n",
|
||||||
|
"\n",
|
||||||
|
"**spaCy** (https://spacy.io/) is an open-source Python library that parses and \"understands\" large volumes of text. Separate models are available that cater to specific languages (English, French, German, etc.).\n",
|
||||||
|
"\n",
|
||||||
|
"In this section we'll install and setup spaCy to work with Python, and then introduce some concepts related to Natural Language Processing."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Installation and Setup\n",
|
||||||
|
"\n",
|
||||||
|
"Installation is a two-step process. First, install spaCy using either conda or pip. Next, download the specific model you want, based on language.<br> For more info visit https://spacy.io/usage/\n",
|
||||||
|
"\n",
|
||||||
|
"### 1. From the command line or terminal:\n",
|
||||||
|
"> `conda install -c conda-forge spacy`\n",
|
||||||
|
"> <br>*or*<br>\n",
|
||||||
|
"> `pip install -U spacy`\n",
|
||||||
|
"\n",
|
||||||
|
"> ### Alternatively you can create a virtual environment:\n",
|
||||||
|
"> `conda create -n spacyenv python=3 spacy=2`\n",
|
||||||
|
"\n",
|
||||||
|
"### 2. Next, also from the command line (you must run this as admin or use sudo):\n",
|
||||||
|
"\n",
|
||||||
|
"> `python -m spacy download en`\n",
|
||||||
|
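"> <br>NOTE: newer spaCy versions drop the `en` shortcut; download the model directly with `python -m spacy download en_core_web_sm`\n",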
"\n",
|
||||||
|
"> ### If successful, you should see a message like:\n",
|
||||||
|
"\n",
|
||||||
|
"> **`Linking successful`**<br>\n",
|
||||||
|
"> ` C:\\Anaconda3\\envs\\spacyenv\\lib\\site-packages\\en_core_web_sm -->`<br>\n",
|
||||||
|
"> ` C:\\Anaconda3\\envs\\spacyenv\\lib\\site-packages\\spacy\\data\\en`<br>\n",
|
||||||
|
"> ` `<br>\n",
|
||||||
|
"> ` You can now load the model via spacy.load('en')`\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Working with spaCy in Python\n",
|
||||||
|
"\n",
|
||||||
|
"This is a typical set of instructions for importing and working with spaCy. Don't be surprised if this takes a while - spaCy has a fairly large library to load:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"ename": "OSError",
|
||||||
|
"evalue": "[E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.",
|
||||||
|
"output_type": "error",
|
||||||
|
"traceback": [
|
||||||
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||||
|
"\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)",
|
||||||
|
"Cell \u001b[0;32mIn[3], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Import spaCy and load the language library\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mspacy\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m nlp \u001b[38;5;241m=\u001b[39m spacy\u001b[38;5;241m.\u001b[39mload(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124men_core_web_sm\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 5\u001b[0m \u001b[38;5;66;03m# Create a Doc object\u001b[39;00m\n\u001b[1;32m 6\u001b[0m doc \u001b[38;5;241m=\u001b[39m nlp(\u001b[38;5;124mu\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mTesla is looking at buying U.S. startup for $6 million\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
|
||||||
|
"File \u001b[0;32m/opt/conda/lib/python3.12/site-packages/spacy/__init__.py:52\u001b[0m, in \u001b[0;36mload\u001b[0;34m(name, vocab, disable, enable, exclude, config)\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload\u001b[39m(\n\u001b[1;32m 29\u001b[0m name: Union[\u001b[38;5;28mstr\u001b[39m, Path],\n\u001b[1;32m 30\u001b[0m \u001b[38;5;241m*\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 35\u001b[0m config: Union[Dict[\u001b[38;5;28mstr\u001b[39m, Any], Config] \u001b[38;5;241m=\u001b[39m util\u001b[38;5;241m.\u001b[39mSimpleFrozenDict(),\n\u001b[1;32m 36\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Language:\n\u001b[1;32m 37\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Load a spaCy model from an installed package or a local path.\u001b[39;00m\n\u001b[1;32m 38\u001b[0m \n\u001b[1;32m 39\u001b[0m \u001b[38;5;124;03m name (str): Package name or model path.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 50\u001b[0m \u001b[38;5;124;03m RETURNS (Language): The loaded nlp object.\u001b[39;00m\n\u001b[1;32m 51\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 52\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m util\u001b[38;5;241m.\u001b[39mload_model(\n\u001b[1;32m 53\u001b[0m name,\n\u001b[1;32m 54\u001b[0m vocab\u001b[38;5;241m=\u001b[39mvocab,\n\u001b[1;32m 55\u001b[0m disable\u001b[38;5;241m=\u001b[39mdisable,\n\u001b[1;32m 56\u001b[0m enable\u001b[38;5;241m=\u001b[39menable,\n\u001b[1;32m 57\u001b[0m exclude\u001b[38;5;241m=\u001b[39mexclude,\n\u001b[1;32m 58\u001b[0m config\u001b[38;5;241m=\u001b[39mconfig,\n\u001b[1;32m 59\u001b[0m )\n",
|
||||||
|
"File \u001b[0;32m/opt/conda/lib/python3.12/site-packages/spacy/util.py:531\u001b[0m, in \u001b[0;36mload_model\u001b[0;34m(name, vocab, disable, enable, exclude, config)\u001b[0m\n\u001b[1;32m 529\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m name \u001b[38;5;129;01min\u001b[39;00m OLD_MODEL_SHORTCUTS:\n\u001b[1;32m 530\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mIOError\u001b[39;00m(Errors\u001b[38;5;241m.\u001b[39mE941\u001b[38;5;241m.\u001b[39mformat(name\u001b[38;5;241m=\u001b[39mname, full\u001b[38;5;241m=\u001b[39mOLD_MODEL_SHORTCUTS[name])) \u001b[38;5;66;03m# type: ignore[index]\u001b[39;00m\n\u001b[0;32m--> 531\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mIOError\u001b[39;00m(Errors\u001b[38;5;241m.\u001b[39mE050\u001b[38;5;241m.\u001b[39mformat(name\u001b[38;5;241m=\u001b[39mname))\n",
|
||||||
|
"\u001b[0;31mOSError\u001b[0m: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory."
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Import spaCy and load the language library\n",
|
||||||
|
"import spacy\n",
|
||||||
|
"nlp = spacy.load('en_core_web_sm')\n",
|
||||||
|
"\n",
|
||||||
|
"# Create a Doc object\n",
|
||||||
|
"doc = nlp(u'Tesla is looking at buying U.S. startup for $6 million')\n",
|
||||||
|
"\n",
|
||||||
|
"# Print each token separately\n",
|
||||||
|
"for token in doc:\n",
|
||||||
|
" print(token.text, token.pos_, token.dep_)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"This doesn't look very user-friendly, but right away we see some interesting things happen:\n",
|
||||||
|
"1. Tesla is recognized to be a Proper Noun, not just a word at the start of a sentence\n",
|
||||||
|
"2. U.S. is kept together as one entity (we call this a 'token')\n",
|
||||||
|
"\n",
|
||||||
|
"As we dive deeper into spaCy we'll see what each of these abbreviations mean and how they're derived. We'll also see how spaCy can interpret the last three tokens combined `$6 million` as referring to ***money***."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"# spaCy Objects\n",
|
||||||
|
"\n",
|
||||||
|
"After importing the spacy module in the cell above we loaded a **model** and named it `nlp`.<br>Next we created a **Doc** object by applying the model to our text, and named it `doc`.<br>spaCy also builds a companion **Vocab** object that we'll cover in later sections.<br>The **Doc** object that holds the processed text is our focus here."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"# Pipeline\n",
|
||||||
|
"When we run `nlp`, our text enters a *processing pipeline* that first breaks down the text and then performs a series of operations to tag, parse and describe the data. Image source: https://spacy.io/usage/spacy-101#pipelines"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<img src=\"../pipeline1.png\" width=\"600\">"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"We can check to see what components currently live in the pipeline. In later sections we'll learn how to disable components and add new ones as needed."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
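"# Show the (name, component) pairs currently in the pipeline:\n",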
"nlp.pipeline"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
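"# Show just the component names:\n",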
"nlp.pipe_names"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"## Tokenization\n",
|
||||||
|
"The first step in processing text is to split up all the component parts (words & punctuation) into \"tokens\". These tokens are annotated inside the Doc object to contain descriptive information. We'll go into much more detail on tokenization in an upcoming lecture. For now, let's look at another example:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"doc2 = nlp(u\"Tesla isn't   looking into startups anymore.\")\n",
|
||||||
|
"\n",
|
||||||
|
"for token in doc2:\n",
|
||||||
|
" print(token.text, token.pos_, token.dep_)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Notice how `isn't` has been split into two tokens. spaCy recognizes both the root verb `is` and the negation attached to it. Notice also that both the extended whitespace and the period at the end of the sentence are assigned their own tokens.\n",
|
||||||
|
"\n",
|
||||||
|
"It's important to note that even though `doc2` contains processed information about each token, it also retains the original text:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"doc2"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"doc2[0]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"type(doc2)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"## Part-of-Speech Tagging (POS)\n",
|
||||||
|
"The next step after splitting the text up into tokens is to assign parts of speech. In the above example, `Tesla` was recognized to be a ***proper noun***. Here some statistical modeling is required. For example, words that follow \"the\" are typically nouns.\n",
|
||||||
|
"\n",
|
||||||
|
"For a full list of POS Tags visit https://spacy.io/api/annotation#pos-tagging"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"doc2[0].pos_"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"## Dependencies\n",
|
||||||
|
"We also looked at the syntactic dependencies assigned to each token. `Tesla` is identified as an `nsubj` or the ***nominal subject*** of the sentence.\n",
|
||||||
|
"\n",
|
||||||
|
"For a full list of Syntactic Dependencies visit https://spacy.io/api/annotation#dependency-parsing\n",
|
||||||
|
"<br>A good explanation of typed dependencies can be found [here](https://nlp.stanford.edu/software/dependencies_manual.pdf)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"doc2[0].dep_"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"To see the full name of a tag use `spacy.explain(tag)`"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"spacy.explain('PROPN')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"spacy.explain('nsubj')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"## Additional Token Attributes\n",
|
||||||
|
"We'll see these again in upcoming lectures. For now we just want to illustrate some of the other information that spaCy assigns to tokens:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"|Tag|Description|doc2[0].tag|\n",
|
||||||
|
"|:------|:------:|:------|\n",
|
||||||
|
"|`.text`|The original word text<!-- .element: style=\"text-align:left;\" -->|`Tesla`|\n",
|
||||||
|
"|`.lemma_`|The base form of the word|`tesla`|\n",
|
||||||
|
"|`.pos_`|The simple part-of-speech tag|`PROPN`/`proper noun`|\n",
|
||||||
|
"|`.tag_`|The detailed part-of-speech tag|`NNP`/`noun, proper singular`|\n",
|
||||||
|
"|`.shape_`|The word shape – capitalization, punctuation, digits|`Xxxxx`|\n",
|
||||||
|
"|`.is_alpha`|Is the token an alpha character?|`True`|\n",
|
||||||
|
"|`.is_stop`|Is the token part of a stop list, i.e. the most common words of the language?|`False`|"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Lemmas (the base form of the word):\n",
|
||||||
|
"print(doc2[4].text)\n",
|
||||||
|
"print(doc2[4].lemma_)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Simple Parts-of-Speech & Detailed Tags:\n",
|
||||||
|
"print(doc2[4].pos_)\n",
|
||||||
|
"print(doc2[4].tag_ + ' / ' + spacy.explain(doc2[4].tag_))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Word Shapes:\n",
|
||||||
|
"print(doc2[0].text+': '+doc2[0].shape_)\n",
|
||||||
|
"print(doc[5].text+' : '+doc[5].shape_)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Boolean Values:\n",
|
||||||
|
"print(doc2[0].is_alpha)\n",
|
||||||
|
"print(doc2[0].is_stop)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"## Spans\n",
|
||||||
|
"Large Doc objects can be hard to work with at times. A **span** is a slice of a Doc object in the form `Doc[start:stop]`."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"doc3 = nlp(u'Although commonly attributed to John Lennon from his song \"Beautiful Boy\", \\\n",
|
||||||
|
"the phrase \"Life is what happens to us while we are making other plans\" was written by \\\n",
|
||||||
|
"cartoonist Allen Saunders and published in Reader\\'s Digest in 1957, when Lennon was 17.')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
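"# Slice out the span of tokens (16 through 29) that covers the quoted phrase:\n",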
"life_quote = doc3[16:30]\n",
|
||||||
|
"print(life_quote)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"type(life_quote)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"In upcoming lectures we'll see how to create Span objects using `Span()`. This will allow us to assign additional information to the Span."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"## Sentences\n",
|
||||||
|
"Certain tokens inside a Doc object may also receive a \"start of sentence\" tag. While this doesn't immediately build a list of sentences, these tags enable the generation of sentence segments through `Doc.sents`. Later we'll write our own segmentation rules."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"doc4 = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"for sent in doc4.sents:\n",
|
||||||
|
" print(sent)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
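"# Token 6 is the 'This' that begins the second sentence, so this should return True:\n",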
"doc4[6].is_sent_start"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Next up: Tokenization"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.12.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 4
|
||||||
|
}
|
||||||
891
Praktikum Python Code/01-NLP-Python-Basics/01-Tokenization.ipynb
Normal file
@ -0,0 +1,891 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Tokenization\n",
|
||||||
|
"The first step in creating a `Doc` object is to break down the incoming text into component pieces or \"tokens\"."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Import spaCy and load the language library\n",
|
||||||
|
"import spacy\n",
|
||||||
|
"nlp = spacy.load('en_core_web_sm')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\"We're moving to L.A.!\"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Create a string that includes opening and closing quotation marks\n",
|
||||||
|
"mystring = '\"We\\'re moving to L.A.!\"'\n",
|
||||||
|
"print(mystring)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\" | We | 're | moving | to | L.A. | ! | \" | "
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Create a Doc object and explore tokens\n",
|
||||||
|
"doc = nlp(mystring)\n",
|
||||||
|
"\n",
|
||||||
|
"for token in doc:\n",
|
||||||
|
" print(token.text, end=' | ')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<img src=\"../tokenization.png\" width=\"600\">"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"- **Prefix**:\tCharacter(s) at the beginning ▸ `$ ( “ ¿`\n",
|
||||||
|
"- **Suffix**:\tCharacter(s) at the end ▸ `km ) , . ! ”`\n",
|
||||||
|
"- **Infix**:\tCharacter(s) in between ▸ `- -- / ...`\n",
|
||||||
|
"- **Exception**: Special-case rule to split a string into several tokens or prevent a token from being split when punctuation rules are applied ▸ `St. U.S.`"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Notice that tokens are pieces of the original text. That is, we don't see any conversion to word stems or lemmas (base forms of words) and we haven't seen anything about organizations/places/money etc. Tokens are the basic building blocks of a Doc object - everything that helps us understand the meaning of the text is derived from tokens and their relationship to one another."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Prefixes, Suffixes and Infixes\n",
|
||||||
|
"spaCy will isolate punctuation that does *not* form an integral part of a word. Quotation marks, commas, and punctuation at the end of a sentence will be assigned their own token. However, punctuation that exists as part of an email address, website or numerical value will be kept as part of the token."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"We\n",
|
||||||
|
"'re\n",
|
||||||
|
"here\n",
|
||||||
|
"to\n",
|
||||||
|
"help\n",
|
||||||
|
"!\n",
|
||||||
|
"Send\n",
|
||||||
|
"snail\n",
|
||||||
|
"-\n",
|
||||||
|
"mail\n",
|
||||||
|
",\n",
|
||||||
|
"email\n",
|
||||||
|
"support@oursite.com\n",
|
||||||
|
"or\n",
|
||||||
|
"visit\n",
|
||||||
|
"us\n",
|
||||||
|
"at\n",
|
||||||
|
"http://www.oursite.com\n",
|
||||||
|
"!\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc2 = nlp(u\"We're here to help! Send snail-mail, email support@oursite.com or visit us at http://www.oursite.com!\")\n",
|
||||||
|
"\n",
|
||||||
|
"for t in doc2:\n",
|
||||||
|
" print(t)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>Note that the exclamation points, comma, and the hyphen in 'snail-mail' are assigned their own tokens, yet both the email address and website are preserved.</font>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"A\n",
|
||||||
|
"5\n",
|
||||||
|
"km\n",
|
||||||
|
"NYC\n",
|
||||||
|
"cab\n",
|
||||||
|
"ride\n",
|
||||||
|
"costs\n",
|
||||||
|
"$\n",
|
||||||
|
"10.30\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc3 = nlp(u'A 5km NYC cab ride costs $10.30')\n",
|
||||||
|
"\n",
|
||||||
|
"for t in doc3:\n",
|
||||||
|
" print(t)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>Here the distance unit and dollar sign are assigned their own tokens, yet the dollar amount is preserved.</font>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Exceptions\n",
|
||||||
|
"Punctuation that exists as part of a known abbreviation will be kept as part of the token."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Let\n",
|
||||||
|
"'s\n",
|
||||||
|
"visit\n",
|
||||||
|
"St.\n",
|
||||||
|
"Louis\n",
|
||||||
|
"in\n",
|
||||||
|
"the\n",
|
||||||
|
"U.S.\n",
|
||||||
|
"next\n",
|
||||||
|
"year\n",
|
||||||
|
".\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc4 = nlp(u\"Let's visit St. Louis in the U.S. next year.\")\n",
|
||||||
|
"\n",
|
||||||
|
"for t in doc4:\n",
|
||||||
|
" print(t)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>Here the abbreviations for \"Saint\" and \"United States\" are both preserved.</font>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Counting Tokens\n",
|
||||||
|
"`Doc` objects have a set number of tokens:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"8"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"len(doc)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Counting Vocab Entries\n",
|
||||||
|
"`Vocab` objects contain a full library of items!"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"57852"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
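"# Count the lexemes currently stored in the model's Vocab:\n",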
"len(doc.vocab)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>NOTE: This number changes based on the language library loaded at the start, and any new lexemes introduced to the `vocab` when the `Doc` was created.</font>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Tokens can be retrieved by index position and slice\n",
|
||||||
|
"`Doc` objects can be thought of as lists of `token` objects. As such, individual tokens can be retrieved by index position, and spans of tokens can be retrieved through slicing:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"better"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc5 = nlp(u'It is better to give than to receive.')\n",
|
||||||
|
"\n",
|
||||||
|
"# Retrieve the third token:\n",
|
||||||
|
"doc5[2]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"better to give"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Retrieve three tokens from the middle:\n",
|
||||||
|
"doc5[2:5]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"than to receive."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Retrieve the last four tokens:\n",
|
||||||
|
"doc5[-4:]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Tokens cannot be reassigned\n",
|
||||||
|
"Although `Doc` objects can be considered lists of tokens, they do *not* support item reassignment:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"doc6 = nlp(u'My dinner was horrible.')\n",
|
||||||
|
"doc7 = nlp(u'Your dinner was delicious.')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"ename": "TypeError",
|
||||||
|
"evalue": "'spacy.tokens.doc.Doc' object does not support item assignment",
|
||||||
|
"output_type": "error",
|
||||||
|
"traceback": [
|
||||||
|
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||||
|
"\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
|
||||||
|
"\u001b[1;32m<ipython-input-13-d4fb8c39c40b>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m# Try to change \"My dinner was horrible\" to \"My dinner was delicious\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mdoc6\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m3\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdoc7\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m3\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
|
||||||
|
"\u001b[1;31mTypeError\u001b[0m: 'spacy.tokens.doc.Doc' object does not support item assignment"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Try to change \"My dinner was horrible\" to \"My dinner was delicious\"\n",
|
||||||
|
"doc6[3] = doc7[3]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true,
|
||||||
|
"jupyter": {
|
||||||
|
"outputs_hidden": true
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"# Named Entities\n",
|
||||||
|
"Going a step beyond tokens, *named entities* add another layer of context. The language model recognizes that certain words are organizational names while others are locations, and still other combinations relate to money, dates, etc. Named entities are accessible through the `ents` property of a `Doc` object."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Apple | to | build | a | Hong | Kong | factory | for | $ | 6 | million | \n",
|
||||||
|
"----\n",
|
||||||
|
"Apple - ORG - Companies, agencies, institutions, etc.\n",
|
||||||
|
"Hong Kong - GPE - Countries, cities, states\n",
|
||||||
|
"$6 million - MONEY - Monetary values, including unit\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc8 = nlp(u'Apple to build a Hong Kong factory for $6 million')\n",
|
||||||
|
"\n",
|
||||||
|
"for token in doc8:\n",
|
||||||
|
" print(token.text, end=' | ')\n",
|
||||||
|
"\n",
|
||||||
|
"print('\\n----')\n",
|
||||||
|
"\n",
|
||||||
|
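"# Print each named entity with its label and spacy.explain()'s description of that label:\n",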
"for ent in doc8.ents:\n",
|
||||||
|
" print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_)))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>Note how two tokens combine to form the entity `Hong Kong`, and three tokens combine to form the monetary entity: `$6 million`</font>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 15,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"3"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 15,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"len(doc8.ents)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Named Entity Recognition (NER) is an important machine learning tool applied to Natural Language Processing.<br>We'll do a lot more with it in an upcoming section. For more info on **named entities** visit https://spacy.io/usage/linguistic-features#named-entities"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"---\n",
|
||||||
|
"# Noun Chunks\n",
|
||||||
|
"Similar to `Doc.ents`, `Doc.noun_chunks` are another object property. *Noun chunks* are \"base noun phrases\" – flat phrases that have a noun as their head. You can think of noun chunks as a noun plus the words describing the noun – for example, in [Sheb Wooley's 1958 song](https://en.wikipedia.org/wiki/The_Purple_People_Eater), a *\"one-eyed, one-horned, flying, purple people-eater\"* would be one long noun chunk."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 16,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Autonomous cars\n",
|
||||||
|
"insurance liability\n",
|
||||||
|
"manufacturers\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc9 = nlp(u\"Autonomous cars shift insurance liability toward manufacturers.\")\n",
|
||||||
|
"\n",
|
||||||
|
"for chunk in doc9.noun_chunks:\n",
|
||||||
|
" print(chunk.text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 17,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Red cars\n",
|
||||||
|
"higher insurance rates\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc10 = nlp(u\"Red cars do not carry higher insurance rates.\")\n",
|
||||||
|
"\n",
|
||||||
|
"for chunk in doc10.noun_chunks:\n",
|
||||||
|
" print(chunk.text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 18,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"He\n",
|
||||||
|
"a one-eyed, one-horned, flying, purple people-eater\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc11 = nlp(u\"He was a one-eyed, one-horned, flying, purple people-eater.\")\n",
|
||||||
|
"\n",
|
||||||
|
"for chunk in doc11.noun_chunks:\n",
|
||||||
|
" print(chunk.text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"We'll look at additional noun_chunks components besides `.text` in an upcoming section.<br>For more info on **noun_chunks** visit https://spacy.io/usage/linguistic-features#noun-chunks"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"# Built-in Visualizers\n",
|
||||||
|
"\n",
|
||||||
|
"spaCy includes a built-in visualization tool called **displaCy**. displaCy is able to detect whether you're working in a Jupyter notebook, and will return markup that can be rendered in a cell right away. When you export your notebook, the visualizations will be included as HTML.\n",
|
||||||
|
"\n",
|
||||||
|
"For more info visit https://spacy.io/usage/visualizers"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Visualizing the dependency parse\n",
|
||||||
|
"Run the cell below to import displacy and display the dependency graphic"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 19,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" id=\"0\" class=\"displacy\" width=\"1370\" height=\"357.0\" style=\"max-width: none; height: 357.0px; color: #000000; background: #ffffff; font-family: Arial\">\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"50\">Apple</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"50\">PROPN</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"160\">is</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"160\">VERB</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"270\">going</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"270\">VERB</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"380\">to</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"380\">PART</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"490\">build</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"490\">VERB</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"600\">a</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"600\">DET</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"710\">U.K.</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"710\">PROPN</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"820\">factory</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"820\">NOUN</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"930\">for</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"930\">ADP</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1040\">$</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1040\">SYM</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1150\">6</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1150\">NUM</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1260\">million.</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1260\">NUM</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-0\" stroke-width=\"2px\" d=\"M70,222.0 C70,112.0 260.0,112.0 260.0,222.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-0\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">nsubj</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M70,224.0 L62,212.0 78,212.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-1\" stroke-width=\"2px\" d=\"M180,222.0 C180,167.0 255.0,167.0 255.0,222.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-1\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">aux</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M180,224.0 L172,212.0 188,212.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-2\" stroke-width=\"2px\" d=\"M400,222.0 C400,167.0 475.0,167.0 475.0,222.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-2\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">aux</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M400,224.0 L392,212.0 408,212.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-3\" stroke-width=\"2px\" d=\"M290,222.0 C290,112.0 480.0,112.0 480.0,222.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-3\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">xcomp</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M480.0,224.0 L488.0,212.0 472.0,212.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-4\" stroke-width=\"2px\" d=\"M620,222.0 C620,112.0 810.0,112.0 810.0,222.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-4\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">det</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M620,224.0 L612,212.0 628,212.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-5\" stroke-width=\"2px\" d=\"M730,222.0 C730,167.0 805.0,167.0 805.0,222.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-5\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">compound</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M730,224.0 L722,212.0 738,212.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-6\" stroke-width=\"2px\" d=\"M510,222.0 C510,57.0 815.0,57.0 815.0,222.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-6\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">dobj</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M815.0,224.0 L823.0,212.0 807.0,212.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-7\" stroke-width=\"2px\" d=\"M510,222.0 C510,2.0 930.0,2.0 930.0,222.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-7\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">prep</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M930.0,224.0 L938.0,212.0 922.0,212.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-8\" stroke-width=\"2px\" d=\"M1060,222.0 C1060,112.0 1250.0,112.0 1250.0,222.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-8\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">quantmod</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M1060,224.0 L1052,212.0 1068,212.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-9\" stroke-width=\"2px\" d=\"M1170,222.0 C1170,167.0 1245.0,167.0 1245.0,222.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-9\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">compound</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M1170,224.0 L1162,212.0 1178,212.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-10\" stroke-width=\"2px\" d=\"M950,222.0 C950,57.0 1255.0,57.0 1255.0,222.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-10\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">pobj</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M1255.0,224.0 L1263.0,212.0 1247.0,212.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"</svg>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.HTML object>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from spacy import displacy\n",
|
||||||
|
"\n",
|
||||||
|
"doc = nlp(u'Apple is going to build a U.K. factory for $6 million.')\n",
|
||||||
|
"displacy.render(doc, style='dep', jupyter=True, options={'distance': 110})"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"The optional `'distance'` argument sets the distance between tokens. If the distance is made too small, text that appears beneath short arrows may become too compressed to read."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Visualizing the entity recognizer"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 20,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div class=\"entities\" style=\"line-height: 2.5\">Over \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #bfe1d9; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" the last quarter\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">DATE</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" Apple\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">ORG</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" sold \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" nearly 20 thousand\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">CARDINAL</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #bfeeb7; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" iPods\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PRODUCT</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" for a profit of \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" $6 million\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">MONEY</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
".</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.HTML object>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.')\n",
|
||||||
|
"displacy.render(doc, style='ent', jupyter=True)"
|
||||||
|
]
|
||||||
|
},
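{
"cell_type": "markdown",
"metadata": {},
"source": [
"As with the dependency view, `displacy.render` accepts an `options` dictionary for the entity visualizer. A minimal sketch (reusing the `doc` above) that restricts the highlighted labels with the `'ents'` option:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Only highlight ORG and MONEY entities; other labels are left unmarked\n",
"displacy.render(doc, style='ent', jupyter=True, options={'ents': ['ORG', 'MONEY']})"
]
},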
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"## Creating Visualizations Outside of Jupyter\n",
|
||||||
|
"If you're using another Python IDE or writing a script, you can choose to have spaCy serve up html separately:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 21,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\n",
|
||||||
|
" Serving on port 5000...\n",
|
||||||
|
" Using the 'dep' visualizer\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
" Shutting down server on port 5000.\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc = nlp(u'This is a sentence.')\n",
|
||||||
|
"displacy.serve(doc, style='dep')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=blue>**After running the cell above, click the link below to view the dependency parse**:</font>\n",
|
||||||
|
"\n",
|
||||||
|
"http://127.0.0.1:5000\n",
|
||||||
|
"<br><br>\n",
|
||||||
|
"<font color=red>**To shut down the server and return to jupyter**, interrupt the kernel either through the **Kernel** menu above, by hitting the black square on the toolbar, or by typing the keyboard shortcut `Esc`, `I`, `I`</font>"
|
||||||
|
]
|
||||||
|
},
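{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you'd rather save the visualization than serve it, `displacy.render` returns the raw markup as a string when `jupyter=False`. A small sketch that writes it to an HTML file (the filename is just an example) for viewing in any browser:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# page=True wraps the SVG in a complete HTML page\n",
"html = displacy.render(doc, style='dep', jupyter=False, page=True)\n",
"\n",
"with open('sentence_parse.html', 'w', encoding='utf-8') as f:\n",
"    f.write(html)"
]
},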
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Great! Now you should have an understanding of how tokenization divides text up into individual elements, how named entities provide context, and how certain tools help to visualize grammar rules and entity labels.\n",
|
||||||
|
"## Next up: Stemming"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.12.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 4
|
||||||
|
}
|
||||||
317
Praktikum Python Code/01-NLP-Python-Basics/02-Stemming.ipynb
Normal file
@ -0,0 +1,317 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Stemming\n",
|
||||||
|
"Often when searching text for a certain keyword, it helps if the search returns variations of the word. For instance, searching for \"boat\" might also return \"boats\" and \"boating\". Here, \"boat\" would be the **stem** for [boat, boater, boating, boats].\n",
|
||||||
|
"\n",
|
||||||
|
"Stemming is a somewhat crude method for cataloging related words; it essentially chops off letters from the end until the stem is reached. This works fairly well in most cases, but unfortunately English has many exceptions where a more sophisticated process is required. In fact, spaCy doesn't include a stemmer, opting instead to rely entirely on lemmatization. For those interested, there's some background on this decision [here](https://github.com/explosion/spaCy/issues/327). We discuss the virtues of *lemmatization* in the next section.\n",
|
||||||
|
"\n",
|
||||||
|
"Instead, we'll use another popular NLP tool called **nltk**, which stands for *Natural Language Toolkit*. For more information on nltk visit https://www.nltk.org/"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Porter Stemmer\n",
|
||||||
|
"\n",
|
||||||
|
"One of the most common - and effective - stemming tools is [*Porter's Algorithm*](https://tartarus.org/martin/PorterStemmer/) developed by Martin Porter in [1980](https://tartarus.org/martin/PorterStemmer/def.txt). The algorithm employs five phases of word reduction, each with its own set of mapping rules. In the first phase, simple suffix mapping rules are defined, such as:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"From a given set of stemming rules only one rule is applied, based on the longest suffix S1. Thus, `caresses` reduces to `caress` but not `cares`.\n",
|
||||||
|
"\n",
|
||||||
|
"More sophisticated phases consider the length/complexity of the word before applying a rule. For example:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Here `m>0` describes the \"measure\" of the stem, such that the rule is applied to all but the most basic stems."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Import the toolkit and the full Porter Stemmer library\n",
|
||||||
|
"import nltk\n",
|
||||||
|
"\n",
|
||||||
|
"from nltk.stem.porter import *"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"p_stemmer = PorterStemmer()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"words = ['run','runner','running','ran','runs','easily','fairly']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"run --> run\n",
|
||||||
|
"runner --> runner\n",
|
||||||
|
"running --> run\n",
|
||||||
|
"ran --> ran\n",
|
||||||
|
"runs --> run\n",
|
||||||
|
"easily --> easili\n",
|
||||||
|
"fairly --> fairli\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for word in words:\n",
|
||||||
|
" print(word+' --> '+p_stemmer.stem(word))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>Note how the stemmer recognizes \"runner\" as a noun, not a verb form or participle. Also, the adverbs \"easily\" and \"fairly\" are stemmed to the unusual root \"easili\" and \"fairli\"</font>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
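{
"cell_type": "markdown",
"metadata": {},
"source": [
"To see the first-phase suffix rules in action, here is a small sketch that runs the same Porter stemmer over the classic rule examples (the word list is purely illustrative):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Phase-one examples: SSES --> SS, IES --> I, S --> ''\n",
"for word in ['caresses', 'ponies', 'ties', 'cats']:\n",
"    print(word + ' --> ' + p_stemmer.stem(word))"
]
},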
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Snowball Stemmer\n",
|
||||||
|
"This is somewhat of a misnomer, as Snowball is the name of a stemming language developed by Martin Porter. The algorithm used here is more acurately called the \"English Stemmer\" or \"Porter2 Stemmer\". It offers a slight improvement over the original Porter stemmer, both in logic and speed. Since **nltk** uses the name SnowballStemmer, we'll use it here."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from nltk.stem.snowball import SnowballStemmer\n",
|
||||||
|
"\n",
|
||||||
|
"# The Snowball Stemmer requires that you pass a language parameter\n",
|
||||||
|
"s_stemmer = SnowballStemmer(language='english')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"words = ['run','runner','running','ran','runs','easily','fairly']\n",
|
||||||
|
"# words = ['generous','generation','generously','generate']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"run --> run\n",
|
||||||
|
"runner --> runner\n",
|
||||||
|
"running --> run\n",
|
||||||
|
"ran --> ran\n",
|
||||||
|
"runs --> run\n",
|
||||||
|
"easily --> easili\n",
|
||||||
|
"fairly --> fair\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for word in words:\n",
|
||||||
|
" print(word+' --> '+s_stemmer.stem(word))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>In this case the stemmer performed the same as the Porter Stemmer, with the exception that it handled the stem of \"fairly\" more appropriately with \"fair\"</font>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Try it yourself!\n",
|
||||||
|
"#### Pass in some of your own words and test each stemmer on them. Remember to pass them as strings!"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"words = ['consolingly']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Porter Stemmer:\n",
|
||||||
|
"consolingly --> consolingli\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print('Porter Stemmer:')\n",
|
||||||
|
"for word in words:\n",
|
||||||
|
" print(word+' --> '+p_stemmer.stem(word))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Porter2 Stemmer:\n",
|
||||||
|
"consolingly --> consol\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print('Porter2 Stemmer:')\n",
|
||||||
|
"for word in words:\n",
|
||||||
|
" print(word+' --> '+s_stemmer.stem(word))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"Stemming has its drawbacks. If given the token `saw`, stemming might always return `saw`, whereas lemmatization would likely return either `see` or `saw` depending on whether the use of the token was as a verb or a noun. As an example, consider the following:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"I --> I\n",
|
||||||
|
"am --> am\n",
|
||||||
|
"meeting --> meet\n",
|
||||||
|
"him --> him\n",
|
||||||
|
"tomorrow --> tomorrow\n",
|
||||||
|
"at --> at\n",
|
||||||
|
"the --> the\n",
|
||||||
|
"meeting --> meet\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"phrase = 'I am meeting him tomorrow at the meeting'\n",
|
||||||
|
"for word in phrase.split():\n",
|
||||||
|
" print(word+' --> '+p_stemmer.stem(word))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Here the word \"meeting\" appears twice - once as a verb, and once as a noun, and yet the stemmer treats both equally."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Next up: Lemmatization"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
@ -0,0 +1,229 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Lemmatization\n",
|
||||||
|
"In contrast to stemming, lemmatization looks beyond word reduction, and considers a language's full vocabulary to apply a *morphological analysis* to words. The lemma of 'was' is 'be' and the lemma of 'mice' is 'mouse'. Further, the lemma of 'meeting' might be 'meet' or 'meeting' depending on its use in a sentence."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Perform standard imports:\n",
|
||||||
|
"import spacy\n",
|
||||||
|
"nlp = spacy.load('en_core_web_sm')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"I \t PRON \t 561228191312463089 \t -PRON-\n",
|
||||||
|
"am \t VERB \t 10382539506755952630 \t be\n",
|
||||||
|
"a \t DET \t 11901859001352538922 \t a\n",
|
||||||
|
"runner \t NOUN \t 12640964157389618806 \t runner\n",
|
||||||
|
"running \t VERB \t 12767647472892411841 \t run\n",
|
||||||
|
"in \t ADP \t 3002984154512732771 \t in\n",
|
||||||
|
"a \t DET \t 11901859001352538922 \t a\n",
|
||||||
|
"race \t NOUN \t 8048469955494714898 \t race\n",
|
||||||
|
"because \t ADP \t 16950148841647037698 \t because\n",
|
||||||
|
"I \t PRON \t 561228191312463089 \t -PRON-\n",
|
||||||
|
"love \t VERB \t 3702023516439754181 \t love\n",
|
||||||
|
"to \t PART \t 3791531372978436496 \t to\n",
|
||||||
|
"run \t VERB \t 12767647472892411841 \t run\n",
|
||||||
|
"since \t ADP \t 10066841407251338481 \t since\n",
|
||||||
|
"I \t PRON \t 561228191312463089 \t -PRON-\n",
|
||||||
|
"ran \t VERB \t 12767647472892411841 \t run\n",
|
||||||
|
"today \t NOUN \t 11042482332948150395 \t today\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc1 = nlp(u\"I am a runner running in a race because I love to run since I ran today\")\n",
|
||||||
|
"\n",
|
||||||
|
"for token in doc1:\n",
|
||||||
|
" print(token.text, '\\t', token.pos_, '\\t', token.lemma, '\\t', token.lemma_)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>In the above sentence, `running`, `run` and `ran` all point to the same lemma `run` (...11841) to avoid duplication.</font>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Function to display lemmas\n",
|
||||||
|
"Since the display above is staggared and hard to read, let's write a function that displays the information we want more neatly."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def show_lemmas(text):\n",
|
||||||
|
" for token in text:\n",
|
||||||
|
" print(f'{token.text:{12}} {token.pos_:{6}} {token.lemma:<{22}} {token.lemma_}')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Here we're using an **f-string** to format the printed text by setting minimum field widths and adding a left-align to the lemma hash value."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"I PRON 561228191312463089 -PRON-\n",
|
||||||
|
"saw VERB 11925638236994514241 see\n",
|
||||||
|
"eighteen NUM 9609336664675087640 eighteen\n",
|
||||||
|
"mice NOUN 1384165645700560590 mouse\n",
|
||||||
|
"today NOUN 11042482332948150395 today\n",
|
||||||
|
"! PUNCT 17494803046312582752 !\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc2 = nlp(u\"I saw eighteen mice today!\")\n",
|
||||||
|
"\n",
|
||||||
|
"show_lemmas(doc2)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>Notice that the lemma of `saw` is `see`, `mice` is the plural form of `mouse`, and yet `eighteen` is its own number, *not* an expanded form of `eight`.</font>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"I PRON 561228191312463089 -PRON-\n",
|
||||||
|
"am VERB 10382539506755952630 be\n",
|
||||||
|
"meeting VERB 6880656908171229526 meet\n",
|
||||||
|
"him PRON 561228191312463089 -PRON-\n",
|
||||||
|
"tomorrow NOUN 3573583789758258062 tomorrow\n",
|
||||||
|
"at ADP 11667289587015813222 at\n",
|
||||||
|
"the DET 7425985699627899538 the\n",
|
||||||
|
"meeting NOUN 14798207169164081740 meeting\n",
|
||||||
|
". PUNCT 12646065887601541794 .\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc3 = nlp(u\"I am meeting him tomorrow at the meeting.\")\n",
|
||||||
|
"\n",
|
||||||
|
"show_lemmas(doc3)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>Here the lemma of `meeting` is determined by its Part of Speech tag.</font>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"That DET 4380130941430378203 that\n",
|
||||||
|
"'s VERB 10382539506755952630 be\n",
|
||||||
|
"an DET 15099054000809333061 an\n",
|
||||||
|
"enormous ADJ 17917224542039855524 enormous\n",
|
||||||
|
"automobile NOUN 7211811266693931283 automobile\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc4 = nlp(u\"That's an enormous automobile\")\n",
|
||||||
|
"\n",
|
||||||
|
"show_lemmas(doc4)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>Note that lemmatization does *not* reduce words to their most basic synonym - that is, `enormous` doesn't become `big` and `automobile` doesn't become `car`.</font>"
|
||||||
|
]
|
||||||
|
},
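{
"cell_type": "markdown",
"metadata": {},
"source": [
"Since `token.lemma_` is an ordinary attribute, an entire text can be lemmatized with a simple comprehension. A small sketch reusing the `nlp` pipeline loaded above (the sentence is arbitrary):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"doc5 = nlp(u'The children were running and the mice were hiding.')\n",
"\n",
"# Join the lemma of every token back into a single string\n",
"print(' '.join(token.lemma_ for token in doc5))"
]
},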
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"We should point out that although lemmatization looks at surrounding text to determine a given word's part of speech, it does not categorize phrases. In an upcoming lecture we'll investigate *word vectors and similarity*.\n",
|
||||||
|
"\n",
|
||||||
|
"## Next up: Stop Words"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
276
Praktikum Python Code/01-NLP-Python-Basics/04-Stop-Words.ipynb
Normal file
@ -0,0 +1,276 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Stop Words\n",
|
||||||
|
"Words like \"a\" and \"the\" appear so frequently that they don't require tagging as thoroughly as nouns, verbs and modifiers. We call these *stop words*, and they can be filtered from the text to be processed. spaCy holds a built-in list of some 305 English stop words."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Perform standard imports:\n",
|
||||||
|
"import spacy\n",
|
||||||
|
"nlp = spacy.load('en_core_web_sm')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"{'hers', 'show', 'though', 'various', 'sixty', 'say', 'quite', 'ten', 'anything', 'although', 'hereby', 'in', 'ours', 'herself', 'among', 'unless', 'and', 'whole', 'anywhere', 'latter', 'therein', 'whereafter', 'that', 'one', 'whose', 'either', 'within', 'eight', 'three', 'latterly', 'anyone', 'a', 'less', 'former', 'been', 'same', 'anyway', 'else', 'cannot', 'five', 'i', 'until', 'last', 'thus', 'give', 'move', 'thereafter', 'via', 'than', 'empty', 'off', 'neither', 'too', 'please', 'over', 'just', 'otherwise', 'has', 'her', 'put', 'its', 'whether', 'herein', 'myself', 'me', 'nevertheless', 'whatever', 'someone', 'towards', 'whereby', 'onto', 'sometimes', 'thence', 'them', 'done', 'at', 'back', 'nor', 'another', 'behind', 'together', 'take', 'amongst', 'being', 'seemed', 'seeming', 'fifteen', 'do', 'further', 'something', 'again', 'this', 'were', 'wherein', 'how', 'up', 'must', 'get', 'whereas', 'much', 'upon', 'yet', 'both', 'many', 'very', 'may', 'after', 'regarding', 'full', 'through', 'below', 'his', 'well', 'everything', 'so', 'our', 'should', 'seem', 'while', 'for', 'might', 'mine', 'when', 'with', 'you', 'few', 'never', 'because', 'own', 'also', 'due', 'hence', 'it', 'more', 'their', 'such', 'becomes', 'first', 'hereupon', 'since', 'third', 'twenty', 'who', 'she', 'nobody', 'name', 'really', 'enough', 'least', 'two', 'whoever', 'which', 'yours', 'moreover', 'seems', 'before', 'therefore', 'then', 'used', 'even', 'nowhere', 'without', 'other', 'around', 'made', 'hundred', 'no', 'twelve', 'several', 'your', 'meanwhile', 'per', 'except', 'yourselves', 'why', 'some', 'not', 'yourself', 'sometime', 'somehow', 'become', 'beyond', 'almost', 'will', 'somewhere', 'the', 'everyone', 'about', 'everywhere', 'anyhow', 'side', 'next', 'fifty', 'they', 'most', 'perhaps', 'across', 'themselves', 'besides', 'against', 'can', 'him', 'there', 'noone', 'under', 'formerly', 'already', 'all', 'if', 'my', 'or', 'serious', 'four', 'thereupon', 'whence', 'here', 'whither', 'beside', 'wherever', 'to', 'himself', 'between', 'ourselves', 'none', 'on', 'became', 'an', 'have', 'part', 'did', 'had', 'each', 'six', 'those', 'from', 'whenever', 'any', 'am', 'would', 'make', 'could', 'does', 'go', 'call', 'indeed', 'these', 'often', 'above', 'during', 'by', 'nine', 'thereby', 'others', 'afterwards', 'throughout', 'whom', 'amount', 'as', 'hereafter', 'top', 'mostly', 'us', 'whereupon', 'once', 'only', 'still', 'namely', 'forty', 'ca', 'along', 'be', 'itself', 'where', 'see', 'into', 'toward', 'but', 'is', 'keep', 'bottom', 'ever', 'becoming', 'every', 'always', 'front', 'nothing', 'we', 'of', 'out', 'eleven', 'alone', 'he', 'however', 'rather', 'down', 'thru', 'now', 'using', 'are', 'doing', 'what', 'beforehand', 're', 'was', 'elsewhere'}\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Print the set of spaCy's default stop words (remember that sets are unordered):\n",
|
||||||
|
"print(nlp.Defaults.stop_words)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"305"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"len(nlp.Defaults.stop_words)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## To see if a word is a stop word"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"True"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"nlp.vocab['myself'].is_stop"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"False"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"nlp.vocab['mystery'].is_stop"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## To add a stop word\n",
|
||||||
|
"There may be times when you wish to add a stop word to the default set. Perhaps you decide that `'btw'` (common shorthand for \"by the way\") should be considered a stop word."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Add the word to the set of stop words. Use lowercase!\n",
|
||||||
|
"nlp.Defaults.stop_words.add('btw')\n",
|
||||||
|
"\n",
|
||||||
|
"# Set the stop_word tag on the lexeme\n",
|
||||||
|
"nlp.vocab['btw'].is_stop = True"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"306"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"len(nlp.Defaults.stop_words)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"True"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"nlp.vocab['btw'].is_stop"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>When adding stop words, always use lowercase. Lexemes are converted to lowercase before being added to **vocab**.</font>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## To remove a stop word\n",
|
||||||
|
"Alternatively, you may decide that `'beyond'` should not be considered a stop word."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Remove the word from the set of stop words\n",
|
||||||
|
"nlp.Defaults.stop_words.remove('beyond')\n",
|
||||||
|
"\n",
|
||||||
|
"# Remove the stop_word tag from the lexeme\n",
|
||||||
|
"nlp.vocab['beyond'].is_stop = False"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"305"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"len(nlp.Defaults.stop_words)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"False"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"nlp.vocab['beyond'].is_stop"
|
||||||
|
]
|
||||||
|
},
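{
"cell_type": "markdown",
"metadata": {},
"source": [
"A common use of the stop word list is to drop stop words from a token sequence before further processing. A minimal sketch using the `is_stop` flag on each token (the sample sentence is arbitrary):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"doc = nlp(u'We will filter the stop words out of this short example sentence.')\n",
"\n",
"# Keep only tokens whose lexeme is not flagged as a stop word\n",
"filtered = [token.text for token in doc if not token.is_stop]\n",
"print(filtered)"
]
},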
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Great! Now you should be able to access spaCy's default set of stop words, and add or remove stop words as needed.\n",
|
||||||
|
"## Next up: Vocabulary and Matching"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.7"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
@ -0,0 +1,599 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Vocabulary and Matching\n",
|
||||||
|
"So far we've seen how a body of text is divided into tokens, and how individual tokens are parsed and tagged with parts of speech, dependencies and lemmas.\n",
|
||||||
|
"\n",
|
||||||
|
"In this section we will identify and label specific phrases that match patterns we can define ourselves. "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Rule-based Matching\n",
|
||||||
|
"spaCy offers a rule-matching tool called `Matcher` that allows you to build a library of token patterns, then match those patterns against a Doc object to return a list of found matches. You can match on any part of the token including text and annotations, and you can add multiple patterns to the same matcher."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Perform standard imports\n",
|
||||||
|
"import spacy\n",
|
||||||
|
"nlp = spacy.load('en_core_web_sm')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Import the Matcher library\n",
|
||||||
|
"from spacy.matcher import Matcher\n",
|
||||||
|
"matcher = Matcher(nlp.vocab)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>Here `matcher` is an object that pairs to the current `Vocab` object. We can add and remove specific named matchers to `matcher` as needed.</font>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Creating patterns\n",
|
||||||
|
"In literature, the phrase 'solar power' might appear as one word or two, with or without a hyphen. In this section we'll develop a matcher named 'SolarPower' that finds all three:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"pattern1 = [{'LOWER': 'solarpower'}]\n",
|
||||||
|
"pattern2 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]\n",
|
||||||
|
"pattern3 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}]\n",
|
||||||
|
"\n",
|
||||||
|
"matcher.add('SolarPower', None, pattern1, pattern2, pattern3)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Let's break this down:\n",
|
||||||
|
"* `pattern1` looks for a single token whose lowercase text reads 'solarpower'\n",
|
||||||
|
"* `pattern2` looks for two adjacent tokens that read 'solar' and 'power' in that order\n",
|
||||||
|
"* `pattern3` looks for three adjacent tokens, with a middle token that can be any punctuation.<font color=green>*</font>\n",
|
||||||
|
"\n",
|
||||||
|
"<font color=green>\\* Remember that single spaces are not tokenized, so they don't count as punctuation.</font>\n",
|
||||||
|
"<br>Once we define our patterns, we pass them into `matcher` with the name 'SolarPower', and set *callbacks* to `None` (more on callbacks later)."
|
||||||
|
]
|
||||||
|
},
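{
"cell_type": "markdown",
"metadata": {},
"source": [
"<font color=green>**Note:** the call above uses the spaCy v2 signature. If you are running spaCy v3 or later, the patterns are passed as a single list and the callback moves to a keyword argument, e.g.:</font>\n",
"\n",
">`matcher.add('SolarPower', [pattern1, pattern2, pattern3], on_match=None)`"
]
},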
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Applying the matcher to a Doc object"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"doc = nlp(u'The Solar Power industry continues to grow as demand \\\n",
|
||||||
|
"for solarpower increases. Solar-power cars are gaining popularity.')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16)]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"found_matches = matcher(doc)\n",
|
||||||
|
"print(found_matches)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"`matcher` returns a list of tuples. Each tuple contains an ID for the match, with start & end tokens that map to the span `doc[start:end]`"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"8656102463236116519 SolarPower 1 3 Solar Power\n",
|
||||||
|
"8656102463236116519 SolarPower 10 11 solarpower\n",
|
||||||
|
"8656102463236116519 SolarPower 13 16 Solar-power\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for match_id, start, end in found_matches:\n",
|
||||||
|
" string_id = nlp.vocab.strings[match_id] # get string representation\n",
|
||||||
|
" span = doc[start:end] # get the matched span\n",
|
||||||
|
" print(match_id, string_id, start, end, span.text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"The `match_id` is simply the hash value of the `string_ID` 'SolarPower'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Setting pattern options and quantifiers\n",
|
||||||
|
"You can make token rules optional by passing an `'OP':'*'` argument. This lets us streamline our patterns list:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Redefine the patterns:\n",
|
||||||
|
"pattern1 = [{'LOWER': 'solarpower'}]\n",
|
||||||
|
"pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP':'*'}, {'LOWER': 'power'}]\n",
|
||||||
|
"\n",
|
||||||
|
"# Remove the old patterns to avoid duplication:\n",
|
||||||
|
"matcher.remove('SolarPower')\n",
|
||||||
|
"\n",
|
||||||
|
"# Add the new set of patterns to the 'SolarPower' matcher:\n",
|
||||||
|
"matcher.add('SolarPower', None, pattern1, pattern2)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16)]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"found_matches = matcher(doc)\n",
|
||||||
|
"print(found_matches)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"This found both two-word patterns, with and without the hyphen!\n",
|
||||||
|
"\n",
|
||||||
|
"The following quantifiers can be passed to the `'OP'` key:\n",
|
||||||
|
"<table><tr><th>OP</th><th>Description</th></tr>\n",
|
||||||
|
"\n",
|
||||||
|
"<tr ><td><span >\\!</span></td><td>Negate the pattern, by requiring it to match exactly 0 times</td></tr>\n",
|
||||||
|
"<tr ><td><span >?</span></td><td>Make the pattern optional, by allowing it to match 0 or 1 times</td></tr>\n",
|
||||||
|
"<tr ><td><span >\\+</span></td><td>Require the pattern to match 1 or more times</td></tr>\n",
|
||||||
|
"<tr ><td><span >\\*</span></td><td>Allow the pattern to match zero or more times</td></tr>\n",
|
||||||
|
"</table>\n"
|
||||||
|
]
|
||||||
|
},
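{
"cell_type": "markdown",
"metadata": {},
"source": [
"For comparison, here is a sketch of the `'+'` quantifier. It requires at least one punctuation token between 'solar' and 'power', so the single-word form would no longer match this particular pattern:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pattern = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP': '+'}, {'LOWER': 'power'}]\n",
"\n",
"# Replace the 'SolarPower' patterns with this stricter version:\n",
"matcher.remove('SolarPower')\n",
"matcher.add('SolarPower', None, pattern)\n",
"\n",
"print(matcher(doc))"
]
},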
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Be careful with lemmas!\n",
|
||||||
|
"If we wanted to match on both 'solar power' and 'solar powered', it might be tempting to look for the *lemma* of 'powered' and expect it to be 'power'. This is not always the case! The lemma of the *adjective* 'powered' is still 'powered':"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"pattern1 = [{'LOWER': 'solarpower'}]\n",
|
||||||
|
"pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP':'*'}, {'LEMMA': 'power'}] # CHANGE THIS PATTERN\n",
|
||||||
|
"\n",
|
||||||
|
"# Remove the old patterns to avoid duplication:\n",
|
||||||
|
"matcher.remove('SolarPower')\n",
|
||||||
|
"\n",
|
||||||
|
"# Add the new set of patterns to the 'SolarPower' matcher:\n",
|
||||||
|
"matcher.add('SolarPower', None, pattern1, pattern2)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"doc2 = nlp(u'Solar-powered energy runs solar-powered cars.')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[(8656102463236116519, 0, 3)]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"found_matches = matcher(doc2)\n",
|
||||||
|
"print(found_matches)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>The matcher found the first occurrence because the lemmatizer treated 'Solar-powered' as a verb, but not the second as it considered it an adjective.<br>For this case it may be better to set explicit token patterns.</font>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"pattern1 = [{'LOWER': 'solarpower'}]\n",
|
||||||
|
"pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP':'*'}, {'LOWER': 'power'}]\n",
|
||||||
|
"pattern3 = [{'LOWER': 'solarpowered'}]\n",
|
||||||
|
"pattern4 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP':'*'}, {'LOWER': 'powered'}]\n",
|
||||||
|
"\n",
|
||||||
|
"# Remove the old patterns to avoid duplication:\n",
|
||||||
|
"matcher.remove('SolarPower')\n",
|
||||||
|
"\n",
|
||||||
|
"# Add the new set of patterns to the 'SolarPower' matcher:\n",
|
||||||
|
"matcher.add('SolarPower', None, pattern1, pattern2, pattern3, pattern4)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[(8656102463236116519, 0, 3), (8656102463236116519, 5, 8)]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"found_matches = matcher(doc2)\n",
|
||||||
|
"print(found_matches)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Other token attributes\n",
|
||||||
|
"Besides lemmas, there are a variety of token attributes we can use to determine matching rules:\n",
|
||||||
|
"<table><tr><th>Attribute</th><th>Description</th></tr>\n",
|
||||||
|
"\n",
|
||||||
|
"<tr ><td><span >`ORTH`</span></td><td>The exact verbatim text of a token</td></tr>\n",
|
||||||
|
"<tr ><td><span >`LOWER`</span></td><td>The lowercase form of the token text</td></tr>\n",
|
||||||
|
"<tr ><td><span >`LENGTH`</span></td><td>The length of the token text</td></tr>\n",
|
||||||
|
"<tr ><td><span >`IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`</span></td><td>Token text consists of alphanumeric characters, ASCII characters, digits</td></tr>\n",
|
||||||
|
"<tr ><td><span >`IS_LOWER`, `IS_UPPER`, `IS_TITLE`</span></td><td>Token text is in lowercase, uppercase, titlecase</td></tr>\n",
|
||||||
|
"<tr ><td><span >`IS_PUNCT`, `IS_SPACE`, `IS_STOP`</span></td><td>Token is punctuation, whitespace, stop word</td></tr>\n",
|
||||||
|
"<tr ><td><span >`LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`</span></td><td>Token text resembles a number, URL, email</td></tr>\n",
|
||||||
|
"<tr ><td><span >`POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE`</span></td><td>The token's simple and extended part-of-speech tag, dependency label, lemma, shape</td></tr>\n",
|
||||||
|
"<tr ><td><span >`ENT_TYPE`</span></td><td>The token's entity label</td></tr>\n",
|
||||||
|
"\n",
|
||||||
|
"</table>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Token wildcard\n",
|
||||||
|
"You can pass an empty dictionary `{}` as a wildcard to represent **any token**. For example, you might want to retrieve hashtags without knowing what might follow the `#` character:\n",
|
||||||
|
">`[{'ORTH': '#'}, {}]`"
|
||||||
|
]
|
||||||
|
},
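{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sketch of the wildcard in use, assuming the default English tokenizer splits the `#` character off as its own token (the sample text and the `hashtag_matcher` name are just for illustration):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"hashtag_matcher = Matcher(nlp.vocab)\n",
"\n",
"# '#' followed by any single token\n",
"hashtag_matcher.add('HashTag', None, [{'ORTH': '#'}, {}])\n",
"\n",
"doc_tags = nlp(u'Loving the new panels! #solarpower #cleanenergy')\n",
"\n",
"for match_id, start, end in hashtag_matcher(doc_tags):\n",
"    print(doc_tags[start:end].text)"
]
},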
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"## PhraseMatcher\n",
|
||||||
|
"In the above section we used token patterns to perform rule-based matching. An alternative - and often more efficient - method is to match on terminology lists. In this case we use PhraseMatcher to create a Doc object from a list of phrases, and pass that into `matcher` instead."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Perform standard imports, reset nlp\n",
|
||||||
|
"import spacy\n",
|
||||||
|
"nlp = spacy.load('en_core_web_sm')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 15,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Import the PhraseMatcher library\n",
|
||||||
|
"from spacy.matcher import PhraseMatcher\n",
|
||||||
|
"matcher = PhraseMatcher(nlp.vocab)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"For this exercise we're going to import a Wikipedia article on *Reaganomics*<br>\n",
|
||||||
|
"Source: https://en.wikipedia.org/wiki/Reaganomics"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 16,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"with open('../TextFiles/reaganomics.txt', encoding='utf8') as f:\n",
|
||||||
|
" doc3 = nlp(f.read())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 17,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# First, create a list of match phrases:\n",
|
||||||
|
"phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']\n",
|
||||||
|
"\n",
|
||||||
|
"# Next, convert each phrase to a Doc object:\n",
|
||||||
|
"phrase_patterns = [nlp(text) for text in phrase_list]\n",
|
||||||
|
"\n",
|
||||||
|
"# Pass each Doc object into matcher (note the use of the asterisk!):\n",
|
||||||
|
"matcher.add('VoodooEconomics', None, *phrase_patterns)\n",
|
||||||
|
"\n",
|
||||||
|
"# Build a list of matches:\n",
|
||||||
|
"matches = matcher(doc3)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 18,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[(3473369816841043438, 41, 45),\n",
|
||||||
|
" (3473369816841043438, 49, 53),\n",
|
||||||
|
" (3473369816841043438, 54, 56),\n",
|
||||||
|
" (3473369816841043438, 61, 65),\n",
|
||||||
|
" (3473369816841043438, 673, 677),\n",
|
||||||
|
" (3473369816841043438, 2985, 2989)]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 18,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# (match_id, start, end)\n",
|
||||||
|
"matches"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>The first four matches are where these terms are used in the definition of Reaganomics:</font>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 19,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"REAGANOMICS\n",
|
||||||
|
"https://en.wikipedia.org/wiki/Reaganomics\n",
|
||||||
|
"\n",
|
||||||
|
"Reaganomics (a portmanteau of [Ronald] Reagan and economics attributed to Paul Harvey)[1] refers to the economic policies promoted by U.S. President Ronald Reagan during the 1980s. These policies are commonly associated with supply-side economics, referred to as trickle-down economics or voodoo economics by political opponents, and free-market economics by political advocates.\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 19,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc3[:70]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Viewing Matches\n",
|
||||||
|
"There are a few ways to fetch the text surrounding a match. The simplest is to grab a slice of tokens from the doc that is wider than the match:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 20,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"same time he attracted a following from the supply-side economics movement, which formed in opposition to Keynesian"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 20,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc3[665:685] # Note that the fifth match starts at doc3[673]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 21,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"against institutions.[66] His policies became widely known as \"trickle-down economics\", due to the significant"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 21,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc3[2975:2995] # The sixth match starts at doc3[2985]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Another way is to first apply the `sentencizer` to the Doc, then iterate through the sentences to the match point:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 22,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"0 35\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Build a list of sentences\n",
|
||||||
|
"sents = [sent for sent in doc3.sents]\n",
|
||||||
|
"\n",
|
||||||
|
"# In the next section we'll see that sentences contain start and end token values:\n",
|
||||||
|
"print(sents[0].start, sents[0].end)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 23,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"At the same time he attracted a following from the supply-side economics movement, which formed in opposition to Keynesian demand-stimulus economics.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Iterate over the sentence list until the sentence end value exceeds a match start value:\n",
|
||||||
|
"for sent in sents:\n",
|
||||||
|
" if matches[4][1] < sent.end: # this is the fifth match, that starts at doc3[673]\n",
|
||||||
|
" print(sent)\n",
|
||||||
|
" break"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"For additional information visit https://spacy.io/usage/linguistic-features#section-rule-based-matching\n",
|
||||||
|
"## Next Up: NLP Basics Assessment"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.7"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
@ -0,0 +1,418 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# NLP Basics Assessment"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"For this assessment we'll be using the short story [_An Occurrence at Owl Creek Bridge_](https://en.wikipedia.org/wiki/An_Occurrence_at_Owl_Creek_Bridge) by Ambrose Bierce (1890). <br>The story is in the public domain; the text file was obtained from [Project Gutenberg](https://www.gutenberg.org/ebooks/375.txt.utf-8)."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# RUN THIS CELL to perform standard imports:\n",
|
||||||
|
"import spacy\n",
|
||||||
|
"nlp = spacy.load('en_core_web_sm')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**1. Create a Doc object from the file `owlcreek.txt`**<br>\n",
|
||||||
|
"> HINT: Use `with open('../TextFiles/owlcreek.txt') as f:`"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Enter your code here:\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"AN OCCURRENCE AT OWL CREEK BRIDGE\n",
|
||||||
|
"\n",
|
||||||
|
"by Ambrose Bierce\n",
|
||||||
|
"\n",
|
||||||
|
"I\n",
|
||||||
|
"\n",
|
||||||
|
"A man stood upon a railroad bridge in northern Alabama, looking down\n",
|
||||||
|
"into the swift water twenty feet below. "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Run this cell to verify it worked:\n",
|
||||||
|
"\n",
|
||||||
|
"doc[:36]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**2. How many tokens are contained in the file?**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"4833"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**3. How many sentences are contained in the file?**<br>HINT: You'll want to build a list first!"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"211"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**4. Print the second sentence in the document**<br> HINT: Indexing starts at zero, and the title counts as the first sentence."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"A man stood upon a railroad bridge in northern Alabama, looking down\n",
|
||||||
|
"into the swift water twenty feet below. \n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"** 5. For each token in the sentence above, print its `text`, `POS` tag, `dep` tag and `lemma`<br>\n",
|
||||||
|
"CHALLENGE: Have values line up in columns in the print output.**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"A DET det a\n",
|
||||||
|
"man NOUN nsubj man\n",
|
||||||
|
"stood VERB ROOT stand\n",
|
||||||
|
"upon ADP prep upon\n",
|
||||||
|
"a DET det a\n",
|
||||||
|
"railroad NOUN compound railroad\n",
|
||||||
|
"bridge NOUN pobj bridge\n",
|
||||||
|
"in ADP prep in\n",
|
||||||
|
"northern ADJ amod northern\n",
|
||||||
|
"Alabama PROPN pobj alabama\n",
|
||||||
|
", PUNCT punct ,\n",
|
||||||
|
"looking VERB advcl look\n",
|
||||||
|
"down PART prt down\n",
|
||||||
|
"\n",
|
||||||
|
" SPACE \n",
|
||||||
|
"\n",
|
||||||
|
"into ADP prep into\n",
|
||||||
|
"the DET det the\n",
|
||||||
|
"swift ADJ amod swift\n",
|
||||||
|
"water NOUN pobj water\n",
|
||||||
|
"twenty NUM nummod twenty\n",
|
||||||
|
"feet NOUN npadvmod foot\n",
|
||||||
|
"below ADV advmod below\n",
|
||||||
|
". PUNCT punct .\n",
|
||||||
|
" SPACE \n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# NORMAL SOLUTION:\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"A DET det a \n",
|
||||||
|
"man NOUN nsubj man \n",
|
||||||
|
"stood VERB ROOT stand \n",
|
||||||
|
"upon ADP prep upon \n",
|
||||||
|
"a DET det a \n",
|
||||||
|
"railroad NOUN compound railroad \n",
|
||||||
|
"bridge NOUN pobj bridge \n",
|
||||||
|
"in ADP prep in \n",
|
||||||
|
"northern ADJ amod northern \n",
|
||||||
|
"Alabama PROPN pobj alabama \n",
|
||||||
|
", PUNCT punct , \n",
|
||||||
|
"looking VERB advcl look \n",
|
||||||
|
"down PART prt down \n",
|
||||||
|
"\n",
|
||||||
|
" SPACE \n",
|
||||||
|
" \n",
|
||||||
|
"into ADP prep into \n",
|
||||||
|
"the DET det the \n",
|
||||||
|
"swift ADJ amod swift \n",
|
||||||
|
"water NOUN pobj water \n",
|
||||||
|
"twenty NUM nummod twenty \n",
|
||||||
|
"feet NOUN npadvmod foot \n",
|
||||||
|
"below ADV advmod below \n",
|
||||||
|
". PUNCT punct . \n",
|
||||||
|
" SPACE \n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# CHALLENGE SOLUTION:\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**6. Write a matcher called 'Swimming' that finds both occurrences of the phrase \"swimming vigorously\" in the text**<br>\n",
|
||||||
|
"HINT: You should include an `'IS_SPACE': True` pattern between the two words!"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Import the Matcher library:\n",
|
||||||
|
"\n",
|
||||||
|
"from spacy.matcher import Matcher\n",
|
||||||
|
"matcher = Matcher(nlp.vocab)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Create a pattern and add it to matcher:\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[(12881893835109366681, 1274, 1277), (12881893835109366681, 3607, 3610)]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Create a list of matches called \"found_matches\" and print the list:\n",
|
||||||
|
"\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**7. Print the text surrounding each found match**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"By diving I could evade the bullets and, swimming\n",
|
||||||
|
"vigorously, reach the bank, take to the woods and get away home\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"over his shoulder; he was now swimming\n",
|
||||||
|
"vigorously with the current. \n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**EXTRA CREDIT:<br>Print the *sentence* that contains each found match**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 18,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"By diving I could evade the bullets and, swimming\n",
|
||||||
|
"vigorously, reach the bank, take to the woods and get away home. \n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 19,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"The hunted man saw all this over his shoulder; he was now swimming\n",
|
||||||
|
"vigorously with the current. \n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Great Job!"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
@ -0,0 +1,435 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# NLP Basics Assessment - Solutions"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"For this assessment we'll be using the short story [_An Occurrence at Owl Creek Bridge_](https://en.wikipedia.org/wiki/An_Occurrence_at_Owl_Creek_Bridge) by Ambrose Bierce (1890). <br>The story is in the public domain; the text file was obtained from [Project Gutenberg](https://www.gutenberg.org/ebooks/375.txt.utf-8)."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# RUN THIS CELL to perform standard imports:\n",
|
||||||
|
"import spacy\n",
|
||||||
|
"nlp = spacy.load('en_core_web_sm')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**1. Create a Doc object from the file `owlcreek.txt`**<br>\n",
|
||||||
|
"> HINT: Use `with open('../TextFiles/owlcreek.txt') as f:`"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Enter your code here:\n",
|
||||||
|
"\n",
|
||||||
|
"with open('../TextFiles/owlcreek.txt') as f:\n",
|
||||||
|
" doc = nlp(f.read())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"AN OCCURRENCE AT OWL CREEK BRIDGE\n",
|
||||||
|
"\n",
|
||||||
|
"by Ambrose Bierce\n",
|
||||||
|
"\n",
|
||||||
|
"I\n",
|
||||||
|
"\n",
|
||||||
|
"A man stood upon a railroad bridge in northern Alabama, looking down\n",
|
||||||
|
"into the swift water twenty feet below. "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Run this cell to verify it worked:\n",
|
||||||
|
"\n",
|
||||||
|
"doc[:36]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**2. How many tokens are contained in the file?**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"4833"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"len(doc)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**3. How many sentences are contained in the file?**<br>HINT: You'll want to build a list first!"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"211"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"sents = [sent for sent in doc.sents]\n",
|
||||||
|
"len(sents)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**4. Print the second sentence in the document**<br> HINT: Indexing starts at zero, and the title counts as the first sentence."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"A man stood upon a railroad bridge in northern Alabama, looking down\n",
|
||||||
|
"into the swift water twenty feet below. \n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(sents[1].text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"** 5. For each token in the sentence above, print its `text`, `POS` tag, `dep` tag and `lemma`<br>\n",
|
||||||
|
"CHALLENGE: Have values line up in columns in the print output.**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"A DET det a\n",
|
||||||
|
"man NOUN nsubj man\n",
|
||||||
|
"stood VERB ROOT stand\n",
|
||||||
|
"upon ADP prep upon\n",
|
||||||
|
"a DET det a\n",
|
||||||
|
"railroad NOUN compound railroad\n",
|
||||||
|
"bridge NOUN pobj bridge\n",
|
||||||
|
"in ADP prep in\n",
|
||||||
|
"northern ADJ amod northern\n",
|
||||||
|
"Alabama PROPN pobj alabama\n",
|
||||||
|
", PUNCT punct ,\n",
|
||||||
|
"looking VERB advcl look\n",
|
||||||
|
"down PART prt down\n",
|
||||||
|
"\n",
|
||||||
|
" SPACE \n",
|
||||||
|
"\n",
|
||||||
|
"into ADP prep into\n",
|
||||||
|
"the DET det the\n",
|
||||||
|
"swift ADJ amod swift\n",
|
||||||
|
"water NOUN pobj water\n",
|
||||||
|
"twenty NUM nummod twenty\n",
|
||||||
|
"feet NOUN npadvmod foot\n",
|
||||||
|
"below ADV advmod below\n",
|
||||||
|
". PUNCT punct .\n",
|
||||||
|
" SPACE \n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# NORMAL SOLUTION:\n",
|
||||||
|
"for token in sents[1]:\n",
|
||||||
|
" print(token.text, token.pos_, token.dep_, token.lemma_)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"A DET det a \n",
|
||||||
|
"man NOUN nsubj man \n",
|
||||||
|
"stood VERB ROOT stand \n",
|
||||||
|
"upon ADP prep upon \n",
|
||||||
|
"a DET det a \n",
|
||||||
|
"railroad NOUN compound railroad \n",
|
||||||
|
"bridge NOUN pobj bridge \n",
|
||||||
|
"in ADP prep in \n",
|
||||||
|
"northern ADJ amod northern \n",
|
||||||
|
"Alabama PROPN pobj alabama \n",
|
||||||
|
", PUNCT punct , \n",
|
||||||
|
"looking VERB advcl look \n",
|
||||||
|
"down PART prt down \n",
|
||||||
|
"\n",
|
||||||
|
" SPACE \n",
|
||||||
|
" \n",
|
||||||
|
"into ADP prep into \n",
|
||||||
|
"the DET det the \n",
|
||||||
|
"swift ADJ amod swift \n",
|
||||||
|
"water NOUN pobj water \n",
|
||||||
|
"twenty NUM nummod twenty \n",
|
||||||
|
"feet NOUN npadvmod foot \n",
|
||||||
|
"below ADV advmod below \n",
|
||||||
|
". PUNCT punct . \n",
|
||||||
|
" SPACE \n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# CHALLENGE SOLUTION:\n",
|
||||||
|
"for token in sents[1]:\n",
|
||||||
|
" print(f'{token.text:{15}} {token.pos_:{5}} {token.dep_:{10}} {token.lemma_:{15}}')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**6. Write a matcher called 'Swimming' that finds both occurrences of the phrase \"swimming vigorously\" in the text**<br>\n",
|
||||||
|
"HINT: You should include an `'IS_SPACE': True` pattern between the two words!"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Import the Matcher library:\n",
|
||||||
|
"\n",
|
||||||
|
"from spacy.matcher import Matcher\n",
|
||||||
|
"matcher = Matcher(nlp.vocab)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Create a pattern and add it to matcher:\n",
|
||||||
|
"\n",
|
||||||
|
"pattern = [{'LOWER': 'swimming'}, {'IS_SPACE': True, 'OP':'*'}, {'LOWER': 'vigorously'}]\n",
|
||||||
|
"\n",
|
||||||
|
"matcher.add('Swimming', None, pattern)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[(12881893835109366681, 1274, 1277), (12881893835109366681, 3607, 3610)]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Create a list of matches called \"found_matches\" and print the list:\n",
|
||||||
|
"\n",
|
||||||
|
"found_matches = matcher(doc)\n",
|
||||||
|
"print(found_matches)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**7. Print the text surrounding each found match**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"By diving I could evade the bullets and, swimming\n",
|
||||||
|
"vigorously, reach the bank, take to the woods and get away home\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(doc[1265:1290])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"over his shoulder; he was now swimming\n",
|
||||||
|
"vigorously with the current. \n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(doc[3600:3615])"
|
||||||
|
]
|
||||||
|
},
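Both cells above use hard-coded token offsets. As a more general variant (a minimal sketch that assumes the earlier cells of this notebook have already run, so `doc` and `found_matches` exist), the offsets can be read straight out of the match tuples:

# Assumes `doc` (owlcreek.txt) and `found_matches` from the cells above
for match_id, start, end in found_matches:
    # widen the window by a few tokens on each side, clamped to the Doc bounds
    span = doc[max(start - 8, 0):min(end + 8, len(doc))]
    print(span.text, '\n')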
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**EXTRA CREDIT:<br>Print the *sentence* that contains each found match**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 18,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"By diving I could evade the bullets and, swimming\n",
|
||||||
|
"vigorously, reach the bank, take to the woods and get away home. \n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for sent in sents:\n",
|
||||||
|
" if found_matches[0][1] < sent.end:\n",
|
||||||
|
" print(sent)\n",
|
||||||
|
" break"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 19,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"The hunted man saw all this over his shoulder; he was now swimming\n",
|
||||||
|
"vigorously with the current. \n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for sent in sents:\n",
|
||||||
|
" if found_matches[1][1] < sent.end:\n",
|
||||||
|
" print(sent)\n",
|
||||||
|
" break"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Great Job!"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
@ -0,0 +1,529 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Part of Speech Basics\n",
|
||||||
|
"The challenge of correctly identifying parts of speech is summed up nicely in the [spaCy docs](https://spacy.io/usage/linguistic-features):\n",
|
||||||
|
"<div class=\"alert alert-info\" style=\"margin: 20px\">Processing raw text intelligently is difficult: most words are rare, and it's common for words that look completely different to mean almost the same thing. The same words in a different order can mean something completely different. Even splitting text into useful word-like units can be difficult in many languages. While it's possible to solve some problems starting from only the raw characters, it's usually better to use linguistic knowledge to add useful information. That's exactly what spaCy is designed to do: you put in raw text, and get back a **Doc** object, that comes with a variety of annotations.</div>\n",
|
||||||
|
"In this section we'll take a closer look at coarse POS tags (noun, verb, adjective) and fine-grained tags (plural noun, past-tense verb, superlative adjective)."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Perform standard imports\n",
|
||||||
|
"import spacy\n",
|
||||||
|
"nlp = spacy.load('en_core_web_sm')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Create a simple Doc object\n",
|
||||||
|
"doc = nlp(u\"The quick brown fox jumped over the lazy dog's back.\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## View token tags\n",
|
||||||
|
"Recall that you can obtain a particular token by its index position.\n",
|
||||||
|
"* To view the coarse POS tag use `token.pos_`\n",
|
||||||
|
"* To view the fine-grained tag use `token.tag_`\n",
|
||||||
|
"* To view the description of either type of tag use `spacy.explain(tag)`\n",
|
||||||
|
"\n",
|
||||||
|
"<div class=\"alert alert-success\">Note that `token.pos` and `token.tag` return integer hash values; by adding the underscores we get the text equivalent that lives in **doc.vocab**.</div>"
|
||||||
|
]
|
||||||
|
},
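As a quick illustration of the note above (a minimal sketch using the `doc` created earlier), the attributes without the underscore are integer IDs that can be looked up in the vocabulary:

# token.pos is an integer ID; token.pos_ is its text form
word = doc[4]                    # 'jumped'
print(word.pos, word.pos_)       # the integer value can vary between spaCy versions
print(doc.vocab[word.pos].text)  # looking the ID up in the vocab returns 'VERB'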
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"The quick brown fox jumped over the lazy dog's back.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Print the full text:\n",
|
||||||
|
"print(doc.text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"jumped VERB VBD verb, past tense\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Print the fifth word and associated tags:\n",
|
||||||
|
"print(doc[4].text, doc[4].pos_, doc[4].tag_, spacy.explain(doc[4].tag_))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"We can apply this technique to the entire Doc object:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"The DET DT determiner\n",
|
||||||
|
"quick ADJ JJ adjective\n",
|
||||||
|
"brown ADJ JJ adjective\n",
|
||||||
|
"fox NOUN NN noun, singular or mass\n",
|
||||||
|
"jumped VERB VBD verb, past tense\n",
|
||||||
|
"over ADP IN conjunction, subordinating or preposition\n",
|
||||||
|
"the DET DT determiner\n",
|
||||||
|
"lazy ADJ JJ adjective\n",
|
||||||
|
"dog NOUN NN noun, singular or mass\n",
|
||||||
|
"'s PART POS possessive ending\n",
|
||||||
|
"back NOUN NN noun, singular or mass\n",
|
||||||
|
". PUNCT . punctuation mark, sentence closer\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for token in doc:\n",
|
||||||
|
" print(f'{token.text:{10}} {token.pos_:{8}} {token.tag_:{6}} {spacy.explain(token.tag_)}')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"## Coarse-grained Part-of-speech Tags\n",
|
||||||
|
"Every token is assigned a POS Tag from the following list:\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"<table><tr><th>POS</th><th>DESCRIPTION</th><th>EXAMPLES</th></tr>\n",
|
||||||
|
" \n",
|
||||||
|
"<tr><td>ADJ</td><td>adjective</td><td>*big, old, green, incomprehensible, first*</td></tr>\n",
|
||||||
|
"<tr><td>ADP</td><td>adposition</td><td>*in, to, during*</td></tr>\n",
|
||||||
|
"<tr><td>ADV</td><td>adverb</td><td>*very, tomorrow, down, where, there*</td></tr>\n",
|
||||||
|
"<tr><td>AUX</td><td>auxiliary</td><td>*is, has (done), will (do), should (do)*</td></tr>\n",
|
||||||
|
"<tr><td>CONJ</td><td>conjunction</td><td>*and, or, but*</td></tr>\n",
|
||||||
|
"<tr><td>CCONJ</td><td>coordinating conjunction</td><td>*and, or, but*</td></tr>\n",
|
||||||
|
"<tr><td>DET</td><td>determiner</td><td>*a, an, the*</td></tr>\n",
|
||||||
|
"<tr><td>INTJ</td><td>interjection</td><td>*psst, ouch, bravo, hello*</td></tr>\n",
|
||||||
|
"<tr><td>NOUN</td><td>noun</td><td>*girl, cat, tree, air, beauty*</td></tr>\n",
|
||||||
|
"<tr><td>NUM</td><td>numeral</td><td>*1, 2017, one, seventy-seven, IV, MMXIV*</td></tr>\n",
|
||||||
|
"<tr><td>PART</td><td>particle</td><td>*'s, not,*</td></tr>\n",
|
||||||
|
"<tr><td>PRON</td><td>pronoun</td><td>*I, you, he, she, myself, themselves, somebody*</td></tr>\n",
|
||||||
|
"<tr><td>PROPN</td><td>proper noun</td><td>*Mary, John, London, NATO, HBO*</td></tr>\n",
|
||||||
|
"<tr><td>PUNCT</td><td>punctuation</td><td>*., (, ), ?*</td></tr>\n",
|
||||||
|
"<tr><td>SCONJ</td><td>subordinating conjunction</td><td>*if, while, that*</td></tr>\n",
|
||||||
|
"<tr><td>SYM</td><td>symbol</td><td>*$, %, §, ©, +, −, ×, ÷, =, :), 😝*</td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td>verb</td><td>*run, runs, running, eat, ate, eating*</td></tr>\n",
|
||||||
|
"<tr><td>X</td><td>other</td><td>*sfpksdpsxmsa*</td></tr>\n",
|
||||||
|
"<tr><td>SPACE</td><td>space</td></tr>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"## Fine-grained Part-of-speech Tags\n",
|
||||||
|
"Tokens are subsequently given a fine-grained tag as determined by morphology:\n",
|
||||||
|
"<table>\n",
|
||||||
|
"<tr><th>POS</th><th>Description</th><th>Fine-grained Tag</th><th>Description</th><th>Morphology</th></tr>\n",
|
||||||
|
"<tr><td>ADJ</td><td>adjective</td><td>AFX</td><td>affix</td><td>Hyph=yes</td></tr>\n",
|
||||||
|
"<tr><td>ADJ</td><td></td><td>JJ</td><td>adjective</td><td>Degree=pos</td></tr>\n",
|
||||||
|
"<tr><td>ADJ</td><td></td><td>JJR</td><td>adjective, comparative</td><td>Degree=comp</td></tr>\n",
|
||||||
|
"<tr><td>ADJ</td><td></td><td>JJS</td><td>adjective, superlative</td><td>Degree=sup</td></tr>\n",
|
||||||
|
"<tr><td>ADJ</td><td></td><td>PDT</td><td>predeterminer</td><td>AdjType=pdt PronType=prn</td></tr>\n",
|
||||||
|
"<tr><td>ADJ</td><td></td><td>PRP\\$</td><td>pronoun, possessive</td><td>PronType=prs Poss=yes</td></tr>\n",
|
||||||
|
"<tr><td>ADJ</td><td></td><td>WDT</td><td>wh-determiner</td><td>PronType=int rel</td></tr>\n",
|
||||||
|
"<tr><td>ADJ</td><td></td><td>WP\\$</td><td>wh-pronoun, possessive</td><td>Poss=yes PronType=int rel</td></tr>\n",
|
||||||
|
"<tr><td>ADP</td><td>adposition</td><td>IN</td><td>conjunction, subordinating or preposition</td><td></td></tr>\n",
|
||||||
|
"<tr><td>ADV</td><td>adverb</td><td>EX</td><td>existential there</td><td>AdvType=ex</td></tr>\n",
|
||||||
|
"<tr><td>ADV</td><td></td><td>RB</td><td>adverb</td><td>Degree=pos</td></tr>\n",
|
||||||
|
"<tr><td>ADV</td><td></td><td>RBR</td><td>adverb, comparative</td><td>Degree=comp</td></tr>\n",
|
||||||
|
"<tr><td>ADV</td><td></td><td>RBS</td><td>adverb, superlative</td><td>Degree=sup</td></tr>\n",
|
||||||
|
"<tr><td>ADV</td><td></td><td>WRB</td><td>wh-adverb</td><td>PronType=int rel</td></tr>\n",
|
||||||
|
"<tr><td>CONJ</td><td>conjunction</td><td>CC</td><td>conjunction, coordinating</td><td>ConjType=coor</td></tr>\n",
|
||||||
|
"<tr><td>DET</td><td>determiner</td><td>DT</td><td>determiner</td><td></td></tr>\n",
|
||||||
|
"<tr><td>INTJ</td><td>interjection</td><td>UH</td><td>interjection</td><td></td></tr>\n",
|
||||||
|
"<tr><td>NOUN</td><td>noun</td><td>NN</td><td>noun, singular or mass</td><td>Number=sing</td></tr>\n",
|
||||||
|
"<tr><td>NOUN</td><td></td><td>NNS</td><td>noun, plural</td><td>Number=plur</td></tr>\n",
|
||||||
|
"<tr><td>NOUN</td><td></td><td>WP</td><td>wh-pronoun, personal</td><td>PronType=int rel</td></tr>\n",
|
||||||
|
"<tr><td>NUM</td><td>numeral</td><td>CD</td><td>cardinal number</td><td>NumType=card</td></tr>\n",
|
||||||
|
"<tr><td>PART</td><td>particle</td><td>POS</td><td>possessive ending</td><td>Poss=yes</td></tr>\n",
|
||||||
|
"<tr><td>PART</td><td></td><td>RP</td><td>adverb, particle</td><td></td></tr>\n",
|
||||||
|
"<tr><td>PART</td><td></td><td>TO</td><td>infinitival to</td><td>PartType=inf VerbForm=inf</td></tr>\n",
|
||||||
|
"<tr><td>PRON</td><td>pronoun</td><td>PRP</td><td>pronoun, personal</td><td>PronType=prs</td></tr>\n",
|
||||||
|
"<tr><td>PROPN</td><td>proper noun</td><td>NNP</td><td>noun, proper singular</td><td>NounType=prop Number=sign</td></tr>\n",
|
||||||
|
"<tr><td>PROPN</td><td></td><td>NNPS</td><td>noun, proper plural</td><td>NounType=prop Number=plur</td></tr>\n",
|
||||||
|
"<tr><td>PUNCT</td><td>punctuation</td><td>-LRB-</td><td>left round bracket</td><td>PunctType=brck PunctSide=ini</td></tr>\n",
|
||||||
|
"<tr><td>PUNCT</td><td></td><td>-RRB-</td><td>right round bracket</td><td>PunctType=brck PunctSide=fin</td></tr>\n",
|
||||||
|
"<tr><td>PUNCT</td><td></td><td>,</td><td>punctuation mark, comma</td><td>PunctType=comm</td></tr>\n",
|
||||||
|
"<tr><td>PUNCT</td><td></td><td>:</td><td>punctuation mark, colon or ellipsis</td><td></td></tr>\n",
|
||||||
|
"<tr><td>PUNCT</td><td></td><td>.</td><td>punctuation mark, sentence closer</td><td>PunctType=peri</td></tr>\n",
|
||||||
|
"<tr><td>PUNCT</td><td></td><td>''</td><td>closing quotation mark</td><td>PunctType=quot PunctSide=fin</td></tr>\n",
|
||||||
|
"<tr><td>PUNCT</td><td></td><td>\"\"</td><td>closing quotation mark</td><td>PunctType=quot PunctSide=fin</td></tr>\n",
|
||||||
|
"<tr><td>PUNCT</td><td></td><td>``</td><td>opening quotation mark</td><td>PunctType=quot PunctSide=ini</td></tr>\n",
|
||||||
|
"<tr><td>PUNCT</td><td></td><td>HYPH</td><td>punctuation mark, hyphen</td><td>PunctType=dash</td></tr>\n",
|
||||||
|
"<tr><td>PUNCT</td><td></td><td>LS</td><td>list item marker</td><td>NumType=ord</td></tr>\n",
|
||||||
|
"<tr><td>PUNCT</td><td></td><td>NFP</td><td>superfluous punctuation</td><td></td></tr>\n",
|
||||||
|
"<tr><td>SYM</td><td>symbol</td><td>#</td><td>symbol, number sign</td><td>SymType=numbersign</td></tr>\n",
|
||||||
|
"<tr><td>SYM</td><td></td><td>\\$</td><td>symbol, currency</td><td>SymType=currency</td></tr>\n",
|
||||||
|
"<tr><td>SYM</td><td></td><td>SYM</td><td>symbol</td><td></td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td>verb</td><td>BES</td><td>auxiliary \"be\"</td><td></td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td></td><td>HVS</td><td>forms of \"have\"</td><td></td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td></td><td>MD</td><td>verb, modal auxiliary</td><td>VerbType=mod</td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td></td><td>VB</td><td>verb, base form</td><td>VerbForm=inf</td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td></td><td>VBD</td><td>verb, past tense</td><td>VerbForm=fin Tense=past</td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td></td><td>VBG</td><td>verb, gerund or present participle</td><td>VerbForm=part Tense=pres Aspect=prog</td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td></td><td>VBN</td><td>verb, past participle</td><td>VerbForm=part Tense=past Aspect=perf</td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td></td><td>VBP</td><td>verb, non-3rd person singular present</td><td>VerbForm=fin Tense=pres</td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td></td><td>VBZ</td><td>verb, 3rd person singular present</td><td>VerbForm=fin Tense=pres Number=sing Person=3</td></tr>\n",
|
||||||
|
"<tr><td>X</td><td>other</td><td>ADD</td><td>email</td><td></td></tr>\n",
|
||||||
|
"<tr><td>X</td><td></td><td>FW</td><td>foreign word</td><td>Foreign=yes</td></tr>\n",
|
||||||
|
"<tr><td>X</td><td></td><td>GW</td><td>additional word in multi-word expression</td><td></td></tr>\n",
|
||||||
|
"<tr><td>X</td><td></td><td>XX</td><td>unknown</td><td></td></tr>\n",
|
||||||
|
"<tr><td>SPACE</td><td>space</td><td>_SP</td><td>space</td><td></td></tr>\n",
|
||||||
|
"<tr><td></td><td></td><td>NIL</td><td>missing tag</td><td></td></tr>\n",
|
||||||
|
"</table>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"For a current list of tags for all languages visit https://spacy.io/api/annotation#pos-tagging"
|
||||||
|
]
|
||||||
|
},
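`spacy.explain()` accepts any of the tag strings in the table above, so a tag can be decoded without leaving the notebook:

# Decode a coarse tag and a fine-grained tag:
print(spacy.explain('ADP'))   # adposition
print(spacy.explain('VBD'))   # verb, past tense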
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Working with POS Tags\n",
|
||||||
|
"In the English language, the same string of characters can have different meanings, even within the same sentence. For this reason, morphology is important. **spaCy** uses machine learning algorithms to best predict the use of a token in a sentence. Is *\"I read books on NLP\"* present or past tense? Is *wind* a verb or a noun?"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"read VERB VBP verb, non-3rd person singular present\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc = nlp(u'I read books on NLP.')\n",
|
||||||
|
"r = doc[1]\n",
|
||||||
|
"\n",
|
||||||
|
"print(f'{r.text:{10}} {r.pos_:{8}} {r.tag_:{6}} {spacy.explain(r.tag_)}')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"read VERB VBD verb, past tense\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc = nlp(u'I read a book on NLP.')\n",
|
||||||
|
"r = doc[1]\n",
|
||||||
|
"\n",
|
||||||
|
"print(f'{r.text:{10}} {r.pos_:{8}} {r.tag_:{6}} {spacy.explain(r.tag_)}')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"In the first example, with no other cues to work from, spaCy assumed that ***read*** was present tense.<br>In the second example the present tense form would be ***I am reading a book***, so spaCy assigned the past tense."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Counting POS Tags\n",
|
||||||
|
"The `Doc.count_by()` method accepts a specific token attribute as its argument, and returns a frequency count of the given attribute as a dictionary object. Keys in the dictionary are the integer values of the given attribute ID, and values are the frequency. Counts of zero are not included."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{83: 3, 84: 1, 89: 2, 91: 3, 93: 1, 96: 1, 99: 1}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc = nlp(u\"The quick brown fox jumped over the lazy dog's back.\")\n",
|
||||||
|
"\n",
|
||||||
|
"# Count the frequencies of different coarse-grained POS tags:\n",
|
||||||
|
"POS_counts = doc.count_by(spacy.attrs.POS)\n",
|
||||||
|
"POS_counts"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"This isn't very helpful until you decode the attribute ID:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'ADJ'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc.vocab[83].text"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Create a frequency list of POS tags from the entire document\n",
|
||||||
|
"Since `POS_counts` returns a dictionary, we can obtain a list of keys with `POS_counts.items()`.<br>By sorting the list we have access to the tag and its count, in order."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"83. ADJ : 3\n",
|
||||||
|
"84. ADP : 1\n",
|
||||||
|
"89. DET : 2\n",
|
||||||
|
"91. NOUN : 3\n",
|
||||||
|
"93. PART : 1\n",
|
||||||
|
"96. PUNCT: 1\n",
|
||||||
|
"99. VERB : 1\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for k,v in sorted(POS_counts.items()):\n",
|
||||||
|
" print(f'{k}. {doc.vocab[k].text:{5}}: {v}')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"74. POS : 1\n",
|
||||||
|
"1292078113972184607. IN : 1\n",
|
||||||
|
"10554686591937588953. JJ : 3\n",
|
||||||
|
"12646065887601541794. . : 1\n",
|
||||||
|
"15267657372422890137. DT : 2\n",
|
||||||
|
"15308085513773655218. NN : 3\n",
|
||||||
|
"17109001835818727656. VBD : 1\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Count the different fine-grained tags:\n",
|
||||||
|
"TAG_counts = doc.count_by(spacy.attrs.TAG)\n",
|
||||||
|
"\n",
|
||||||
|
"for k,v in sorted(TAG_counts.items()):\n",
|
||||||
|
" print(f'{k}. {doc.vocab[k].text:{4}}: {v}')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<div class=\"alert alert-success\">**Why did the ID numbers get so big?** In spaCy, certain text values are hardcoded into `Doc.vocab` and take up the first several hundred ID numbers. Strings like 'NOUN' and 'VERB' are used frequently by internal operations. Others, like fine-grained tags, are assigned hash values as needed.</div>\n",
|
||||||
|
"<div class=\"alert alert-success\">**Why don't SPACE tags appear?** In spaCy, only strings of spaces (two or more) are assigned tokens. Single spaces are not.</div>"
|
||||||
|
]
|
||||||
|
},
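A small check of that last note (a sketch on a made-up sentence): single spaces are absorbed as trailing whitespace of the preceding token, while runs of two or more spaces surface as their own SPACE/_SP tokens:

# Only multi-space runs produce SPACE tokens:
spaced = nlp(u'Single spaces vanish,  double  spaces  do  not.')
for token in spaced:
    print(f'{token.text!r:{12}} {token.pos_:{6}} {token.tag_}')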
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"399. amod: 3\n",
|
||||||
|
"412. det : 2\n",
|
||||||
|
"426. nsubj: 1\n",
|
||||||
|
"436. pobj: 1\n",
|
||||||
|
"437. poss: 1\n",
|
||||||
|
"440. prep: 1\n",
|
||||||
|
"442. punct: 1\n",
|
||||||
|
"8110129090154140942. case: 1\n",
|
||||||
|
"8206900633647566924. ROOT: 1\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Count the different dependencies:\n",
|
||||||
|
"DEP_counts = doc.count_by(spacy.attrs.DEP)\n",
|
||||||
|
"\n",
|
||||||
|
"for k,v in sorted(DEP_counts.items()):\n",
|
||||||
|
" print(f'{k}. {doc.vocab[k].text:{4}}: {v}')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Here we've shown `spacy.attrs.POS`, `spacy.attrs.TAG` and `spacy.attrs.DEP`.<br>Refer back to the **Vocabulary and Matching** lecture from the previous section for a table of **Other token attributes**."
|
||||||
|
]
|
||||||
|
},
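Any of the other integer token attributes can be counted the same way; for example (a brief sketch), lemma frequencies:

# Count lemma frequencies with the same count_by() pattern:
LEMMA_counts = doc.count_by(spacy.attrs.LEMMA)

for k,v in sorted(LEMMA_counts.items()):
    print(f'{doc.vocab[k].text:{8}}: {v}')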
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"## Fine-grained POS Tag Examples\n",
|
||||||
|
"These are some grammatical examples (shown in **bold**) of specific fine-grained tags. We've removed punctuation and rarely used tags:\n",
|
||||||
|
"<table>\n",
|
||||||
|
"<tr><th>POS</th><th>TAG</th><th>DESCRIPTION</th><th>EXAMPLE</th></tr>\n",
|
||||||
|
"<tr><td>ADJ</td><td>AFX</td><td>affix</td><td>The Flintstones were a **pre**-historic family.</td></tr>\n",
|
||||||
|
"<tr><td>ADJ</td><td>JJ</td><td>adjective</td><td>This is a **good** sentence.</td></tr>\n",
|
||||||
|
"<tr><td>ADJ</td><td>JJR</td><td>adjective, comparative</td><td>This is a **better** sentence.</td></tr>\n",
|
||||||
|
"<tr><td>ADJ</td><td>JJS</td><td>adjective, superlative</td><td>This is the **best** sentence.</td></tr>\n",
|
||||||
|
"<tr><td>ADJ</td><td>PDT</td><td>predeterminer</td><td>Waking up is **half** the battle.</td></tr>\n",
|
||||||
|
"<tr><td>ADJ</td><td>PRP\\$</td><td>pronoun, possessive</td><td>**His** arm hurts.</td></tr>\n",
|
||||||
|
"<tr><td>ADJ</td><td>WDT</td><td>wh-determiner</td><td>It's blue, **which** is odd.</td></tr>\n",
|
||||||
|
"<tr><td>ADJ</td><td>WP\\$</td><td>wh-pronoun, possessive</td><td>We don't know **whose** it is.</td></tr>\n",
|
||||||
|
"<tr><td>ADP</td><td>IN</td><td>conjunction, subordinating or preposition</td><td>It arrived **in** a box.</td></tr>\n",
|
||||||
|
"<tr><td>ADV</td><td>EX</td><td>existential there</td><td>**There** is cake.</td></tr>\n",
|
||||||
|
"<tr><td>ADV</td><td>RB</td><td>adverb</td><td>He ran **quickly**.</td></tr>\n",
|
||||||
|
"<tr><td>ADV</td><td>RBR</td><td>adverb, comparative</td><td>He ran **quicker**.</td></tr>\n",
|
||||||
|
"<tr><td>ADV</td><td>RBS</td><td>adverb, superlative</td><td>He ran **fastest**.</td></tr>\n",
|
||||||
|
"<tr><td>ADV</td><td>WRB</td><td>wh-adverb</td><td>**When** was that?</td></tr>\n",
|
||||||
|
"<tr><td>CONJ</td><td>CC</td><td>conjunction, coordinating</td><td>The balloon popped **and** everyone jumped.</td></tr>\n",
|
||||||
|
"<tr><td>DET</td><td>DT</td><td>determiner</td><td>**This** is **a** sentence.</td></tr>\n",
|
||||||
|
"<tr><td>INTJ</td><td>UH</td><td>interjection</td><td>**Um**, I don't know.</td></tr>\n",
|
||||||
|
"<tr><td>NOUN</td><td>NN</td><td>noun, singular or mass</td><td>This is a **sentence**.</td></tr>\n",
|
||||||
|
"<tr><td>NOUN</td><td>NNS</td><td>noun, plural</td><td>These are **words**.</td></tr>\n",
|
||||||
|
"<tr><td>NOUN</td><td>WP</td><td>wh-pronoun, personal</td><td>**Who** was that?</td></tr>\n",
|
||||||
|
"<tr><td>NUM</td><td>CD</td><td>cardinal number</td><td>I want **three** things.</td></tr>\n",
|
||||||
|
"<tr><td>PART</td><td>POS</td><td>possessive ending</td><td>Fred**'s** name is short.</td></tr>\n",
|
||||||
|
"<tr><td>PART</td><td>RP</td><td>adverb, particle</td><td>Put it **back**!</td></tr>\n",
|
||||||
|
"<tr><td>PART</td><td>TO</td><td>infinitival to</td><td>I want **to** go.</td></tr>\n",
|
||||||
|
"<tr><td>PRON</td><td>PRP</td><td>pronoun, personal</td><td>**I** want **you** to go.</td></tr>\n",
|
||||||
|
"<tr><td>PROPN</td><td>NNP</td><td>noun, proper singular</td><td>**Kilroy** was here.</td></tr>\n",
|
||||||
|
"<tr><td>PROPN</td><td>NNPS</td><td>noun, proper plural</td><td>The **Flintstones** were a pre-historic family.</td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td>MD</td><td>verb, modal auxiliary</td><td>This **could** work.</td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td>VB</td><td>verb, base form</td><td>I want to **go**.</td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td>VBD</td><td>verb, past tense</td><td>This **was** a sentence.</td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td>VBG</td><td>verb, gerund or present participle</td><td>I am **going**.</td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td>VBN</td><td>verb, past participle</td><td>The treasure was **lost**.</td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td>VBP</td><td>verb, non-3rd person singular present</td><td>I **want** to go.</td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td>VBZ</td><td>verb, 3rd person singular present</td><td>He **wants** to go.</td></tr>\n",
|
||||||
|
"</table>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Up Next: Visualizing POS"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.7"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
@ -0,0 +1,199 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# POS Challenge - OPTIONAL\n",
|
||||||
|
"Just for fun, we've developed a game to test your knowledge of Part of Speech tags. The object of the game is to write a body of text that contains as many different ** *fine-grained tags* ** as possible. The highest possible score is 100 (or thereabouts). Points are awarded for the number of unique tags used, and for the fewest possible tokens used. Below is an example. Feel free to post your results in the Q&A Forum for this lecture, and good luck!"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Perform standard imports\n",
|
||||||
|
"import spacy\n",
|
||||||
|
"nlp = spacy.load('en_core_web_sm')\n",
|
||||||
|
"\n",
|
||||||
|
"# Import the game script\n",
|
||||||
|
"import game"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Enter your text here:\n",
|
||||||
|
"text = u\"The quick brown fox jumped over the lazy dog's back.\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Unique tags: 7\n",
|
||||||
|
"Tokens used: 12\n",
|
||||||
|
"SCORE: 9\n",
|
||||||
|
"CONGRATULATIONS!\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Make your Doc object and pass it into the scorer:\n",
|
||||||
|
"doc = nlp(text)\n",
|
||||||
|
"print(game.scorer(doc))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"TOKEN COARSE FINE DESCRIPTION\n",
|
||||||
|
"----- ------ ---- -----------\n",
|
||||||
|
"The DET DT determiner\n",
|
||||||
|
"quick ADJ JJ adjective\n",
|
||||||
|
"brown ADJ JJ adjective\n",
|
||||||
|
"fox NOUN NN noun, singular or mass\n",
|
||||||
|
"jumped VERB VBD verb, past tense\n",
|
||||||
|
"over ADP IN conjunction, subordinating or preposition\n",
|
||||||
|
"the DET DT determiner\n",
|
||||||
|
"lazy ADJ JJ adjective\n",
|
||||||
|
"dog NOUN NN noun, singular or mass\n",
|
||||||
|
"'s PART POS possessive ending\n",
|
||||||
|
"back NOUN NN noun, singular or mass\n",
|
||||||
|
". PUNCT . punctuation mark, sentence closer\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# For practice, visualize your fine-grained POS tags (shown in the third column):\n",
|
||||||
|
"print(f\"{'TOKEN':{10}} {'COARSE':{8}} {'FINE':{6}} {'DESCRIPTION'}\")\n",
|
||||||
|
"print(f\"{'-----':{10}} {'------':{8}} {'----':{6}} {'-----------'}\")\n",
|
||||||
|
"\n",
|
||||||
|
"for token in doc:\n",
|
||||||
|
" print(f'{token.text:{10}} {token.pos_:{8}} {token.tag_:{6}} {spacy.explain(token.tag_)}')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"## Fine-grained Part-of-speech Tags\n",
|
||||||
|
"\n",
|
||||||
|
"<table>\n",
|
||||||
|
"<tr><th></th><th>Coarse POS Tag</th><th>Description</th><th>Fine-grained Tag</th><th>Description</th><th>Morphology</th></tr>\n",
|
||||||
|
"<tr><td>1.</td><td>ADJ</td><td>adjective</td><td>AFX</td><td>affix</td><td>Hyph=yes</td></tr>\n",
|
||||||
|
"<tr><td>2.</td><td>ADJ</td><td></td><td>JJ</td><td>adjective</td><td>Degree=pos</td></tr>\n",
|
||||||
|
"<tr><td>3.</td><td>ADJ</td><td></td><td>JJR</td><td>adjective, comparative</td><td>Degree=comp</td></tr>\n",
|
||||||
|
"<tr><td>4.</td><td>ADJ</td><td></td><td>JJS</td><td>adjective, superlative</td><td>Degree=sup</td></tr>\n",
|
||||||
|
"<tr><td>5.</td><td>ADJ</td><td></td><td>PDT</td><td>predeterminer</td><td>AdjType=pdt PronType=prn</td></tr>\n",
|
||||||
|
"<tr><td>6.</td><td>ADJ</td><td></td><td>PRP\\$</td><td>pronoun, possessive</td><td>PronType=prs Poss=yes</td></tr>\n",
|
||||||
|
"<tr><td>7.</td><td>ADJ</td><td></td><td>WDT</td><td>wh-determiner</td><td>PronType=int rel</td></tr>\n",
|
||||||
|
"<tr><td>8.</td><td>ADJ</td><td></td><td>WP\\$</td><td>wh-pronoun, possessive</td><td>Poss=yes PronType=int rel</td></tr>\n",
|
||||||
|
"<tr><td>9.</td><td>ADP</td><td>adposition</td><td>IN</td><td>conjunction, subordinating or preposition</td><td></td></tr>\n",
|
||||||
|
"<tr><td>10.</td><td>ADV</td><td>adverb</td><td>EX</td><td>existential there</td><td>AdvType=ex</td></tr>\n",
|
||||||
|
"<tr><td>11.</td><td>ADV</td><td></td><td>RB</td><td>adverb</td><td>Degree=pos</td></tr>\n",
|
||||||
|
"<tr><td>12.</td><td>ADV</td><td></td><td>RBR</td><td>adverb, comparative</td><td>Degree=comp</td></tr>\n",
|
||||||
|
"<tr><td>13.</td><td>ADV</td><td></td><td>RBS</td><td>adverb, superlative</td><td>Degree=sup</td></tr>\n",
|
||||||
|
"<tr><td>14.</td><td>ADV</td><td></td><td>WRB</td><td>wh-adverb</td><td>PronType=int rel</td></tr>\n",
|
||||||
|
"<tr><td>15.</td><td>CONJ</td><td>conjunction</td><td>CC</td><td>conjunction, coordinating</td><td>ConjType=coor</td></tr>\n",
|
||||||
|
"<tr><td>16.</td><td>DET</td><td>determiner</td><td>DT</td><td></td><td>determiner</td></tr>\n",
|
||||||
|
"<tr><td>17.</td><td>INTJ</td><td>interjection</td><td>UH</td><td>interjection</td><td></td></tr>\n",
|
||||||
|
"<tr><td>18.</td><td>NOUN</td><td>noun</td><td>NN</td><td>noun, singular or mass</td><td>Number=sing</td></tr>\n",
|
||||||
|
"<tr><td>19.</td><td>NOUN</td><td></td><td>NNS</td><td>noun, plural</td><td>Number=plur</td></tr>\n",
|
||||||
|
"<tr><td>20.</td><td>NOUN</td><td></td><td>WP</td><td>wh-pronoun, personal</td><td>PronType=int rel</td></tr>\n",
|
||||||
|
"<tr><td>21.</td><td>NUM</td><td>numeral</td><td>CD</td><td>cardinal number</td><td>NumType=card</td></tr>\n",
|
||||||
|
"<tr><td>22.</td><td>PART</td><td>particle</td><td>POS</td><td>possessive ending</td><td>Poss=yes</td></tr>\n",
|
||||||
|
"<tr><td>23.</td><td>PART</td><td></td><td>RP</td><td>adverb, particle</td><td></td></tr>\n",
|
||||||
|
"<tr><td>24.</td><td>PART</td><td></td><td>TO</td><td>infinitival to</td><td>PartType=inf VerbForm=inf</td></tr>\n",
|
||||||
|
"<tr><td>25.</td><td>PRON</td><td>pronoun</td><td>PRP</td><td>pronoun, personal</td><td>PronType=prs</td></tr>\n",
|
||||||
|
"<tr><td>26.</td><td>PROPN</td><td>proper noun</td><td>NNP</td><td>noun, proper singular</td><td>NounType=prop Number=sign</td></tr>\n",
|
||||||
|
"<tr><td>27.</td><td>PROPN</td><td></td><td>NNPS</td><td>noun, proper plural</td><td>NounType=prop Number=plur</td></tr>\n",
|
||||||
|
"<tr><td>28.</td><td>PUNCT</td><td>punctuation</td><td>-LRB-</td><td>left round bracket</td><td>PunctType=brck PunctSide=ini</td></tr>\n",
|
||||||
|
"<tr><td>29.</td><td>PUNCT</td><td></td><td>-RRB-</td><td>right round bracket</td><td>PunctType=brck PunctSide=fin</td></tr>\n",
|
||||||
|
"<tr><td>30.</td><td>PUNCT</td><td></td><td>,</td><td>punctuation mark, comma</td><td>PunctType=comm</td></tr>\n",
|
||||||
|
"<tr><td>31.</td><td>PUNCT</td><td></td><td>:</td><td>punctuation mark, colon or ellipsis</td><td></td></tr>\n",
|
||||||
|
"<tr><td>32.</td><td>PUNCT</td><td></td><td>.</td><td>punctuation mark, sentence closer</td><td>PunctType=peri</td></tr>\n",
|
||||||
|
"<tr><td>33.</td><td>PUNCT</td><td></td><td>''</td><td>closing quotation mark</td><td>PunctType=quot PunctSide=fin</td></tr>\n",
|
||||||
|
"<tr><td>34.</td><td>PUNCT</td><td></td><td>\"\"</td><td>closing quotation mark</td><td>PunctType=quot PunctSide=fin</td></tr>\n",
|
||||||
|
"<tr><td>35.</td><td>PUNCT</td><td></td><td>``</td><td>opening quotation mark</td><td>PunctType=quot PunctSide=ini</td></tr>\n",
|
||||||
|
"<tr><td>36.</td><td>PUNCT</td><td></td><td>HYPH</td><td>punctuation mark, hyphen</td><td>PunctType=dash</td></tr>\n",
|
||||||
|
"<tr><td>37.</td><td>PUNCT</td><td></td><td>LS</td><td>list item marker</td><td>NumType=ord</td></tr>\n",
|
||||||
|
"<tr><td>38.</td><td>PUNCT</td><td></td><td>NFP</td><td>superfluous punctuation</td><td></td></tr>\n",
|
||||||
|
"<tr><td>39.</td><td>SYM</td><td>symbol</td><td>#</td><td>symbol, number sign</td><td>SymType=numbersign</td></tr>\n",
|
||||||
|
"<tr><td>40.</td><td>SYM</td><td></td><td>\\$</td><td>symbol, currency</td><td>SymType=currency</td></tr>\n",
|
||||||
|
"<tr><td>41.</td><td>SYM</td><td></td><td>SYM</td><td>symbol</td><td></td></tr>\n",
|
||||||
|
"<tr><td>42.</td><td>VERB</td><td>verb</td><td>BES</td><td>auxiliary \"be\"</td><td></td></tr>\n",
|
||||||
|
"<tr><td>43.</td><td>VERB</td><td></td><td>HVS</td><td>forms of \"have\"</td><td></td></tr>\n",
|
||||||
|
"<tr><td>44.</td><td>VERB</td><td></td><td>MD</td><td>verb, modal auxiliary</td><td>VerbType=mod</td></tr>\n",
|
||||||
|
"<tr><td>45.</td><td>VERB</td><td></td><td>VB</td><td>verb, base form</td><td>VerbForm=inf</td></tr>\n",
|
||||||
|
"<tr><td>46.</td><td>VERB</td><td></td><td>VBD</td><td>verb, past tense</td><td>VerbForm=fin Tense=past</td></tr>\n",
|
||||||
|
"<tr><td>47.</td><td>VERB</td><td></td><td>VBG</td><td>verb, gerund or present participle</td><td>VerbForm=part Tense=pres Aspect=prog</td></tr>\n",
|
||||||
|
"<tr><td>48.</td><td>VERB</td><td></td><td>VBN</td><td>verb, past participle</td><td>VerbForm=part Tense=past Aspect=perf</td></tr>\n",
|
||||||
|
"<tr><td>49.</td><td>VERB</td><td></td><td>VBP</td><td>verb, non-3rd person singular present</td><td>VerbForm=fin Tense=pres</td></tr>\n",
|
||||||
|
"<tr><td>50.</td><td>VERB</td><td></td><td>VBZ</td><td>verb, 3rd person singular present</td><td>VerbForm=fin Tense=pres Number=sing Person=3</td></tr>\n",
|
||||||
|
"<tr><td>51.</td><td>X</td><td>other</td><td>ADD</td><td>email</td><td></td></tr>\n",
|
||||||
|
"<tr><td>52.</td><td>X</td><td></td><td>FW</td><td>foreign word</td><td>Foreign=yes</td></tr>\n",
|
||||||
|
"<tr><td>53.</td><td>X</td><td></td><td>GW</td><td>additional word in multi-word expression</td><td></td></tr>\n",
|
||||||
|
"<tr><td>54.</td><td>X</td><td></td><td>XX</td><td>unknown</td><td></td></tr>\n",
|
||||||
|
"<tr><td>55.</td><td>SPACE</td><td>space</td><td>_SP</td><td>space</td><td></td></tr>\n",
|
||||||
|
"<tr><td>56.</td><td></td><td></td><td>NIL</td><td>missing tag</td><td></td></tr>\n",
|
||||||
|
"</table>"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
@ -0,0 +1,10 @@
|
|||||||
|
def scorer(doc):
    # All fine-grained POS tags recognized by spaCy's English models:
    tags = ['AFX', 'JJ', 'JJR', 'JJS', 'PDT', 'PRP$', 'WDT', 'WP$', 'IN', 'EX', 'RB', 'RBR', 'RBS', 'WRB', 'CC', 'DT', 'UH', 'NN', 'NNS', 'WP', 'CD', 'POS', 'RP', 'TO', 'PRP', 'NNP', 'NNPS', '-LRB-', '-RRB-', ',', ':', '.', "''", '""', '``', 'HYPH', 'LS', 'NFP', '_SP', '#', '$', 'SYM', 'BES', 'HVS', 'MD', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'ADD', 'FW', 'GW', 'XX', 'NIL']

    # Award one point per unique tag that appears anywhere in the Doc:
    counter = 0
    for tag in tags:
        for token in doc:
            if token.tag_ == tag:
                counter += 1
                break

    # Reward tag variety and penalize length, but never score below the unique-tag count:
    score = max(counter*3 - len(doc), counter)
    return f'Unique tags: {counter}\nTokens used: {len(doc)}\nSCORE: {score}\nCONGRATULATIONS!'
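A quick way to exercise the scorer outside the notebook is a small standalone script. This is a minimal sketch: it assumes the file above is saved as game.py next to the script, that the en_core_web_sm model is installed, and the sample sentence (and whatever score it earns) is purely illustrative.

import spacy
from game import scorer

nlp = spacy.load('en_core_web_sm')

# Any text can be scored; unique tags are rewarded, extra tokens are penalized
doc = nlp(u"Wow, could Fred's three quick foxes have jumped higher than the laziest dogs?")
print(scorer(doc))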
|
||||||
@ -0,0 +1,529 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Part of Speech Basics\n",
|
||||||
|
"The challenge of correctly identifying parts of speech is summed up nicely in the [spaCy docs](https://spacy.io/usage/linguistic-features):\n",
|
||||||
|
"<div class=\"alert alert-info\" style=\"margin: 20px\">Processing raw text intelligently is difficult: most words are rare, and it's common for words that look completely different to mean almost the same thing. The same words in a different order can mean something completely different. Even splitting text into useful word-like units can be difficult in many languages. While it's possible to solve some problems starting from only the raw characters, it's usually better to use linguistic knowledge to add useful information. That's exactly what spaCy is designed to do: you put in raw text, and get back a **Doc** object, that comes with a variety of annotations.</div>\n",
|
||||||
|
"In this section we'll take a closer look at coarse POS tags (noun, verb, adjective) and fine-grained tags (plural noun, past-tense verb, superlative adjective)."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Perform standard imports\n",
|
||||||
|
"import spacy\n",
|
||||||
|
"nlp = spacy.load('en_core_web_sm')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Create a simple Doc object\n",
|
||||||
|
"doc = nlp(u\"The quick brown fox jumped over the lazy dog's back.\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## View token tags\n",
|
||||||
|
"Recall that you can obtain a particular token by its index position.\n",
|
||||||
|
"* To view the coarse POS tag use `token.pos_`\n",
|
||||||
|
"* To view the fine-grained tag use `token.tag_`\n",
|
||||||
|
"* To view the description of either type of tag use `spacy.explain(tag)`\n",
|
||||||
|
"\n",
|
||||||
|
"<div class=\"alert alert-success\">Note that `token.pos` and `token.tag` return integer hash values; by adding the underscores we get the text equivalent that lives in **doc.vocab**.</div>"
|
||||||
|
]
|
||||||
|
},
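As a minimal sketch of that note (reusing the doc built above; the exact integer IDs vary between spaCy versions and models):

# The plain attributes are integer hash values, the underscored attributes are readable strings:
print(doc[4].pos, doc[4].pos_)   # e.g. 99 VERB
print(doc[4].tag, doc[4].tag_)   # e.g. 17109001835818727656 VBD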
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"The quick brown fox jumped over the lazy dog's back.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Print the full text:\n",
|
||||||
|
"print(doc.text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"jumped VERB VBD verb, past tense\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Print the fifth word and associated tags:\n",
|
||||||
|
"print(doc[4].text, doc[4].pos_, doc[4].tag_, spacy.explain(doc[4].tag_))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"We can apply this technique to the entire Doc object:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"The DET DT determiner\n",
|
||||||
|
"quick ADJ JJ adjective\n",
|
||||||
|
"brown ADJ JJ adjective\n",
|
||||||
|
"fox NOUN NN noun, singular or mass\n",
|
||||||
|
"jumped VERB VBD verb, past tense\n",
|
||||||
|
"over ADP IN conjunction, subordinating or preposition\n",
|
||||||
|
"the DET DT determiner\n",
|
||||||
|
"lazy ADJ JJ adjective\n",
|
||||||
|
"dog NOUN NN noun, singular or mass\n",
|
||||||
|
"'s PART POS possessive ending\n",
|
||||||
|
"back NOUN NN noun, singular or mass\n",
|
||||||
|
". PUNCT . punctuation mark, sentence closer\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for token in doc:\n",
|
||||||
|
" print(f'{token.text:{10}} {token.pos_:{8}} {token.tag_:{6}} {spacy.explain(token.tag_)}')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"## Coarse-grained Part-of-speech Tags\n",
|
||||||
|
"Every token is assigned a POS Tag from the following list:\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"<table><tr><th>POS</th><th>DESCRIPTION</th><th>EXAMPLES</th></tr>\n",
|
||||||
|
" \n",
|
||||||
|
"<tr><td>ADJ</td><td>adjective</td><td>*big, old, green, incomprehensible, first*</td></tr>\n",
|
||||||
|
"<tr><td>ADP</td><td>adposition</td><td>*in, to, during*</td></tr>\n",
|
||||||
|
"<tr><td>ADV</td><td>adverb</td><td>*very, tomorrow, down, where, there*</td></tr>\n",
|
||||||
|
"<tr><td>AUX</td><td>auxiliary</td><td>*is, has (done), will (do), should (do)*</td></tr>\n",
|
||||||
|
"<tr><td>CONJ</td><td>conjunction</td><td>*and, or, but*</td></tr>\n",
|
||||||
|
"<tr><td>CCONJ</td><td>coordinating conjunction</td><td>*and, or, but*</td></tr>\n",
|
||||||
|
"<tr><td>DET</td><td>determiner</td><td>*a, an, the*</td></tr>\n",
|
||||||
|
"<tr><td>INTJ</td><td>interjection</td><td>*psst, ouch, bravo, hello*</td></tr>\n",
|
||||||
|
"<tr><td>NOUN</td><td>noun</td><td>*girl, cat, tree, air, beauty*</td></tr>\n",
|
||||||
|
"<tr><td>NUM</td><td>numeral</td><td>*1, 2017, one, seventy-seven, IV, MMXIV*</td></tr>\n",
|
||||||
|
"<tr><td>PART</td><td>particle</td><td>*'s, not,*</td></tr>\n",
|
||||||
|
"<tr><td>PRON</td><td>pronoun</td><td>*I, you, he, she, myself, themselves, somebody*</td></tr>\n",
|
||||||
|
"<tr><td>PROPN</td><td>proper noun</td><td>*Mary, John, London, NATO, HBO*</td></tr>\n",
|
||||||
|
"<tr><td>PUNCT</td><td>punctuation</td><td>*., (, ), ?*</td></tr>\n",
|
||||||
|
"<tr><td>SCONJ</td><td>subordinating conjunction</td><td>*if, while, that*</td></tr>\n",
|
||||||
|
"<tr><td>SYM</td><td>symbol</td><td>*$, %, §, ©, +, −, ×, ÷, =, :), 😝*</td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td>verb</td><td>*run, runs, running, eat, ate, eating*</td></tr>\n",
|
||||||
|
"<tr><td>X</td><td>other</td><td>*sfpksdpsxmsa*</td></tr>\n",
|
||||||
|
"<tr><td>SPACE</td><td>space</td></tr>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"## Fine-grained Part-of-speech Tags\n",
|
||||||
|
"Tokens are subsequently given a fine-grained tag as determined by morphology:\n",
|
||||||
|
"<table>\n",
|
||||||
|
"<tr><th>POS</th><th>Description</th><th>Fine-grained Tag</th><th>Description</th><th>Morphology</th></tr>\n",
|
||||||
|
"<tr><td>ADJ</td><td>adjective</td><td>AFX</td><td>affix</td><td>Hyph=yes</td></tr>\n",
|
||||||
|
"<tr><td>ADJ</td><td></td><td>JJ</td><td>adjective</td><td>Degree=pos</td></tr>\n",
|
||||||
|
"<tr><td>ADJ</td><td></td><td>JJR</td><td>adjective, comparative</td><td>Degree=comp</td></tr>\n",
|
||||||
|
"<tr><td>ADJ</td><td></td><td>JJS</td><td>adjective, superlative</td><td>Degree=sup</td></tr>\n",
|
||||||
|
"<tr><td>ADJ</td><td></td><td>PDT</td><td>predeterminer</td><td>AdjType=pdt PronType=prn</td></tr>\n",
|
||||||
|
"<tr><td>ADJ</td><td></td><td>PRP\\$</td><td>pronoun, possessive</td><td>PronType=prs Poss=yes</td></tr>\n",
|
||||||
|
"<tr><td>ADJ</td><td></td><td>WDT</td><td>wh-determiner</td><td>PronType=int rel</td></tr>\n",
|
||||||
|
"<tr><td>ADJ</td><td></td><td>WP\\$</td><td>wh-pronoun, possessive</td><td>Poss=yes PronType=int rel</td></tr>\n",
|
||||||
|
"<tr><td>ADP</td><td>adposition</td><td>IN</td><td>conjunction, subordinating or preposition</td><td></td></tr>\n",
|
||||||
|
"<tr><td>ADV</td><td>adverb</td><td>EX</td><td>existential there</td><td>AdvType=ex</td></tr>\n",
|
||||||
|
"<tr><td>ADV</td><td></td><td>RB</td><td>adverb</td><td>Degree=pos</td></tr>\n",
|
||||||
|
"<tr><td>ADV</td><td></td><td>RBR</td><td>adverb, comparative</td><td>Degree=comp</td></tr>\n",
|
||||||
|
"<tr><td>ADV</td><td></td><td>RBS</td><td>adverb, superlative</td><td>Degree=sup</td></tr>\n",
|
||||||
|
"<tr><td>ADV</td><td></td><td>WRB</td><td>wh-adverb</td><td>PronType=int rel</td></tr>\n",
|
||||||
|
"<tr><td>CONJ</td><td>conjunction</td><td>CC</td><td>conjunction, coordinating</td><td>ConjType=coor</td></tr>\n",
|
||||||
|
"<tr><td>DET</td><td>determiner</td><td>DT</td><td>determiner</td><td></td></tr>\n",
|
||||||
|
"<tr><td>INTJ</td><td>interjection</td><td>UH</td><td>interjection</td><td></td></tr>\n",
|
||||||
|
"<tr><td>NOUN</td><td>noun</td><td>NN</td><td>noun, singular or mass</td><td>Number=sing</td></tr>\n",
|
||||||
|
"<tr><td>NOUN</td><td></td><td>NNS</td><td>noun, plural</td><td>Number=plur</td></tr>\n",
|
||||||
|
"<tr><td>NOUN</td><td></td><td>WP</td><td>wh-pronoun, personal</td><td>PronType=int rel</td></tr>\n",
|
||||||
|
"<tr><td>NUM</td><td>numeral</td><td>CD</td><td>cardinal number</td><td>NumType=card</td></tr>\n",
|
||||||
|
"<tr><td>PART</td><td>particle</td><td>POS</td><td>possessive ending</td><td>Poss=yes</td></tr>\n",
|
||||||
|
"<tr><td>PART</td><td></td><td>RP</td><td>adverb, particle</td><td></td></tr>\n",
|
||||||
|
"<tr><td>PART</td><td></td><td>TO</td><td>infinitival to</td><td>PartType=inf VerbForm=inf</td></tr>\n",
|
||||||
|
"<tr><td>PRON</td><td>pronoun</td><td>PRP</td><td>pronoun, personal</td><td>PronType=prs</td></tr>\n",
|
||||||
|
"<tr><td>PROPN</td><td>proper noun</td><td>NNP</td><td>noun, proper singular</td><td>NounType=prop Number=sign</td></tr>\n",
|
||||||
|
"<tr><td>PROPN</td><td></td><td>NNPS</td><td>noun, proper plural</td><td>NounType=prop Number=plur</td></tr>\n",
|
||||||
|
"<tr><td>PUNCT</td><td>punctuation</td><td>-LRB-</td><td>left round bracket</td><td>PunctType=brck PunctSide=ini</td></tr>\n",
|
||||||
|
"<tr><td>PUNCT</td><td></td><td>-RRB-</td><td>right round bracket</td><td>PunctType=brck PunctSide=fin</td></tr>\n",
|
||||||
|
"<tr><td>PUNCT</td><td></td><td>,</td><td>punctuation mark, comma</td><td>PunctType=comm</td></tr>\n",
|
||||||
|
"<tr><td>PUNCT</td><td></td><td>:</td><td>punctuation mark, colon or ellipsis</td><td></td></tr>\n",
|
||||||
|
"<tr><td>PUNCT</td><td></td><td>.</td><td>punctuation mark, sentence closer</td><td>PunctType=peri</td></tr>\n",
|
||||||
|
"<tr><td>PUNCT</td><td></td><td>''</td><td>closing quotation mark</td><td>PunctType=quot PunctSide=fin</td></tr>\n",
|
||||||
|
"<tr><td>PUNCT</td><td></td><td>\"\"</td><td>closing quotation mark</td><td>PunctType=quot PunctSide=fin</td></tr>\n",
|
||||||
|
"<tr><td>PUNCT</td><td></td><td>``</td><td>opening quotation mark</td><td>PunctType=quot PunctSide=ini</td></tr>\n",
|
||||||
|
"<tr><td>PUNCT</td><td></td><td>HYPH</td><td>punctuation mark, hyphen</td><td>PunctType=dash</td></tr>\n",
|
||||||
|
"<tr><td>PUNCT</td><td></td><td>LS</td><td>list item marker</td><td>NumType=ord</td></tr>\n",
|
||||||
|
"<tr><td>PUNCT</td><td></td><td>NFP</td><td>superfluous punctuation</td><td></td></tr>\n",
|
||||||
|
"<tr><td>SYM</td><td>symbol</td><td>#</td><td>symbol, number sign</td><td>SymType=numbersign</td></tr>\n",
|
||||||
|
"<tr><td>SYM</td><td></td><td>\\$</td><td>symbol, currency</td><td>SymType=currency</td></tr>\n",
|
||||||
|
"<tr><td>SYM</td><td></td><td>SYM</td><td>symbol</td><td></td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td>verb</td><td>BES</td><td>auxiliary \"be\"</td><td></td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td></td><td>HVS</td><td>forms of \"have\"</td><td></td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td></td><td>MD</td><td>verb, modal auxiliary</td><td>VerbType=mod</td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td></td><td>VB</td><td>verb, base form</td><td>VerbForm=inf</td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td></td><td>VBD</td><td>verb, past tense</td><td>VerbForm=fin Tense=past</td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td></td><td>VBG</td><td>verb, gerund or present participle</td><td>VerbForm=part Tense=pres Aspect=prog</td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td></td><td>VBN</td><td>verb, past participle</td><td>VerbForm=part Tense=past Aspect=perf</td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td></td><td>VBP</td><td>verb, non-3rd person singular present</td><td>VerbForm=fin Tense=pres</td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td></td><td>VBZ</td><td>verb, 3rd person singular present</td><td>VerbForm=fin Tense=pres Number=sing Person=3</td></tr>\n",
|
||||||
|
"<tr><td>X</td><td>other</td><td>ADD</td><td>email</td><td></td></tr>\n",
|
||||||
|
"<tr><td>X</td><td></td><td>FW</td><td>foreign word</td><td>Foreign=yes</td></tr>\n",
|
||||||
|
"<tr><td>X</td><td></td><td>GW</td><td>additional word in multi-word expression</td><td></td></tr>\n",
|
||||||
|
"<tr><td>X</td><td></td><td>XX</td><td>unknown</td><td></td></tr>\n",
|
||||||
|
"<tr><td>SPACE</td><td>space</td><td>_SP</td><td>space</td><td></td></tr>\n",
|
||||||
|
"<tr><td></td><td></td><td>NIL</td><td>missing tag</td><td></td></tr>\n",
|
||||||
|
"</table>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"For a current list of tags for all languages visit https://spacy.io/api/annotation#pos-tagging"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Working with POS Tags\n",
|
||||||
|
"In the English language, the same string of characters can have different meanings, even within the same sentence. For this reason, morphology is important. **spaCy** uses machine learning algorithms to best predict the use of a token in a sentence. Is *\"I read books on NLP\"* present or past tense? Is *wind* a verb or a noun?"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"read VERB VBP verb, non-3rd person singular present\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc = nlp(u'I read books on NLP.')\n",
|
||||||
|
"r = doc[1]\n",
|
||||||
|
"\n",
|
||||||
|
"print(f'{r.text:{10}} {r.pos_:{8}} {r.tag_:{6}} {spacy.explain(r.tag_)}')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"read VERB VBD verb, past tense\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc = nlp(u'I read a book on NLP.')\n",
|
||||||
|
"r = doc[1]\n",
|
||||||
|
"\n",
|
||||||
|
"print(f'{r.text:{10}} {r.pos_:{8}} {r.tag_:{6}} {spacy.explain(r.tag_)}')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"In the first example, with no other cues to work from, spaCy assumed that ***read*** was present tense.<br>In the second example the present tense form would be ***I am reading a book***, so spaCy assigned the past tense."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Counting POS Tags\n",
|
||||||
|
"The `Doc.count_by()` method accepts a specific token attribute as its argument, and returns a frequency count of the given attribute as a dictionary object. Keys in the dictionary are the integer values of the given attribute ID, and values are the frequency. Counts of zero are not included."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{83: 3, 84: 1, 89: 2, 91: 3, 93: 1, 96: 1, 99: 1}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc = nlp(u\"The quick brown fox jumped over the lazy dog's back.\")\n",
|
||||||
|
"\n",
|
||||||
|
"# Count the frequencies of different coarse-grained POS tags:\n",
|
||||||
|
"POS_counts = doc.count_by(spacy.attrs.POS)\n",
|
||||||
|
"POS_counts"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"This isn't very helpful until you decode the attribute ID:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'ADJ'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc.vocab[83].text"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Create a frequency list of POS tags from the entire document\n",
|
||||||
|
"Since `POS_counts` returns a dictionary, we can obtain a list of keys with `POS_counts.items()`.<br>By sorting the list we have access to the tag and its count, in order."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"83. ADJ : 3\n",
|
||||||
|
"84. ADP : 1\n",
|
||||||
|
"89. DET : 2\n",
|
||||||
|
"91. NOUN : 3\n",
|
||||||
|
"93. PART : 1\n",
|
||||||
|
"96. PUNCT: 1\n",
|
||||||
|
"99. VERB : 1\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for k,v in sorted(POS_counts.items()):\n",
|
||||||
|
" print(f'{k}. {doc.vocab[k].text:{5}}: {v}')"
|
||||||
|
]
|
||||||
|
},
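If you prefer the counts keyed by the tag text itself, a small dict comprehension performs the same decoding in one step (a sketch using the doc and POS_counts from above):

# Decode every attribute ID to its text and pair it with its frequency:
{doc.vocab[k].text: v for k, v in POS_counts.items()}
# {'ADJ': 3, 'ADP': 1, 'DET': 2, 'NOUN': 3, 'PART': 1, 'PUNCT': 1, 'VERB': 1}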
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"74. POS : 1\n",
|
||||||
|
"1292078113972184607. IN : 1\n",
|
||||||
|
"10554686591937588953. JJ : 3\n",
|
||||||
|
"12646065887601541794. . : 1\n",
|
||||||
|
"15267657372422890137. DT : 2\n",
|
||||||
|
"15308085513773655218. NN : 3\n",
|
||||||
|
"17109001835818727656. VBD : 1\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Count the different fine-grained tags:\n",
|
||||||
|
"TAG_counts = doc.count_by(spacy.attrs.TAG)\n",
|
||||||
|
"\n",
|
||||||
|
"for k,v in sorted(TAG_counts.items()):\n",
|
||||||
|
" print(f'{k}. {doc.vocab[k].text:{4}}: {v}')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<div class=\"alert alert-success\">**Why did the ID numbers get so big?** In spaCy, certain text values are hardcoded into `Doc.vocab` and take up the first several hundred ID numbers. Strings like 'NOUN' and 'VERB' are used frequently by internal operations. Others, like fine-grained tags, are assigned hash values as needed.</div>\n",
|
||||||
|
"<div class=\"alert alert-success\">**Why don't SPACE tags appear?** In spaCy, only strings of spaces (two or more) are assigned tokens. Single spaces are not.</div>"
|
||||||
|
]
|
||||||
|
},
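The fine-grained tag IDs are ordinary hashes held in the shared string store, so you can translate in either direction. A minimal sketch follows; the large integer is the one reported for NN in the output above and will differ between spaCy versions:

# Text -> hash and hash -> text through the vocab's StringStore:
print(doc.vocab.strings['NN'])                    # 15308085513773655218
print(doc.vocab.strings[15308085513773655218])    # 'NN'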
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"399. amod: 3\n",
|
||||||
|
"412. det : 2\n",
|
||||||
|
"426. nsubj: 1\n",
|
||||||
|
"436. pobj: 1\n",
|
||||||
|
"437. poss: 1\n",
|
||||||
|
"440. prep: 1\n",
|
||||||
|
"442. punct: 1\n",
|
||||||
|
"8110129090154140942. case: 1\n",
|
||||||
|
"8206900633647566924. ROOT: 1\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Count the different dependencies:\n",
|
||||||
|
"DEP_counts = doc.count_by(spacy.attrs.DEP)\n",
|
||||||
|
"\n",
|
||||||
|
"for k,v in sorted(DEP_counts.items()):\n",
|
||||||
|
" print(f'{k}. {doc.vocab[k].text:{4}}: {v}')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Here we've shown `spacy.attrs.POS`, `spacy.attrs.TAG` and `spacy.attrs.DEP`.<br>Refer back to the **Vocabulary and Matching** lecture from the previous section for a table of **Other token attributes**."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"## Fine-grained POS Tag Examples\n",
|
||||||
|
"These are some grammatical examples (shown in **bold**) of specific fine-grained tags. We've removed punctuation and rarely used tags:\n",
|
||||||
|
"<table>\n",
|
||||||
|
"<tr><th>POS</th><th>TAG</th><th>DESCRIPTION</th><th>EXAMPLE</th></tr>\n",
|
||||||
|
"<tr><td>ADJ</td><td>AFX</td><td>affix</td><td>The Flintstones were a **pre**-historic family.</td></tr>\n",
|
||||||
|
"<tr><td>ADJ</td><td>JJ</td><td>adjective</td><td>This is a **good** sentence.</td></tr>\n",
|
||||||
|
"<tr><td>ADJ</td><td>JJR</td><td>adjective, comparative</td><td>This is a **better** sentence.</td></tr>\n",
|
||||||
|
"<tr><td>ADJ</td><td>JJS</td><td>adjective, superlative</td><td>This is the **best** sentence.</td></tr>\n",
|
||||||
|
"<tr><td>ADJ</td><td>PDT</td><td>predeterminer</td><td>Waking up is **half** the battle.</td></tr>\n",
|
||||||
|
"<tr><td>ADJ</td><td>PRP\\$</td><td>pronoun, possessive</td><td>**His** arm hurts.</td></tr>\n",
|
||||||
|
"<tr><td>ADJ</td><td>WDT</td><td>wh-determiner</td><td>It's blue, **which** is odd.</td></tr>\n",
|
||||||
|
"<tr><td>ADJ</td><td>WP\\$</td><td>wh-pronoun, possessive</td><td>We don't know **whose** it is.</td></tr>\n",
|
||||||
|
"<tr><td>ADP</td><td>IN</td><td>conjunction, subordinating or preposition</td><td>It arrived **in** a box.</td></tr>\n",
|
||||||
|
"<tr><td>ADV</td><td>EX</td><td>existential there</td><td>**There** is cake.</td></tr>\n",
|
||||||
|
"<tr><td>ADV</td><td>RB</td><td>adverb</td><td>He ran **quickly**.</td></tr>\n",
|
||||||
|
"<tr><td>ADV</td><td>RBR</td><td>adverb, comparative</td><td>He ran **quicker**.</td></tr>\n",
|
||||||
|
"<tr><td>ADV</td><td>RBS</td><td>adverb, superlative</td><td>He ran **fastest**.</td></tr>\n",
|
||||||
|
"<tr><td>ADV</td><td>WRB</td><td>wh-adverb</td><td>**When** was that?</td></tr>\n",
|
||||||
|
"<tr><td>CONJ</td><td>CC</td><td>conjunction, coordinating</td><td>The balloon popped **and** everyone jumped.</td></tr>\n",
|
||||||
|
"<tr><td>DET</td><td>DT</td><td>determiner</td><td>**This** is **a** sentence.</td></tr>\n",
|
||||||
|
"<tr><td>INTJ</td><td>UH</td><td>interjection</td><td>**Um**, I don't know.</td></tr>\n",
|
||||||
|
"<tr><td>NOUN</td><td>NN</td><td>noun, singular or mass</td><td>This is a **sentence**.</td></tr>\n",
|
||||||
|
"<tr><td>NOUN</td><td>NNS</td><td>noun, plural</td><td>These are **words**.</td></tr>\n",
|
||||||
|
"<tr><td>NOUN</td><td>WP</td><td>wh-pronoun, personal</td><td>**Who** was that?</td></tr>\n",
|
||||||
|
"<tr><td>NUM</td><td>CD</td><td>cardinal number</td><td>I want **three** things.</td></tr>\n",
|
||||||
|
"<tr><td>PART</td><td>POS</td><td>possessive ending</td><td>Fred**'s** name is short.</td></tr>\n",
|
||||||
|
"<tr><td>PART</td><td>RP</td><td>adverb, particle</td><td>Put it **back**!</td></tr>\n",
|
||||||
|
"<tr><td>PART</td><td>TO</td><td>infinitival to</td><td>I want **to** go.</td></tr>\n",
|
||||||
|
"<tr><td>PRON</td><td>PRP</td><td>pronoun, personal</td><td>**I** want **you** to go.</td></tr>\n",
|
||||||
|
"<tr><td>PROPN</td><td>NNP</td><td>noun, proper singular</td><td>**Kilroy** was here.</td></tr>\n",
|
||||||
|
"<tr><td>PROPN</td><td>NNPS</td><td>noun, proper plural</td><td>The **Flintstones** were a pre-historic family.</td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td>MD</td><td>verb, modal auxiliary</td><td>This **could** work.</td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td>VB</td><td>verb, base form</td><td>I want to **go**.</td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td>VBD</td><td>verb, past tense</td><td>This **was** a sentence.</td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td>VBG</td><td>verb, gerund or present participle</td><td>I am **going**.</td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td>VBN</td><td>verb, past participle</td><td>The treasure was **lost**.</td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td>VBP</td><td>verb, non-3rd person singular present</td><td>I **want** to go.</td></tr>\n",
|
||||||
|
"<tr><td>VERB</td><td>VBZ</td><td>verb, 3rd person singular present</td><td>He **wants** to go.</td></tr>\n",
|
||||||
|
"</table>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Up Next: Visualizing POS"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.7"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
@ -0,0 +1,469 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Visualizing Parts of Speech\n",
|
||||||
|
"spaCy offers an outstanding visualizer called **displaCy**:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Perform standard imports\n",
|
||||||
|
"import spacy\n",
|
||||||
|
"nlp = spacy.load('en_core_web_sm')\n",
|
||||||
|
"\n",
|
||||||
|
"# Import the displaCy library\n",
|
||||||
|
"from spacy import displacy"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Create a simple Doc object\n",
|
||||||
|
"doc = nlp(u\"The quick brown fox jumped over the lazy dog's back.\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" id=\"0\" class=\"displacy\" width=\"1260\" height=\"357.0\" style=\"max-width: none; height: 357.0px; color: #000000; background: #ffffff; font-family: Arial\">\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"50\">The</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"50\">DET</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"160\">quick</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"160\">ADJ</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"270\">brown</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"270\">ADJ</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"380\">fox</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"380\">NOUN</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"490\">jumped</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"490\">VERB</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"600\">over</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"600\">ADP</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"710\">the</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"710\">DET</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"820\">lazy</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"820\">ADJ</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"930\">dog</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"930\">NOUN</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1040\">'s</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1040\">PART</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"267.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1150\">back.</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1150\">NOUN</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-0\" stroke-width=\"2px\" d=\"M70,222.0 C70,57.0 375.0,57.0 375.0,222.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-0\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">det</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M70,224.0 L62,212.0 78,212.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-1\" stroke-width=\"2px\" d=\"M180,222.0 C180,112.0 370.0,112.0 370.0,222.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-1\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">amod</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M180,224.0 L172,212.0 188,212.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-2\" stroke-width=\"2px\" d=\"M290,222.0 C290,167.0 365.0,167.0 365.0,222.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-2\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">amod</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M290,224.0 L282,212.0 298,212.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-3\" stroke-width=\"2px\" d=\"M400,222.0 C400,167.0 475.0,167.0 475.0,222.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-3\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">nsubj</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M400,224.0 L392,212.0 408,212.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-4\" stroke-width=\"2px\" d=\"M510,222.0 C510,167.0 585.0,167.0 585.0,222.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-4\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">prep</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M585.0,224.0 L593.0,212.0 577.0,212.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-5\" stroke-width=\"2px\" d=\"M730,222.0 C730,112.0 920.0,112.0 920.0,222.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-5\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">det</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M730,224.0 L722,212.0 738,212.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-6\" stroke-width=\"2px\" d=\"M840,222.0 C840,167.0 915.0,167.0 915.0,222.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-6\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">amod</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M840,224.0 L832,212.0 848,212.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-7\" stroke-width=\"2px\" d=\"M950,222.0 C950,112.0 1140.0,112.0 1140.0,222.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-7\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">poss</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M950,224.0 L942,212.0 958,212.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-8\" stroke-width=\"2px\" d=\"M950,222.0 C950,167.0 1025.0,167.0 1025.0,222.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-8\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">case</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M1025.0,224.0 L1033.0,212.0 1017.0,212.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-9\" stroke-width=\"2px\" d=\"M620,222.0 C620,2.0 1150.0,2.0 1150.0,222.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-9\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">pobj</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M1150.0,224.0 L1158.0,212.0 1142.0,212.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"</svg>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.HTML object>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Render the dependency parse immediately inside Jupyter:\n",
|
||||||
|
"displacy.render(doc, style='dep', jupyter=True, options={'distance': 110})"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"The dependency parse shows the coarse POS tag for each token, as well as the **dependency tag** if given:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {
|
||||||
|
"scrolled": true
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"The DET det determiner\n",
|
||||||
|
"quick ADJ amod adjectival modifier\n",
|
||||||
|
"brown ADJ amod adjectival modifier\n",
|
||||||
|
"fox NOUN nsubj nominal subject\n",
|
||||||
|
"jumped VERB ROOT None\n",
|
||||||
|
"over ADP prep prepositional modifier\n",
|
||||||
|
"the DET det determiner\n",
|
||||||
|
"lazy ADJ amod adjectival modifier\n",
|
||||||
|
"dog NOUN poss possession modifier\n",
|
||||||
|
"'s PART case None\n",
|
||||||
|
"back NOUN pobj object of preposition\n",
|
||||||
|
". PUNCT punct punctuation\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for token in doc:\n",
|
||||||
|
" print(f'{token.text:{10}} {token.pos_:{7}} {token.dep_:{7}} {spacy.explain(token.dep_)}')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"# Creating Visualizations Outside of Jupyter\n",
|
||||||
|
"If you're using another Python IDE or writing a script, you can choose to have spaCy serve up HTML separately.\n",
|
||||||
|
"\n",
|
||||||
|
"Instead of `displacy.render()`, use `displacy.serve()`:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 15,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\n",
|
||||||
|
" Serving on port 5000...\n",
|
||||||
|
" Using the 'dep' visualizer\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"127.0.0.1 - - [12/Oct/2018 16:54:07] \"GET / HTTP/1.1\" 200 8304\n",
|
||||||
|
"127.0.0.1 - - [12/Oct/2018 16:54:07] \"GET /favicon.ico HTTP/1.1\" 200 8304\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\n",
|
||||||
|
" Shutting down server on port 5000.\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"displacy.serve(doc, style='dep', options={'distance': 110})"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=blue>**After running the cell above, click the link below to view the dependency parse**:</font>\n",
|
||||||
|
"\n",
|
||||||
|
"http://127.0.0.1:5000\n",
|
||||||
|
"<br><br>\n",
|
||||||
|
"<font color=red>**To shut down the server and return to jupyter**, interrupt the kernel either through the **Kernel** menu above, by hitting the black square on the toolbar, or by typing the keyboard shortcut `Esc`, `I`, `I`</font>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>**NOTE**: We'll use this method moving forward because, at this time, several of the customizations we want to show don't work well in Jupyter.</font>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"## Handling Large Text\n",
|
||||||
|
"`displacy.serve()` accepts a single Doc or list of Doc objects. Since large texts are difficult to view in one line, you may want to pass a list of spans instead. Each span will appear on its own line:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 16,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\n",
|
||||||
|
" Serving on port 5000...\n",
|
||||||
|
" Using the 'dep' visualizer\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"127.0.0.1 - - [12/Oct/2018 16:57:03] \"GET / HTTP/1.1\" 200 7328\n",
|
||||||
|
"127.0.0.1 - - [12/Oct/2018 16:57:03] \"GET /favicon.ico HTTP/1.1\" 200 7328\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\n",
|
||||||
|
" Shutting down server on port 5000.\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc2 = nlp(u\"This is a sentence. This is another, possibly longer sentence.\")\n",
|
||||||
|
"\n",
|
||||||
|
"# Create spans from Doc.sents:\n",
|
||||||
|
"spans = list(doc2.sents)\n",
|
||||||
|
"\n",
|
||||||
|
"displacy.serve(spans, style='dep', options={'distance': 110})"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**Click this link to view the dependency**: http://127.0.0.1:5000\n",
|
||||||
|
"<br>Interrupt the kernel to return to jupyter."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"## Customizing the Appearance\n",
|
||||||
|
"Besides setting the distance between tokens, you can pass other arguments to the `options` parameter:\n",
|
||||||
|
"\n",
|
||||||
|
"<table>\n",
|
||||||
|
"<tr><th>NAME</th><th>TYPE</th><th>DESCRIPTION</th><th>DEFAULT</th></tr>\n",
|
||||||
|
"<tr><td>`compact`</td><td>bool</td><td>\"Compact mode\" with square arrows that takes up less space.</td><td>`False`</td></tr>\n",
|
||||||
|
"<tr><td>`color`</td><td>unicode</td><td>Text color (HEX, RGB or color names).</td><td>`#000000`</td></tr>\n",
|
||||||
|
"<tr><td>`bg`</td><td>unicode</td><td>Background color (HEX, RGB or color names).</td><td>`#ffffff`</td></tr>\n",
|
||||||
|
"<tr><td>`font`</td><td>unicode</td><td>Font name or font family for all text.</td><td>`Arial`</td></tr>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"\n",
|
||||||
|
"For a full list of options visit https://spacy.io/api/top-level#displacy_options"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 17,
|
||||||
|
"metadata": {
|
||||||
|
"scrolled": true
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\n",
|
||||||
|
" Serving on port 5000...\n",
|
||||||
|
" Using the 'dep' visualizer\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"127.0.0.1 - - [12/Oct/2018 17:02:17] \"GET / HTTP/1.1\" 200 8533\n",
|
||||||
|
"127.0.0.1 - - [12/Oct/2018 17:02:17] \"GET /favicon.ico HTTP/1.1\" 200 8533\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\n",
|
||||||
|
" Shutting down server on port 5000.\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"options = {'distance': 110, 'compact': 'True', 'color': 'yellow', 'bg': '#09a3d5', 'font': 'Times'}\n",
|
||||||
|
"\n",
|
||||||
|
"displacy.serve(doc, style='dep', options=options)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**Click this link to view the dependency**: http://127.0.0.1:5000\n",
|
||||||
|
"<br>Interrupt the kernel to return to jupyter."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"Great! Now you should be familiar with visualizing spaCy's dependency parse. For more info on **displaCy** visit https://spacy.io/usage/visualizers\n",
|
||||||
|
"<br>In the next section we'll look at Named Entity Recognition, followed by displaCy's NER visualizer.\n",
|
||||||
|
"\n",
|
||||||
|
"### Next Up: Named Entity Recognition"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
@ -0,0 +1,625 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Named Entity Recognition (NER)\n",
|
||||||
|
"spaCy has an **'ner'** pipeline component that identifies token spans fitting a predetermined set of named entities. These are available as the `ents` property of a `Doc` object."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Perform standard imports\n",
|
||||||
|
"import spacy\n",
|
||||||
|
"nlp = spacy.load('en_core_web_sm')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Write a function to display basic entity info:\n",
|
||||||
|
"def show_ents(doc):\n",
|
||||||
|
" if doc.ents:\n",
|
||||||
|
" for ent in doc.ents:\n",
|
||||||
|
" print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_)))\n",
|
||||||
|
" else:\n",
|
||||||
|
" print('No named entities found.')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {
|
||||||
|
"scrolled": true
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Washington, DC - GPE - Countries, cities, states\n",
|
||||||
|
"next May - DATE - Absolute or relative dates or periods\n",
|
||||||
|
"the Washington Monument - ORG - Companies, agencies, institutions, etc.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc = nlp(u'May I go to Washington, DC next May to see the Washington Monument?')\n",
|
||||||
|
"\n",
|
||||||
|
"show_ents(doc)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Here we see tokens combine to form the entities `Washington, DC`, `next May` and `the Washington Monument`"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Entity annotations\n",
|
||||||
|
"`Doc.ents` are token spans with their own set of annotations.\n",
|
||||||
|
"<table>\n",
|
||||||
|
"<tr><td>`ent.text`</td><td>The original entity text</td></tr>\n",
|
||||||
|
"<tr><td>`ent.label`</td><td>The entity type's hash value</td></tr>\n",
|
||||||
|
"<tr><td>`ent.label_`</td><td>The entity type's string description</td></tr>\n",
|
||||||
|
"<tr><td>`ent.start`</td><td>The token span's *start* index position in the Doc</td></tr>\n",
|
||||||
|
"<tr><td>`ent.end`</td><td>The token span's *stop* index position in the Doc</td></tr>\n",
|
||||||
|
"<tr><td>`ent.start_char`</td><td>The entity text's *start* index position in the Doc</td></tr>\n",
|
||||||
|
"<tr><td>`ent.end_char`</td><td>The entity text's *stop* index position in the Doc</td></tr>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"500 dollars 4 6 20 31 MONEY\n",
|
||||||
|
"Microsoft 11 12 53 62 ORG\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc = nlp(u'Can I please borrow 500 dollars from you to buy some Microsoft stock?')\n",
|
||||||
|
"\n",
|
||||||
|
"for ent in doc.ents:\n",
|
||||||
|
" print(ent.text, ent.start, ent.end, ent.start_char, ent.end_char, ent.label_)"
|
||||||
|
]
|
||||||
|
},
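{
"cell_type": "markdown",
"metadata": {},
"source": [
"<font color=green>**NOTE**: The table above also lists `ent.label` (a hash value) alongside `ent.label_` (the readable string). As a minimal optional sketch, not part of the original lesson, we can print both side by side for the same Doc:</font>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Compare the hash value (ent.label) with its string form (ent.label_):\n",
"for ent in doc.ents:\n",
"    print(ent.label, ent.label_, spacy.explain(ent.label_))"
]
},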
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## NER Tags\n",
|
||||||
|
"Tags are accessible through the `.label_` property of an entity.\n",
|
||||||
|
"<table>\n",
|
||||||
|
"<tr><th>TYPE</th><th>DESCRIPTION</th><th>EXAMPLE</th></tr>\n",
|
||||||
|
"<tr><td>`PERSON`</td><td>People, including fictional.</td><td>*Fred Flintstone*</td></tr>\n",
|
||||||
|
"<tr><td>`NORP`</td><td>Nationalities or religious or political groups.</td><td>*The Republican Party*</td></tr>\n",
|
||||||
|
"<tr><td>`FAC`</td><td>Buildings, airports, highways, bridges, etc.</td><td>*Logan International Airport, The Golden Gate*</td></tr>\n",
|
||||||
|
"<tr><td>`ORG`</td><td>Companies, agencies, institutions, etc.</td><td>*Microsoft, FBI, MIT*</td></tr>\n",
|
||||||
|
"<tr><td>`GPE`</td><td>Countries, cities, states.</td><td>*France, UAR, Chicago, Idaho*</td></tr>\n",
|
||||||
|
"<tr><td>`LOC`</td><td>Non-GPE locations, mountain ranges, bodies of water.</td><td>*Europe, Nile River, Midwest*</td></tr>\n",
|
||||||
|
"<tr><td>`PRODUCT`</td><td>Objects, vehicles, foods, etc. (Not services.)</td><td>*Formula 1*</td></tr>\n",
|
||||||
|
"<tr><td>`EVENT`</td><td>Named hurricanes, battles, wars, sports events, etc.</td><td>*Olympic Games*</td></tr>\n",
|
||||||
|
"<tr><td>`WORK_OF_ART`</td><td>Titles of books, songs, etc.</td><td>*The Mona Lisa*</td></tr>\n",
|
||||||
|
"<tr><td>`LAW`</td><td>Named documents made into laws.</td><td>*Roe v. Wade*</td></tr>\n",
|
||||||
|
"<tr><td>`LANGUAGE`</td><td>Any named language.</td><td>*English*</td></tr>\n",
|
||||||
|
"<tr><td>`DATE`</td><td>Absolute or relative dates or periods.</td><td>*20 July 1969*</td></tr>\n",
|
||||||
|
"<tr><td>`TIME`</td><td>Times smaller than a day.</td><td>*Four hours*</td></tr>\n",
|
||||||
|
"<tr><td>`PERCENT`</td><td>Percentage, including \"%\".</td><td>*Eighty percent*</td></tr>\n",
|
||||||
|
"<tr><td>`MONEY`</td><td>Monetary values, including unit.</td><td>*Twenty Cents*</td></tr>\n",
|
||||||
|
"<tr><td>`QUANTITY`</td><td>Measurements, as of weight or distance.</td><td>*Several kilometers, 55kg*</td></tr>\n",
|
||||||
|
"<tr><td>`ORDINAL`</td><td>\"first\", \"second\", etc.</td><td>*9th, Ninth*</td></tr>\n",
|
||||||
|
"<tr><td>`CARDINAL`</td><td>Numerals that do not fall under another type.</td><td>*2, Two, Fifty-two*</td></tr>\n",
|
||||||
|
"</table>"
|
||||||
|
]
|
||||||
|
},
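{
"cell_type": "markdown",
"metadata": {},
"source": [
"<font color=green>**NOTE**: As a minimal sketch (not part of the original lesson), any of the tag descriptions above can also be retrieved programmatically with `spacy.explain()`:</font>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Look up the description string for a few of the tags listed above:\n",
"for label in ['PERSON', 'GPE', 'PRODUCT', 'WORK_OF_ART']:\n",
"    print(f'{label:<12} {spacy.explain(label)}')"
]
},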
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"## Adding a Named Entity to a Span\n",
|
||||||
|
"Normally we would have spaCy build a library of named entities by training it on several samples of text.<br>In this case, we only want to add one value:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"U.K. - GPE - Countries, cities, states\n",
|
||||||
|
"$6 million - MONEY - Monetary values, including unit\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc = nlp(u'Tesla to build a U.K. factory for $6 million')\n",
|
||||||
|
"\n",
|
||||||
|
"show_ents(doc)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>Right now, spaCy does not recognize \"Tesla\" as a company.</font>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from spacy.tokens import Span\n",
|
||||||
|
"\n",
|
||||||
|
"# Get the hash value of the ORG entity label\n",
|
||||||
|
"ORG = doc.vocab.strings[u'ORG'] \n",
|
||||||
|
"\n",
|
||||||
|
"# Create a Span for the new entity\n",
|
||||||
|
"new_ent = Span(doc, 0, 1, label=ORG)\n",
|
||||||
|
"\n",
|
||||||
|
"# Add the entity to the existing Doc object\n",
|
||||||
|
"doc.ents = list(doc.ents) + [new_ent]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>In the code above, the arguments passed to `Span()` are:</font>\n",
|
||||||
|
"- `doc` - the name of the Doc object\n",
|
||||||
|
"- `0` - the *start* index position of the span\n",
|
||||||
|
"- `1` - the *stop* index position (exclusive)\n",
|
||||||
|
"- `label=ORG` - the label assigned to our entity"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {
|
||||||
|
"scrolled": true
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Tesla - ORG - Companies, agencies, institutions, etc.\n",
|
||||||
|
"U.K. - GPE - Countries, cities, states\n",
|
||||||
|
"$6 million - MONEY - Monetary values, including unit\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"show_ents(doc)"
|
||||||
|
]
|
||||||
|
},
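{
"cell_type": "markdown",
"metadata": {},
"source": [
"<font color=green>**NOTE**: As a quick optional check (not part of the original lesson), the `0` and `1` indices passed to `Span()` above correspond to the token slice `doc[0:1]`, which covers just the word *Tesla*:</font>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The start/stop indices passed to Span() slice the Doc by tokens:\n",
"doc[0:1]"
]
},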
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"## Adding Named Entities to All Matching Spans\n",
|
||||||
|
"What if we want to tag *all* occurrences of \"Tesla\"? In this section we show how to use the PhraseMatcher to identify a series of spans in the Doc:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"first - ORDINAL - \"first\", \"second\", etc.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc = nlp(u'Our company plans to introduce a new vacuum cleaner. '\n",
|
||||||
|
" u'If successful, the vacuum cleaner will be our first product.')\n",
|
||||||
|
"\n",
|
||||||
|
"show_ents(doc)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Import PhraseMatcher and create a matcher object:\n",
|
||||||
|
"from spacy.matcher import PhraseMatcher\n",
|
||||||
|
"matcher = PhraseMatcher(nlp.vocab)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Create the desired phrase patterns:\n",
|
||||||
|
"phrase_list = ['vacuum cleaner', 'vacuum-cleaner']\n",
|
||||||
|
"phrase_patterns = [nlp(text) for text in phrase_list]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[(2689272359382549672, 7, 9), (2689272359382549672, 14, 16)]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Apply the patterns to our matcher object:\n",
|
||||||
|
"matcher.add('newproduct', None, *phrase_patterns)\n",
|
||||||
|
"\n",
|
||||||
|
"# Apply the matcher to our Doc object:\n",
|
||||||
|
"matches = matcher(doc)\n",
|
||||||
|
"\n",
|
||||||
|
"# See what matches occur:\n",
|
||||||
|
"matches"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Here we create Spans from each match, and create named entities from them:\n",
|
||||||
|
"from spacy.tokens import Span\n",
|
||||||
|
"\n",
|
||||||
|
"PROD = doc.vocab.strings[u'PRODUCT']\n",
|
||||||
|
"\n",
|
||||||
|
"new_ents = [Span(doc, match[1],match[2],label=PROD) for match in matches]\n",
|
||||||
|
"\n",
|
||||||
|
"doc.ents = list(doc.ents) + new_ents"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)\n",
|
||||||
|
"vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)\n",
|
||||||
|
"first - ORDINAL - \"first\", \"second\", etc.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"show_ents(doc)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"## Counting Entities\n",
|
||||||
|
"While spaCy may not have a built-in tool for counting entities, we can pass a conditional statement into a list comprehension:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"29.50 - MONEY - Monetary values, including unit\n",
|
||||||
|
"five dollars - MONEY - Monetary values, including unit\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc = nlp(u'Originally priced at $29.50, the sweater was marked down to five dollars.')\n",
|
||||||
|
"\n",
|
||||||
|
"show_ents(doc)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 15,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"2"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 15,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"len([ent for ent in doc.ents if ent.label_=='MONEY'])"
|
||||||
|
]
|
||||||
|
},
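{
"cell_type": "markdown",
"metadata": {},
"source": [
"<font color=green>**NOTE**: A minimal sketch, not part of the original lesson: the standard library's `collections.Counter` gives a count per entity label in one step:</font>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Count how many entities of each label appear in the Doc:\n",
"from collections import Counter\n",
"\n",
"Counter([ent.label_ for ent in doc.ents])"
]
},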
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## <font color=blue>Problem with Line Breaks</font>\n",
|
||||||
|
"\n",
|
||||||
|
"<div class=\"alert alert-info\" style=\"margin: 20px\">There's a <a href='https://github.com/explosion/spaCy/issues/1717'>known issue</a> with <strong>spaCy v2.0.12</strong> where some linebreaks are interpreted as `GPE` entities:</div>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 16,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'2.0.12'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 16,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"spacy.__version__"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 17,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"29.50 - MONEY - Monetary values, including unit\n",
|
||||||
|
"\n",
|
||||||
|
" - GPE - Countries, cities, states\n",
|
||||||
|
"five dollars - MONEY - Monetary values, including unit\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc = nlp(u'Originally priced at $29.50,\\nthe sweater was marked down to five dollars.')\n",
|
||||||
|
"\n",
|
||||||
|
"show_ents(doc)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### <font color=blue>However, there is a simple fix that can be added to the nlp pipeline:</font>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 18,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Quick function to remove ents formed on whitespace:\n",
|
||||||
|
"def remove_whitespace_entities(doc):\n",
|
||||||
|
" doc.ents = [e for e in doc.ents if not e.text.isspace()]\n",
|
||||||
|
" return doc\n",
|
||||||
|
"\n",
|
||||||
|
"# Insert this into the pipeline AFTER the ner component:\n",
|
||||||
|
"nlp.add_pipe(remove_whitespace_entities, after='ner')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 19,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"29.50 - MONEY - Monetary values, including unit\n",
|
||||||
|
"five dollars - MONEY - Monetary values, including unit\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Rerun nlp on the text above, and show ents:\n",
|
||||||
|
"doc = nlp(u'Originally priced at $29.50,\\nthe sweater was marked down to five dollars.')\n",
|
||||||
|
"\n",
|
||||||
|
"show_ents(doc)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"For more on **Named Entity Recognition** visit https://spacy.io/usage/linguistic-features#101"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"## Noun Chunks\n",
|
||||||
|
"`Doc.noun_chunks` are *base noun phrases*: token spans that include the noun and words describing the noun. Noun chunks cannot be nested, cannot overlap, and do not involve prepositional phrases or relative clauses.<br>\n",
|
||||||
|
"Where `Doc.ents` rely on the **ner** pipeline component, `Doc.noun_chunks` are provided by the **parser**."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### `noun_chunks` components:\n",
|
||||||
|
"<table>\n",
|
||||||
|
"<tr><td>`.text`</td><td>The original noun chunk text.</td></tr>\n",
|
||||||
|
"<tr><td>`.root.text`</td><td>The original text of the word connecting the noun chunk to the rest of the parse.</td></tr>\n",
|
||||||
|
"<tr><td>`.root.dep_`</td><td>Dependency relation connecting the root to its head.</td></tr>\n",
|
||||||
|
"<tr><td>`.root.head.text`</td><td>The text of the root token's head.</td></tr>\n",
|
||||||
|
"</table>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 20,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Autonomous cars - cars - nsubj - shift\n",
|
||||||
|
"insurance liability - liability - dobj - shift\n",
|
||||||
|
"manufacturers - manufacturers - pobj - toward\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc = nlp(u\"Autonomous cars shift insurance liability toward manufacturers.\")\n",
|
||||||
|
"\n",
|
||||||
|
"for chunk in doc.noun_chunks:\n",
|
||||||
|
" print(chunk.text+' - '+chunk.root.text+' - '+chunk.root.dep_+' - '+chunk.root.head.text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### `Doc.noun_chunks` is a generator function\n",
|
||||||
|
"Previously we mentioned that `Doc` objects do not retain a list of sentences, but they're available through the `Doc.sents` generator.<br>It's the same with `Doc.noun_chunks` - lists can be created if needed:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 21,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"ename": "TypeError",
|
||||||
|
"evalue": "object of type 'generator' has no len()",
|
||||||
|
"output_type": "error",
|
||||||
|
"traceback": [
|
||||||
|
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||||
|
"\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
|
||||||
|
"\u001b[1;32m<ipython-input-21-8b52b37c204e>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdoc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnoun_chunks\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
|
||||||
|
"\u001b[1;31mTypeError\u001b[0m: object of type 'generator' has no len()"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"len(doc.noun_chunks)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 22,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"3"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 22,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"len(list(doc.noun_chunks))"
|
||||||
|
]
|
||||||
|
},
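{
"cell_type": "markdown",
"metadata": {},
"source": [
"<font color=green>**NOTE**: As a small optional sketch, a list comprehension also lets you pull out just the chunk texts as plain strings:</font>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Collect the noun chunk texts into a plain list of strings:\n",
"[chunk.text for chunk in doc.noun_chunks]"
]
},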
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"For more on **noun_chunks** visit https://spacy.io/usage/linguistic-features#noun-chunks"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Great! Now you should be more familiar with both named entities and noun chunks. In the next section we revisit the NER visualizer.\n",
|
||||||
|
"## Next up: Visualizing NER"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.7"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
@ -0,0 +1,541 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Visualizing Named Entities\n",
|
||||||
|
"Besides viewing Part of Speech dependencies with `style='dep'`, **displaCy** offers a `style='ent'` visualizer:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Perform standard imports\n",
|
||||||
|
"import spacy\n",
|
||||||
|
"nlp = spacy.load('en_core_web_sm')\n",
|
||||||
|
"\n",
|
||||||
|
"# Import the displaCy library\n",
|
||||||
|
"from spacy import displacy"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div class=\"entities\" style=\"line-height: 2.5\">Over \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #bfe1d9; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" the last quarter\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">DATE</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" Apple\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">ORG</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" sold \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" nearly 20 thousand\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">CARDINAL</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #bfeeb7; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" iPods\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PRODUCT</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" for a profit of \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" $6 million\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">MONEY</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
". By contrast, \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" Sony\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">ORG</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" sold \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" only 7 thousand\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">CARDINAL</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #bfeeb7; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" Walkman\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PRODUCT</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" music players.</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.HTML object>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. '\n",
|
||||||
|
" u'By contrast, Sony sold only 7 thousand Walkman music players.')\n",
|
||||||
|
"\n",
|
||||||
|
"displacy.render(doc, style='ent', jupyter=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"## Viewing Sentences Line by Line\n",
|
||||||
|
"Unlike the **displaCy** dependency parse, the NER viewer has to take in a Doc object with an `ents` attribute. For this reason, we can't just pass a list of spans to `.render()`, we have to create a new Doc from each `span.text`:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div class=\"entities\" style=\"line-height: 2.5\">Over \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #bfe1d9; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" the last quarter\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">DATE</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" Apple\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">ORG</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" sold \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" nearly 20 thousand\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">CARDINAL</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #bfeeb7; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" iPods\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PRODUCT</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" for a profit of \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" $6 million\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">MONEY</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
".</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.HTML object>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div class=\"entities\" style=\"line-height: 2.5\">By contrast, \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" Sony\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">ORG</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" sold \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" only 7 thousand\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">CARDINAL</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #bfeeb7; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" Walkman\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PRODUCT</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" music players.</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.HTML object>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for sent in doc.sents:\n",
|
||||||
|
" displacy.render(nlp(sent.text), style='ent', jupyter=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<div class=\"alert alert-info\"><font color=black>**NOTE**: If a span does not contain any entities, displaCy will issue a harmless warning:</font></div>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"doc2 = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million. '\n",
|
||||||
|
" u'By contrast, my kids sold a lot of lemonade.')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div class=\"entities\" style=\"line-height: 2.5\">Over \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #bfe1d9; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" the last quarter\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">DATE</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" Apple\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">ORG</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" sold \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" nearly 20 thousand\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">CARDINAL</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #bfeeb7; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" iPods\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PRODUCT</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" for a profit of \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" $6 million\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">MONEY</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
".</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.HTML object>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"C:\\Anaconda3\\lib\\runpy.py:193: UserWarning: [W006] No entities to visualize found in Doc object. If this is surprising to you, make sure the Doc was processed using a model that supports named entity recognition, and check the `doc.ents` property manually if necessary.\n",
|
||||||
|
" \"__main__\", mod_spec)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div class=\"entities\" style=\"line-height: 2.5\">By contrast, my kids sold a lot of lemonade.</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.HTML object>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for sent in doc2.sents:\n",
|
||||||
|
" displacy.render(nlp(sent.text), style='ent', jupyter=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<div class=\"alert alert-info\"><font color=black>**WORKAROUND:** We can avert this with an additional bit of code:</font></div>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div class=\"entities\" style=\"line-height: 2.5\">Over \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #bfe1d9; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" the last quarter\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">DATE</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" Apple\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">ORG</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" sold \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" nearly 20 thousand\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">CARDINAL</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #bfeeb7; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" iPods\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PRODUCT</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" for a profit of \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #e4e7d2; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" $6 million\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">MONEY</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
".</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.HTML object>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"By contrast, my kids sold a lot of lemonade.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for sent in doc2.sents:\n",
|
||||||
|
" docx = nlp(sent.text)\n",
|
||||||
|
" if docx.ents:\n",
|
||||||
|
" displacy.render(docx, style='ent', jupyter=True)\n",
|
||||||
|
" else:\n",
|
||||||
|
" print(docx.text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"## Viewing Specific Entities\n",
|
||||||
|
"You can pass a list of entity types to restrict the visualization:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div class=\"entities\" style=\"line-height: 2.5\">Over the last quarter \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" Apple\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">ORG</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" sold nearly 20 thousand \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #bfeeb7; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" iPods\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PRODUCT</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" for a profit of $6 million. By contrast, \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #7aecec; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" Sony\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">ORG</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" sold only 7 thousand \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #bfeeb7; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" Walkman\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PRODUCT</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" music players.</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.HTML object>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"options = {'ents': ['ORG', 'PRODUCT']}\n",
|
||||||
|
"\n",
|
||||||
|
"displacy.render(doc, style='ent', jupyter=True, options=options)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"## Customizing Colors and Effects\n",
|
||||||
|
"You can also pass background color and gradient options:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div class=\"entities\" style=\"line-height: 2.5\">Over the last quarter \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: linear-gradient(90deg, #aa9cfc, #fc9ce7); padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" Apple\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">ORG</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" sold nearly 20 thousand \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: radial-gradient(yellow, green); padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" iPods\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PRODUCT</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" for a profit of $6 million. By contrast, \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: linear-gradient(90deg, #aa9cfc, #fc9ce7); padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" Sony\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">ORG</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" sold only 7 thousand \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: radial-gradient(yellow, green); padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" Walkman\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PRODUCT</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" music players.</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.HTML object>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"colors = {'ORG': 'linear-gradient(90deg, #aa9cfc, #fc9ce7)', 'PRODUCT': 'radial-gradient(yellow, green)'}\n",
|
||||||
|
"\n",
|
||||||
|
"options = {'ents': ['ORG', 'PRODUCT'], 'colors':colors}\n",
|
||||||
|
"\n",
|
||||||
|
"displacy.render(doc, style='ent', jupyter=True, options=options)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"For more on applying CSS background colors and gradients, visit https://www.w3schools.com/css/css3_gradients.asp"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"# Creating Visualizations Outside of Jupyter\n",
|
||||||
|
"If you're using another Python IDE or writing a script, you can choose to have spaCy serve up HTML separately.\n",
|
||||||
|
"\n",
|
||||||
|
"Instead of `displacy.render()`, use `displacy.serve()`:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\n",
|
||||||
|
" Serving on port 5000...\n",
|
||||||
|
" Using the 'ent' visualizer\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"127.0.0.1 - - [04/Dec/2018 13:21:26] \"GET / HTTP/1.1\" 200 2210\n",
|
||||||
|
"127.0.0.1 - - [04/Dec/2018 13:21:26] \"GET /favicon.ico HTTP/1.1\" 200 2210\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\n",
|
||||||
|
" Shutting down server on port 5000.\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"displacy.serve(doc, style='ent', options=options)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=blue>**After running the cell above, click the link below to view the dependency parse**:</font>\n",
|
||||||
|
"\n",
|
||||||
|
"http://127.0.0.1:5000\n",
|
||||||
|
"<br><br>\n",
|
||||||
|
"<font color=red>**To shut down the server and return to jupyter**, interrupt the kernel either through the **Kernel** menu above, by hitting the black square on the toolbar, or by typing the keyboard shortcut `Esc`, `I`, `I`</font>"
|
||||||
|
]
|
||||||
|
},
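{
"cell_type": "markdown",
"metadata": {},
"source": [
"<font color=green>**NOTE**: Outside of Jupyter, the same call can live in an ordinary script. The sketch below is hypothetical (the filename and sample text are ours, not part of the lesson); run it from the command line and open the link printed in the terminal:</font>\n",
"\n",
"```python\n",
"# visualize_ents.py -- hypothetical standalone script\n",
"import spacy\n",
"from spacy import displacy\n",
"\n",
"nlp = spacy.load('en_core_web_sm')\n",
"doc = nlp(u'Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 million.')\n",
"\n",
"# Serves the entity visualization at http://127.0.0.1:5000 until the process is interrupted:\n",
"displacy.serve(doc, style='ent')\n",
"```"
]
},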
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"For more on **Visualizing the entity recognizer** visit https://spacy.io/usage/visualizers#ent\n",
|
||||||
|
"## Next Up: Sentence Segmentation"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
@ -0,0 +1,537 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Sentence Segmentation\n",
|
||||||
|
"In **spaCy Basics** we saw briefly how Doc objects are divided into sentences. In this section we'll learn how sentence segmentation works, and how to set our own segmentation rules."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Perform standard imports\n",
|
||||||
|
"import spacy\n",
|
||||||
|
"nlp = spacy.load('en_core_web_sm')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"This is the first sentence.\n",
|
||||||
|
"This is another sentence.\n",
|
||||||
|
"This is the last sentence.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# From Spacy Basics:\n",
|
||||||
|
"doc = nlp(u'This is the first sentence. This is another sentence. This is the last sentence.')\n",
|
||||||
|
"\n",
|
||||||
|
"for sent in doc.sents:\n",
|
||||||
|
" print(sent)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### `Doc.sents` is a generator\n",
|
||||||
|
"It is important to note that `doc.sents` is a *generator*. That is, a Doc is not segmented until `doc.sents` is called. This means that, where you could print the second Doc token with `print(doc[1])`, you can't call the \"second Doc sentence\" with `print(doc.sents[1])`:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"is\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(doc[1])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"ename": "TypeError",
|
||||||
|
"evalue": "'generator' object is not subscriptable",
|
||||||
|
"output_type": "error",
|
||||||
|
"traceback": [
|
||||||
|
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||||
|
"\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)",
|
||||||
|
"\u001b[1;32m<ipython-input-4-2bc012eee1da>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdoc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msents\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
|
||||||
|
"\u001b[1;31mTypeError\u001b[0m: 'generator' object is not subscriptable"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(doc.sents[1])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"However, you *can* build a sentence collection by running `doc.sents` and saving the result to a list:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[This is the first sentence.,\n",
|
||||||
|
" This is another sentence.,\n",
|
||||||
|
" This is the last sentence.]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc_sents = [sent for sent in doc.sents]\n",
|
||||||
|
"doc_sents"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>**NOTE**: `list(doc.sents)` also works. We show a list comprehension as it allows you to pass in conditionals.</font>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"This is another sentence.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Now you can access individual sentences:\n",
|
||||||
|
"print(doc_sents[1])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### `sents` are Spans\n",
|
||||||
|
"At first glance it looks like each `sent` contains text from the original Doc object. In fact they're just Spans with start and end token pointers."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"spacy.tokens.span.Span"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"type(doc_sents[1])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"6 11\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(doc_sents[1].start, doc_sents[1].end)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Adding Rules\n",
|
||||||
|
"spaCy's built-in `sentencizer` relies on the dependency parse and end-of-sentence punctuation to determine segmentation rules. We can add rules of our own, but they have to be added *before* the creation of the Doc object, as that is where the parsing of segment start tokens happens:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"None This\n",
|
||||||
|
"None is\n",
|
||||||
|
"None a\n",
|
||||||
|
"None sentence\n",
|
||||||
|
"None .\n",
|
||||||
|
"True This\n",
|
||||||
|
"None is\n",
|
||||||
|
"None a\n",
|
||||||
|
"None sentence\n",
|
||||||
|
"None .\n",
|
||||||
|
"True This\n",
|
||||||
|
"None is\n",
|
||||||
|
"None a\n",
|
||||||
|
"None sentence\n",
|
||||||
|
"None .\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Parsing the segmentation start tokens happens during the nlp pipeline\n",
|
||||||
|
"doc2 = nlp(u'This is a sentence. This is a sentence. This is a sentence.')\n",
|
||||||
|
"\n",
|
||||||
|
"for token in doc2:\n",
|
||||||
|
" print(token.is_sent_start, ' '+token.text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>Notice we haven't run `doc2.sents`, and yet `token.is_sent_start` was set to True on two tokens in the Doc.</font>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Let's add a semicolon to our existing segmentation rules. That is, whenever the sentencizer encounters a semicolon, the next token should start a new segment."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\"Management is doing things right; leadership is doing the right things.\"\n",
|
||||||
|
"-Peter Drucker\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# SPACY'S DEFAULT BEHAVIOR\n",
|
||||||
|
"doc3 = nlp(u'\"Management is doing things right; leadership is doing the right things.\" -Peter Drucker')\n",
|
||||||
|
"\n",
|
||||||
|
"for sent in doc3.sents:\n",
|
||||||
|
" print(sent)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['tagger', 'set_custom_boundaries', 'parser', 'ner']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# ADD A NEW RULE TO THE PIPELINE\n",
|
||||||
|
"def set_custom_boundaries(doc):\n",
|
||||||
|
" for token in doc[:-1]:\n",
|
||||||
|
" if token.text == ';':\n",
|
||||||
|
" doc[token.i+1].is_sent_start = True\n",
|
||||||
|
" return doc\n",
|
||||||
|
"\n",
|
||||||
|
"nlp.add_pipe(set_custom_boundaries, before='parser')\n",
|
||||||
|
"\n",
|
||||||
|
"nlp.pipe_names"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>The new rule has to run before the document is parsed. Here we can either pass the argument `before='parser'` or `first=True`."
|
||||||
|
]
|
||||||
|
},
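{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A minimal sketch (assuming a freshly loaded nlp object) of the equivalent call\n",
"# with first=True instead of before='parser'. It is left commented out because\n",
"# adding the same component to the pipeline twice raises an error.\n",
"\n",
"# nlp = spacy.load('en_core_web_sm')\n",
"# nlp.add_pipe(set_custom_boundaries, first=True)"
]
},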
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\"Management is doing things right;\n",
|
||||||
|
"leadership is doing the right things.\"\n",
|
||||||
|
"-Peter Drucker\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Re-run the Doc object creation:\n",
|
||||||
|
"doc4 = nlp(u'\"Management is doing things right; leadership is doing the right things.\" -Peter Drucker')\n",
|
||||||
|
"\n",
|
||||||
|
"for sent in doc4.sents:\n",
|
||||||
|
" print(sent)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\"Management is doing things right; leadership is doing the right things.\"\n",
|
||||||
|
"-Peter Drucker\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# And yet the new rule doesn't apply to the older Doc object:\n",
|
||||||
|
"for sent in doc3.sents:\n",
|
||||||
|
" print(sent)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Why not change the token directly?\n",
|
||||||
|
"Why not simply set the `.is_sent_start` value to True on existing tokens?"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"leadership"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Find the token we want to change:\n",
|
||||||
|
"doc3[7]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"ename": "ValueError",
|
||||||
|
"evalue": "[E043] Refusing to write to token.sent_start if its document is parsed, because this may cause inconsistent state.",
|
||||||
|
"output_type": "error",
|
||||||
|
"traceback": [
|
||||||
|
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||||
|
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
|
||||||
|
"\u001b[1;32m<ipython-input-5-bcec3fe6a9a2>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m# Try to change the .is_sent_start attribute:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mdoc3\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m7\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mis_sent_start\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
|
||||||
|
"\u001b[1;32mtoken.pyx\u001b[0m in \u001b[0;36mspacy.tokens.token.Token.is_sent_start.__set__\u001b[1;34m()\u001b[0m\n",
|
||||||
|
"\u001b[1;31mValueError\u001b[0m: [E043] Refusing to write to token.sent_start if its document is parsed, because this may cause inconsistent state."
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Try to change the .is_sent_start attribute:\n",
|
||||||
|
"doc3[7].is_sent_start = True"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>spaCy refuses to change the tag after the document is parsed to prevent inconsistencies in the data.</font>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Changing the Rules\n",
|
||||||
|
"In some cases we want to *replace* spaCy's default sentencizer with our own set of rules. In this section we'll see how the default sentencizer breaks on periods. We'll then replace this behavior with a sentencizer that breaks on linebreaks."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 24,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"['This', 'is', 'a', 'sentence', '.']\n",
|
||||||
|
"['This', 'is', 'another', '.', '\\n\\n']\n",
|
||||||
|
"['This', 'is', 'a', '\\n', 'third', 'sentence', '.']\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"nlp = spacy.load('en_core_web_sm') # reset to the original\n",
|
||||||
|
"\n",
|
||||||
|
"mystring = u\"This is a sentence. This is another.\\n\\nThis is a \\nthird sentence.\"\n",
|
||||||
|
"\n",
|
||||||
|
"# SPACY DEFAULT BEHAVIOR:\n",
|
||||||
|
"doc = nlp(mystring)\n",
|
||||||
|
"\n",
|
||||||
|
"for sent in doc.sents:\n",
|
||||||
|
" print([token.text for token in sent])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 25,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# CHANGING THE RULES\n",
|
||||||
|
"from spacy.pipeline import SentenceSegmenter\n",
|
||||||
|
"\n",
|
||||||
|
"def split_on_newlines(doc):\n",
|
||||||
|
" start = 0\n",
|
||||||
|
" seen_newline = False\n",
|
||||||
|
" for word in doc:\n",
|
||||||
|
" if seen_newline:\n",
|
||||||
|
" yield doc[start:word.i]\n",
|
||||||
|
" start = word.i\n",
|
||||||
|
" seen_newline = False\n",
|
||||||
|
" elif word.text.startswith('\\n'): # handles multiple occurrences\n",
|
||||||
|
" seen_newline = True\n",
|
||||||
|
" yield doc[start:] # handles the last group of tokens\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"sbd = SentenceSegmenter(nlp.vocab, strategy=split_on_newlines)\n",
|
||||||
|
"nlp.add_pipe(sbd)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>While the function `split_on_newlines` can be named anything we want, it's important to use the name `sbd` for the SentenceSegmenter.</font>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 26,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"['This', 'is', 'a', 'sentence', '.', 'This', 'is', 'another', '.', '\\n\\n']\n",
|
||||||
|
"['This', 'is', 'a', '\\n']\n",
|
||||||
|
"['third', 'sentence', '.']\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc = nlp(mystring)\n",
|
||||||
|
"for sent in doc.sents:\n",
|
||||||
|
" print([token.text for token in sent])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>Here we see that periods no longer affect segmentation, only linebreaks do. This would be appropriate when working with a long list of tweets, for instance.</font>\n",
|
||||||
|
"## Next Up: POS Assessment"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.7"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
@ -0,0 +1,607 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Parts of Speech Assessment"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"For this assessment we'll be using the short story [The Tale of Peter Rabbit](https://en.wikipedia.org/wiki/The_Tale_of_Peter_Rabbit) by Beatrix Potter (1902). <br>The story is in the public domain; the text file was obtained from [Project Gutenberg](https://www.gutenberg.org/ebooks/14838.txt.utf-8)."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# RUN THIS CELL to perform standard imports:\n",
|
||||||
|
"import spacy\n",
|
||||||
|
"nlp = spacy.load('en_core_web_sm')\n",
|
||||||
|
"from spacy import displacy"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**1. Create a Doc object from the file `peterrabbit.txt`**<br>\n",
|
||||||
|
"> HINT: Use `with open('../TextFiles/peterrabbit.txt') as f:`"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**2. For every token in the third sentence, print the token text, the POS tag, the fine-grained TAG tag, and the description of the fine-grained tag.**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"They PRON PRP pronoun, personal\n",
|
||||||
|
"lived VERB VBD verb, past tense\n",
|
||||||
|
"with ADP IN conjunction, subordinating or preposition\n",
|
||||||
|
"their ADJ PRP$ pronoun, possessive\n",
|
||||||
|
"Mother PROPN NNP noun, proper singular\n",
|
||||||
|
"in ADP IN conjunction, subordinating or preposition\n",
|
||||||
|
"a DET DT determiner\n",
|
||||||
|
"sand NOUN NN noun, singular or mass\n",
|
||||||
|
"- PUNCT HYPH punctuation mark, hyphen\n",
|
||||||
|
"bank NOUN NN noun, singular or mass\n",
|
||||||
|
", PUNCT , punctuation mark, comma\n",
|
||||||
|
"underneath ADP IN conjunction, subordinating or preposition\n",
|
||||||
|
"the DET DT determiner\n",
|
||||||
|
"root NOUN NN noun, singular or mass\n",
|
||||||
|
"of ADP IN conjunction, subordinating or preposition\n",
|
||||||
|
"a DET DT determiner\n",
|
||||||
|
"\n",
|
||||||
|
" SPACE None\n",
|
||||||
|
"very ADV RB adverb\n",
|
||||||
|
"big ADJ JJ adjective\n",
|
||||||
|
"fir NOUN NN noun, singular or mass\n",
|
||||||
|
"- PUNCT HYPH punctuation mark, hyphen\n",
|
||||||
|
"tree NOUN NN noun, singular or mass\n",
|
||||||
|
". PUNCT . punctuation mark, sentence closer\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
" SPACE _SP None\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Enter your code here:\n",
|
||||||
|
"\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**3. Provide a frequency list of POS tags from the entire document**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"83. ADJ : 83\n",
|
||||||
|
"84. ADP : 127\n",
|
||||||
|
"85. ADV : 75\n",
|
||||||
|
"88. CCONJ: 61\n",
|
||||||
|
"89. DET : 90\n",
|
||||||
|
"91. NOUN : 176\n",
|
||||||
|
"92. NUM : 8\n",
|
||||||
|
"93. PART : 36\n",
|
||||||
|
"94. PRON : 72\n",
|
||||||
|
"95. PROPN: 75\n",
|
||||||
|
"96. PUNCT: 174\n",
|
||||||
|
"99. VERB : 182\n",
|
||||||
|
"102. SPACE: 99\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**4. CHALLENGE: What percentage of tokens are nouns?**<br>\n",
|
||||||
|
"HINT: the attribute ID for 'NOUN' is 91"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"176/1258 = 13.99%\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**5. Display the Dependency Parse for the third sentence**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" id=\"0\" class=\"displacy\" width=\"2250\" height=\"522.0\" style=\"max-width: none; height: 522.0px; color: #000000; background: #ffffff; font-family: Arial\">\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"50\">They</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"50\">PRON</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"160\">lived</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"160\">VERB</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"270\">with</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"270\">ADP</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"380\">their</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"380\">ADJ</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"490\">Mother</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"490\">PROPN</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"600\">in</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"600\">ADP</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"710\">a</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"710\">DET</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"820\">sand-</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"820\">NOUN</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"930\">bank,</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"930\">NOUN</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1040\">underneath</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1040\">ADP</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1150\">the</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1150\">DET</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1260\">root</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1260\">NOUN</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1370\">of</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1370\">ADP</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1480\">a</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1480\">DET</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1590\">\n",
|
||||||
|
"</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1590\"></tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1700\">very</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1700\">ADV</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1810\">big</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1810\">ADJ</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1920\">fir-</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1920\">NOUN</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"2030\">tree.</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"2030\">NOUN</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"2140\">\n",
|
||||||
|
"\n",
|
||||||
|
"</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"2140\">SPACE</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-0\" stroke-width=\"2px\" d=\"M70,387.0 C70,332.0 130.0,332.0 130.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-0\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">nsubj</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M70,389.0 L62,377.0 78,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-1\" stroke-width=\"2px\" d=\"M180,387.0 C180,332.0 240.0,332.0 240.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-1\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">prep</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M240.0,389.0 L248.0,377.0 232.0,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-2\" stroke-width=\"2px\" d=\"M400,387.0 C400,332.0 460.0,332.0 460.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-2\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">poss</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M400,389.0 L392,377.0 408,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-3\" stroke-width=\"2px\" d=\"M290,387.0 C290,277.0 465.0,277.0 465.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-3\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">pobj</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M465.0,389.0 L473.0,377.0 457.0,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-4\" stroke-width=\"2px\" d=\"M180,387.0 C180,167.0 585.0,167.0 585.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-4\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">prep</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M585.0,389.0 L593.0,377.0 577.0,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-5\" stroke-width=\"2px\" d=\"M730,387.0 C730,277.0 905.0,277.0 905.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-5\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">det</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M730,389.0 L722,377.0 738,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-6\" stroke-width=\"2px\" d=\"M840,387.0 C840,332.0 900.0,332.0 900.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-6\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">compound</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M840,389.0 L832,377.0 848,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-7\" stroke-width=\"2px\" d=\"M620,387.0 C620,222.0 910.0,222.0 910.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-7\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">pobj</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M910.0,389.0 L918.0,377.0 902.0,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-8\" stroke-width=\"2px\" d=\"M180,387.0 C180,57.0 1035.0,57.0 1035.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-8\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">prep</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M1035.0,389.0 L1043.0,377.0 1027.0,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-9\" stroke-width=\"2px\" d=\"M1170,387.0 C1170,332.0 1230.0,332.0 1230.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-9\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">det</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M1170,389.0 L1162,377.0 1178,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-10\" stroke-width=\"2px\" d=\"M1060,387.0 C1060,277.0 1235.0,277.0 1235.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-10\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">pobj</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M1235.0,389.0 L1243.0,377.0 1227.0,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-11\" stroke-width=\"2px\" d=\"M1280,387.0 C1280,332.0 1340.0,332.0 1340.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-11\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">prep</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M1340.0,389.0 L1348.0,377.0 1332.0,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-12\" stroke-width=\"2px\" d=\"M1500,387.0 C1500,112.0 2020.0,112.0 2020.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-12\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">det</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M1500,389.0 L1492,377.0 1508,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-13\" stroke-width=\"2px\" d=\"M1500,387.0 C1500,332.0 1560.0,332.0 1560.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-13\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\"></textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M1560.0,389.0 L1568.0,377.0 1552.0,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-14\" stroke-width=\"2px\" d=\"M1720,387.0 C1720,332.0 1780.0,332.0 1780.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-14\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">advmod</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M1720,389.0 L1712,377.0 1728,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-15\" stroke-width=\"2px\" d=\"M1830,387.0 C1830,277.0 2005.0,277.0 2005.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-15\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">amod</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M1830,389.0 L1822,377.0 1838,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-16\" stroke-width=\"2px\" d=\"M1940,387.0 C1940,332.0 2000.0,332.0 2000.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-16\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">compound</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M1940,389.0 L1932,377.0 1948,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-17\" stroke-width=\"2px\" d=\"M180,387.0 C180,2.0 2030.0,2.0 2030.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-17\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">punct</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M2030.0,389.0 L2038.0,377.0 2022.0,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-18\" stroke-width=\"2px\" d=\"M2050,387.0 C2050,332.0 2110.0,332.0 2110.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-18\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\"></textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M2110.0,389.0 L2118.0,377.0 2102.0,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"</svg>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.HTML object>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**6. Show the first two named entities from Beatrix Potter's *The Tale of Peter Rabbit* **"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"The Tale of Peter Rabbit - WORK_OF_ART - Titles of books, songs, etc.\n",
|
||||||
|
"Beatrix Potter - PERSON - People, including fictional\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**7. How many sentences are contained in *The Tale of Peter Rabbit*?**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"56"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**8. CHALLENGE: How many sentences contain named entities?**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"49"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**9. CHALLENGE: Display the named entity visualization for `list_of_sents[0]` from the previous problem**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div class=\"entities\" style=\"line-height: 2.5\">\n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #f0d0ff; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" The Tale of Peter Rabbit\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">WORK_OF_ART</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
", by \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" Beatrix Potter\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PERSON</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" (\n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #bfe1d9; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" 1902\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">DATE</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
").\n",
|
||||||
|
"\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.HTML object>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Great Job!"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
@ -0,0 +1,619 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Parts of Speech Assessment - Solutions"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"For this assessment we'll be using the short story [The Tale of Peter Rabbit](https://en.wikipedia.org/wiki/The_Tale_of_Peter_Rabbit) by Beatrix Potter (1902). <br>The story is in the public domain; the text file was obtained from [Project Gutenberg](https://www.gutenberg.org/ebooks/14838.txt.utf-8)."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# RUN THIS CELL to perform standard imports:\n",
|
||||||
|
"import spacy\n",
|
||||||
|
"nlp = spacy.load('en_core_web_sm')\n",
|
||||||
|
"from spacy import displacy"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**1. Create a Doc object from the file `peterrabbit.txt`**<br>\n",
|
||||||
|
"> HINT: Use `with open('../TextFiles/peterrabbit.txt') as f:`"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"with open('../TextFiles/peterrabbit.txt') as f:\n",
|
||||||
|
" doc = nlp(f.read())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**2. For every token in the third sentence, print the token text, the POS tag, the fine-grained TAG tag, and the description of the fine-grained tag.**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"They PRON PRP pronoun, personal\n",
|
||||||
|
"lived VERB VBD verb, past tense\n",
|
||||||
|
"with ADP IN conjunction, subordinating or preposition\n",
|
||||||
|
"their ADJ PRP$ pronoun, possessive\n",
|
||||||
|
"Mother PROPN NNP noun, proper singular\n",
|
||||||
|
"in ADP IN conjunction, subordinating or preposition\n",
|
||||||
|
"a DET DT determiner\n",
|
||||||
|
"sand NOUN NN noun, singular or mass\n",
|
||||||
|
"- PUNCT HYPH punctuation mark, hyphen\n",
|
||||||
|
"bank NOUN NN noun, singular or mass\n",
|
||||||
|
", PUNCT , punctuation mark, comma\n",
|
||||||
|
"underneath ADP IN conjunction, subordinating or preposition\n",
|
||||||
|
"the DET DT determiner\n",
|
||||||
|
"root NOUN NN noun, singular or mass\n",
|
||||||
|
"of ADP IN conjunction, subordinating or preposition\n",
|
||||||
|
"a DET DT determiner\n",
|
||||||
|
"\n",
|
||||||
|
" SPACE None\n",
|
||||||
|
"very ADV RB adverb\n",
|
||||||
|
"big ADJ JJ adjective\n",
|
||||||
|
"fir NOUN NN noun, singular or mass\n",
|
||||||
|
"- PUNCT HYPH punctuation mark, hyphen\n",
|
||||||
|
"tree NOUN NN noun, singular or mass\n",
|
||||||
|
". PUNCT . punctuation mark, sentence closer\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
" SPACE _SP None\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Enter your code here:\n",
|
||||||
|
"\n",
|
||||||
|
"for token in list(doc.sents)[2]:\n",
|
||||||
|
" print(f'{token.text:{12}} {token.pos_:{6}} {token.tag_:{6}} {spacy.explain(token.tag_)}')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**3. Provide a frequency list of POS tags from the entire document**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"83. ADJ : 83\n",
|
||||||
|
"84. ADP : 127\n",
|
||||||
|
"85. ADV : 75\n",
|
||||||
|
"88. CCONJ: 61\n",
|
||||||
|
"89. DET : 90\n",
|
||||||
|
"91. NOUN : 176\n",
|
||||||
|
"92. NUM : 8\n",
|
||||||
|
"93. PART : 36\n",
|
||||||
|
"94. PRON : 72\n",
|
||||||
|
"95. PROPN: 75\n",
|
||||||
|
"96. PUNCT: 174\n",
|
||||||
|
"99. VERB : 182\n",
|
||||||
|
"102. SPACE: 99\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"POS_counts = doc.count_by(spacy.attrs.POS)\n",
|
||||||
|
"\n",
|
||||||
|
"for k,v in sorted(POS_counts.items()):\n",
|
||||||
|
" print(f'{k}. {doc.vocab[k].text:{5}}: {v}')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**4. CHALLENGE: What percentage of tokens are nouns?**<br>\n",
|
||||||
|
"HINT: the attribute ID for 'NOUN' is 91"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"176/1258 = 13.99%\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"percent = 100*POS_counts[91]/len(doc)\n",
|
||||||
|
"\n",
|
||||||
|
"print(f'{POS_counts[91]}/{len(doc)} = {percent:{.4}}%')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**5. Display the Dependency Parse for the third sentence**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" id=\"0\" class=\"displacy\" width=\"2250\" height=\"522.0\" style=\"max-width: none; height: 522.0px; color: #000000; background: #ffffff; font-family: Arial\">\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"50\">They</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"50\">PRON</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"160\">lived</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"160\">VERB</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"270\">with</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"270\">ADP</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"380\">their</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"380\">ADJ</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"490\">Mother</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"490\">PROPN</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"600\">in</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"600\">ADP</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"710\">a</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"710\">DET</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"820\">sand-</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"820\">NOUN</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"930\">bank,</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"930\">NOUN</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1040\">underneath</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1040\">ADP</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1150\">the</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1150\">DET</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1260\">root</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1260\">NOUN</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1370\">of</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1370\">ADP</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1480\">a</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1480\">DET</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1590\">\n",
|
||||||
|
"</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1590\"></tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1700\">very</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1700\">ADV</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1810\">big</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1810\">ADJ</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"1920\">fir-</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"1920\">NOUN</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"2030\">tree.</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"2030\">NOUN</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<text class=\"displacy-token\" fill=\"currentColor\" text-anchor=\"middle\" y=\"432.0\">\n",
|
||||||
|
" <tspan class=\"displacy-word\" fill=\"currentColor\" x=\"2140\">\n",
|
||||||
|
"\n",
|
||||||
|
"</tspan>\n",
|
||||||
|
" <tspan class=\"displacy-tag\" dy=\"2em\" fill=\"currentColor\" x=\"2140\">SPACE</tspan>\n",
|
||||||
|
"</text>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-0\" stroke-width=\"2px\" d=\"M70,387.0 C70,332.0 130.0,332.0 130.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-0\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">nsubj</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M70,389.0 L62,377.0 78,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-1\" stroke-width=\"2px\" d=\"M180,387.0 C180,332.0 240.0,332.0 240.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-1\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">prep</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M240.0,389.0 L248.0,377.0 232.0,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-2\" stroke-width=\"2px\" d=\"M400,387.0 C400,332.0 460.0,332.0 460.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-2\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">poss</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M400,389.0 L392,377.0 408,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-3\" stroke-width=\"2px\" d=\"M290,387.0 C290,277.0 465.0,277.0 465.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-3\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">pobj</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M465.0,389.0 L473.0,377.0 457.0,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-4\" stroke-width=\"2px\" d=\"M180,387.0 C180,167.0 585.0,167.0 585.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-4\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">prep</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M585.0,389.0 L593.0,377.0 577.0,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-5\" stroke-width=\"2px\" d=\"M730,387.0 C730,277.0 905.0,277.0 905.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-5\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">det</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M730,389.0 L722,377.0 738,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-6\" stroke-width=\"2px\" d=\"M840,387.0 C840,332.0 900.0,332.0 900.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-6\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">compound</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M840,389.0 L832,377.0 848,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-7\" stroke-width=\"2px\" d=\"M620,387.0 C620,222.0 910.0,222.0 910.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-7\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">pobj</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M910.0,389.0 L918.0,377.0 902.0,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-8\" stroke-width=\"2px\" d=\"M180,387.0 C180,57.0 1035.0,57.0 1035.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-8\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">prep</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M1035.0,389.0 L1043.0,377.0 1027.0,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-9\" stroke-width=\"2px\" d=\"M1170,387.0 C1170,332.0 1230.0,332.0 1230.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-9\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">det</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M1170,389.0 L1162,377.0 1178,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-10\" stroke-width=\"2px\" d=\"M1060,387.0 C1060,277.0 1235.0,277.0 1235.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-10\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">pobj</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M1235.0,389.0 L1243.0,377.0 1227.0,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-11\" stroke-width=\"2px\" d=\"M1280,387.0 C1280,332.0 1340.0,332.0 1340.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-11\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">prep</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M1340.0,389.0 L1348.0,377.0 1332.0,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-12\" stroke-width=\"2px\" d=\"M1500,387.0 C1500,112.0 2020.0,112.0 2020.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-12\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">det</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M1500,389.0 L1492,377.0 1508,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-13\" stroke-width=\"2px\" d=\"M1500,387.0 C1500,332.0 1560.0,332.0 1560.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-13\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\"></textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M1560.0,389.0 L1568.0,377.0 1552.0,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-14\" stroke-width=\"2px\" d=\"M1720,387.0 C1720,332.0 1780.0,332.0 1780.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-14\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">advmod</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M1720,389.0 L1712,377.0 1728,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-15\" stroke-width=\"2px\" d=\"M1830,387.0 C1830,277.0 2005.0,277.0 2005.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-15\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">amod</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M1830,389.0 L1822,377.0 1838,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-16\" stroke-width=\"2px\" d=\"M1940,387.0 C1940,332.0 2000.0,332.0 2000.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-16\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">compound</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M1940,389.0 L1932,377.0 1948,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-17\" stroke-width=\"2px\" d=\"M180,387.0 C180,2.0 2030.0,2.0 2030.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-17\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\">punct</textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M2030.0,389.0 L2038.0,377.0 2022.0,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"\n",
|
||||||
|
"<g class=\"displacy-arrow\">\n",
|
||||||
|
" <path class=\"displacy-arc\" id=\"arrow-0-18\" stroke-width=\"2px\" d=\"M2050,387.0 C2050,332.0 2110.0,332.0 2110.0,387.0\" fill=\"none\" stroke=\"currentColor\"/>\n",
|
||||||
|
" <text dy=\"1.25em\" style=\"font-size: 0.8em; letter-spacing: 1px\">\n",
|
||||||
|
" <textPath xlink:href=\"#arrow-0-18\" class=\"displacy-label\" startOffset=\"50%\" fill=\"currentColor\" text-anchor=\"middle\"></textPath>\n",
|
||||||
|
" </text>\n",
|
||||||
|
" <path class=\"displacy-arrowhead\" d=\"M2110.0,389.0 L2118.0,377.0 2102.0,377.0\" fill=\"currentColor\"/>\n",
|
||||||
|
"</g>\n",
|
||||||
|
"</svg>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.HTML object>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"displacy.render(list(doc.sents)[2], style='dep', jupyter=True, options={'distance': 110})"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**6. Show the first two named entities from Beatrix Potter's *The Tale of Peter Rabbit* **"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"The Tale of Peter Rabbit - WORK_OF_ART - Titles of books, songs, etc.\n",
|
||||||
|
"Beatrix Potter - PERSON - People, including fictional\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for ent in doc.ents[:2]:\n",
|
||||||
|
" print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_)))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**7. How many sentences are contained in *The Tale of Peter Rabbit*?**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"56"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"len([sent for sent in doc.sents])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**8. CHALLENGE: How many sentences contain named entities?**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"49"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"list_of_sents = [nlp(sent.text) for sent in doc.sents]\n",
|
||||||
|
"list_of_ners = [doc for doc in list_of_sents if doc.ents]\n",
|
||||||
|
"len(list_of_ners)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**9. CHALLENGE: Display the named entity visualization for `list_of_sents[0]` from the previous problem**"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div class=\"entities\" style=\"line-height: 2.5\">\n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #f0d0ff; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" The Tale of Peter Rabbit\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">WORK_OF_ART</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
", by \n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #aa9cfc; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" Beatrix Potter\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">PERSON</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
" (\n",
|
||||||
|
"<mark class=\"entity\" style=\"background: #bfe1d9; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone\">\n",
|
||||||
|
" 1902\n",
|
||||||
|
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem\">DATE</span>\n",
|
||||||
|
"</mark>\n",
|
||||||
|
").\n",
|
||||||
|
"\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.HTML object>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"displacy.render(list_of_sents[0], style='ent', jupyter=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Great Job!"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
@ -0,0 +1,199 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# POS Challenge - OPTIONAL\n",
|
||||||
|
"Just for fun, we've developed a game to test your knowledge of Part of Speech tags. The object of the game is to write a body of text that contains as many different ** *fine-grained tags* ** as possible. The highest possible score is 100 (or thereabouts). Points are awarded for the number of unique tags used, and for the fewest possible tokens used. Below is an example. Feel free to post your results in the Q&A Forum for this lecture, and good luck!"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Perform standard imports\n",
|
||||||
|
"import spacy\n",
|
||||||
|
"nlp = spacy.load('en_core_web_sm')\n",
|
||||||
|
"\n",
|
||||||
|
"# Import the game script\n",
|
||||||
|
"import game"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Enter your text here:\n",
|
||||||
|
"text = u\"The quick brown fox jumped over the lazy dog's back.\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Unique tags: 7\n",
|
||||||
|
"Tokens used: 12\n",
|
||||||
|
"SCORE: 9\n",
|
||||||
|
"CONGRATULATIONS!\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Make your Doc object and pass it into the scorer:\n",
|
||||||
|
"doc = nlp(text)\n",
|
||||||
|
"print(game.scorer(doc))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"TOKEN COARSE FINE DESCRIPTION\n",
|
||||||
|
"----- ------ ---- -----------\n",
|
||||||
|
"The DET DT determiner\n",
|
||||||
|
"quick ADJ JJ adjective\n",
|
||||||
|
"brown ADJ JJ adjective\n",
|
||||||
|
"fox NOUN NN noun, singular or mass\n",
|
||||||
|
"jumped VERB VBD verb, past tense\n",
|
||||||
|
"over ADP IN conjunction, subordinating or preposition\n",
|
||||||
|
"the DET DT determiner\n",
|
||||||
|
"lazy ADJ JJ adjective\n",
|
||||||
|
"dog NOUN NN noun, singular or mass\n",
|
||||||
|
"'s PART POS possessive ending\n",
|
||||||
|
"back NOUN NN noun, singular or mass\n",
|
||||||
|
". PUNCT . punctuation mark, sentence closer\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# For practice, visualize your fine-grained POS tags (shown in the third column):\n",
|
||||||
|
"print(f\"{'TOKEN':{10}} {'COARSE':{8}} {'FINE':{6}} {'DESCRIPTION'}\")\n",
|
||||||
|
"print(f\"{'-----':{10}} {'------':{8}} {'----':{6}} {'-----------'}\")\n",
|
||||||
|
"\n",
|
||||||
|
"for token in doc:\n",
|
||||||
|
" print(f'{token.text:{10}} {token.pos_:{8}} {token.tag_:{6}} {spacy.explain(token.tag_)}')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"## Fine-grained Part-of-speech Tags\n",
|
||||||
|
"\n",
|
||||||
|
"<table>\n",
|
||||||
|
"<tr><th></th><th>Coarse POS Tag</th><th>Description</th><th>Fine-grained Tag</th><th>Description</th><th>Morphology</th></tr>\n",
|
||||||
|
"<tr><td>1.</td><td>ADJ</td><td>adjective</td><td>AFX</td><td>affix</td><td>Hyph=yes</td></tr>\n",
|
||||||
|
"<tr><td>2.</td><td>ADJ</td><td></td><td>JJ</td><td>adjective</td><td>Degree=pos</td></tr>\n",
|
||||||
|
"<tr><td>3.</td><td>ADJ</td><td></td><td>JJR</td><td>adjective, comparative</td><td>Degree=comp</td></tr>\n",
|
||||||
|
"<tr><td>4.</td><td>ADJ</td><td></td><td>JJS</td><td>adjective, superlative</td><td>Degree=sup</td></tr>\n",
|
||||||
|
"<tr><td>5.</td><td>ADJ</td><td></td><td>PDT</td><td>predeterminer</td><td>AdjType=pdt PronType=prn</td></tr>\n",
|
||||||
|
"<tr><td>6.</td><td>ADJ</td><td></td><td>PRP\\$</td><td>pronoun, possessive</td><td>PronType=prs Poss=yes</td></tr>\n",
|
||||||
|
"<tr><td>7.</td><td>ADJ</td><td></td><td>WDT</td><td>wh-determiner</td><td>PronType=int rel</td></tr>\n",
|
||||||
|
"<tr><td>8.</td><td>ADJ</td><td></td><td>WP\\$</td><td>wh-pronoun, possessive</td><td>Poss=yes PronType=int rel</td></tr>\n",
|
||||||
|
"<tr><td>9.</td><td>ADP</td><td>adposition</td><td>IN</td><td>conjunction, subordinating or preposition</td><td></td></tr>\n",
|
||||||
|
"<tr><td>10.</td><td>ADV</td><td>adverb</td><td>EX</td><td>existential there</td><td>AdvType=ex</td></tr>\n",
|
||||||
|
"<tr><td>11.</td><td>ADV</td><td></td><td>RB</td><td>adverb</td><td>Degree=pos</td></tr>\n",
|
||||||
|
"<tr><td>12.</td><td>ADV</td><td></td><td>RBR</td><td>adverb, comparative</td><td>Degree=comp</td></tr>\n",
|
||||||
|
"<tr><td>13.</td><td>ADV</td><td></td><td>RBS</td><td>adverb, superlative</td><td>Degree=sup</td></tr>\n",
|
||||||
|
"<tr><td>14.</td><td>ADV</td><td></td><td>WRB</td><td>wh-adverb</td><td>PronType=int rel</td></tr>\n",
|
||||||
|
"<tr><td>15.</td><td>CONJ</td><td>conjunction</td><td>CC</td><td>conjunction, coordinating</td><td>ConjType=coor</td></tr>\n",
|
||||||
|
"<tr><td>16.</td><td>DET</td><td>determiner</td><td>DT</td><td></td><td>determiner</td></tr>\n",
|
||||||
|
"<tr><td>17.</td><td>INTJ</td><td>interjection</td><td>UH</td><td>interjection</td><td></td></tr>\n",
|
||||||
|
"<tr><td>18.</td><td>NOUN</td><td>noun</td><td>NN</td><td>noun, singular or mass</td><td>Number=sing</td></tr>\n",
|
||||||
|
"<tr><td>19.</td><td>NOUN</td><td></td><td>NNS</td><td>noun, plural</td><td>Number=plur</td></tr>\n",
|
||||||
|
"<tr><td>20.</td><td>NOUN</td><td></td><td>WP</td><td>wh-pronoun, personal</td><td>PronType=int rel</td></tr>\n",
|
||||||
|
"<tr><td>21.</td><td>NUM</td><td>numeral</td><td>CD</td><td>cardinal number</td><td>NumType=card</td></tr>\n",
|
||||||
|
"<tr><td>22.</td><td>PART</td><td>particle</td><td>POS</td><td>possessive ending</td><td>Poss=yes</td></tr>\n",
|
||||||
|
"<tr><td>23.</td><td>PART</td><td></td><td>RP</td><td>adverb, particle</td><td></td></tr>\n",
|
||||||
|
"<tr><td>24.</td><td>PART</td><td></td><td>TO</td><td>infinitival to</td><td>PartType=inf VerbForm=inf</td></tr>\n",
|
||||||
|
"<tr><td>25.</td><td>PRON</td><td>pronoun</td><td>PRP</td><td>pronoun, personal</td><td>PronType=prs</td></tr>\n",
|
||||||
|
"<tr><td>26.</td><td>PROPN</td><td>proper noun</td><td>NNP</td><td>noun, proper singular</td><td>NounType=prop Number=sign</td></tr>\n",
|
||||||
|
"<tr><td>27.</td><td>PROPN</td><td></td><td>NNPS</td><td>noun, proper plural</td><td>NounType=prop Number=plur</td></tr>\n",
|
||||||
|
"<tr><td>28.</td><td>PUNCT</td><td>punctuation</td><td>-LRB-</td><td>left round bracket</td><td>PunctType=brck PunctSide=ini</td></tr>\n",
|
||||||
|
"<tr><td>29.</td><td>PUNCT</td><td></td><td>-RRB-</td><td>right round bracket</td><td>PunctType=brck PunctSide=fin</td></tr>\n",
|
||||||
|
"<tr><td>30.</td><td>PUNCT</td><td></td><td>,</td><td>punctuation mark, comma</td><td>PunctType=comm</td></tr>\n",
|
||||||
|
"<tr><td>31.</td><td>PUNCT</td><td></td><td>:</td><td>punctuation mark, colon or ellipsis</td><td></td></tr>\n",
|
||||||
|
"<tr><td>32.</td><td>PUNCT</td><td></td><td>.</td><td>punctuation mark, sentence closer</td><td>PunctType=peri</td></tr>\n",
|
||||||
|
"<tr><td>33.</td><td>PUNCT</td><td></td><td>''</td><td>closing quotation mark</td><td>PunctType=quot PunctSide=fin</td></tr>\n",
|
||||||
|
"<tr><td>34.</td><td>PUNCT</td><td></td><td>\"\"</td><td>closing quotation mark</td><td>PunctType=quot PunctSide=fin</td></tr>\n",
|
||||||
|
"<tr><td>35.</td><td>PUNCT</td><td></td><td>``</td><td>opening quotation mark</td><td>PunctType=quot PunctSide=ini</td></tr>\n",
|
||||||
|
"<tr><td>36.</td><td>PUNCT</td><td></td><td>HYPH</td><td>punctuation mark, hyphen</td><td>PunctType=dash</td></tr>\n",
|
||||||
|
"<tr><td>37.</td><td>PUNCT</td><td></td><td>LS</td><td>list item marker</td><td>NumType=ord</td></tr>\n",
|
||||||
|
"<tr><td>38.</td><td>PUNCT</td><td></td><td>NFP</td><td>superfluous punctuation</td><td></td></tr>\n",
|
||||||
|
"<tr><td>39.</td><td>SYM</td><td>symbol</td><td>#</td><td>symbol, number sign</td><td>SymType=numbersign</td></tr>\n",
|
||||||
|
"<tr><td>40.</td><td>SYM</td><td></td><td>\\$</td><td>symbol, currency</td><td>SymType=currency</td></tr>\n",
|
||||||
|
"<tr><td>41.</td><td>SYM</td><td></td><td>SYM</td><td>symbol</td><td></td></tr>\n",
|
||||||
|
"<tr><td>42.</td><td>VERB</td><td>verb</td><td>BES</td><td>auxiliary \"be\"</td><td></td></tr>\n",
|
||||||
|
"<tr><td>43.</td><td>VERB</td><td></td><td>HVS</td><td>forms of \"have\"</td><td></td></tr>\n",
|
||||||
|
"<tr><td>44.</td><td>VERB</td><td></td><td>MD</td><td>verb, modal auxiliary</td><td>VerbType=mod</td></tr>\n",
|
||||||
|
"<tr><td>45.</td><td>VERB</td><td></td><td>VB</td><td>verb, base form</td><td>VerbForm=inf</td></tr>\n",
|
||||||
|
"<tr><td>46.</td><td>VERB</td><td></td><td>VBD</td><td>verb, past tense</td><td>VerbForm=fin Tense=past</td></tr>\n",
|
||||||
|
"<tr><td>47.</td><td>VERB</td><td></td><td>VBG</td><td>verb, gerund or present participle</td><td>VerbForm=part Tense=pres Aspect=prog</td></tr>\n",
|
||||||
|
"<tr><td>48.</td><td>VERB</td><td></td><td>VBN</td><td>verb, past participle</td><td>VerbForm=part Tense=past Aspect=perf</td></tr>\n",
|
||||||
|
"<tr><td>49.</td><td>VERB</td><td></td><td>VBP</td><td>verb, non-3rd person singular present</td><td>VerbForm=fin Tense=pres</td></tr>\n",
|
||||||
|
"<tr><td>50.</td><td>VERB</td><td></td><td>VBZ</td><td>verb, 3rd person singular present</td><td>VerbForm=fin Tense=pres Number=sing Person=3</td></tr>\n",
|
||||||
|
"<tr><td>51.</td><td>X</td><td>other</td><td>ADD</td><td>email</td><td></td></tr>\n",
|
||||||
|
"<tr><td>52.</td><td>X</td><td></td><td>FW</td><td>foreign word</td><td>Foreign=yes</td></tr>\n",
|
||||||
|
"<tr><td>53.</td><td>X</td><td></td><td>GW</td><td>additional word in multi-word expression</td><td></td></tr>\n",
|
||||||
|
"<tr><td>54.</td><td>X</td><td></td><td>XX</td><td>unknown</td><td></td></tr>\n",
|
||||||
|
"<tr><td>55.</td><td>SPACE</td><td>space</td><td>_SP</td><td>space</td><td></td></tr>\n",
|
||||||
|
"<tr><td>56.</td><td></td><td></td><td>NIL</td><td>missing tag</td><td></td></tr>\n",
|
||||||
|
"</table>"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
Binary file not shown.
10
Praktikum Python Code/02-Parts-of-Speech-Tagging/game.py
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
def scorer(doc):
|
||||||
|
tags = ['AFX', 'JJ', 'JJR', 'JJS', 'PDT', 'PRP$', 'WDT', 'WP$', 'IN', 'EX', 'RB', 'RBR', 'RBS', 'WRB', 'CC', 'DT', 'UH', 'NN', 'NNS', 'WP', 'CD', 'POS', 'RP', 'TO', 'PRP', 'NNP', 'NNPS', '-LRB-', '-RRB-', ',', ':', '.', "''", '""', '``', 'HYPH', 'LS', 'NFP', '_SP', '#', '$', 'SYM', 'BES', 'HVS', 'MD', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'ADD', 'FW', 'GW', 'XX', 'NIL']
|
||||||
|
counter = 0
|
||||||
|
for tag in tags:
|
||||||
|
for token in doc:
|
||||||
|
if token.tag_ == tag:
|
||||||
|
counter+=1
|
||||||
|
break
|
||||||
|
    score = max(counter*3 - len(doc), counter)  # 3 points per unique tag, minus 1 per token, never below the tag count
|
||||||
|
return f'Unique tags: {counter}\nTokens used: {len(doc)}\nSCORE: {score}\nCONGRATULATIONS!'
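# Example scoring, matching the notebook output above: the sample sentence parses
# into 12 tokens using 7 unique fine-grained tags (DT, JJ, NN, VBD, IN, POS, .),
# so score = max(7*3 - 12, 7) = 9.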
|
||||||
@ -0,0 +1,800 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"This unit is divided into two sections:\n",
|
||||||
|
"* First, we'll find out what what is necessary to build an NLP system that can turn a body of text into a numerical array of *features*.\n",
|
||||||
|
"* Next we'll show how to perform these steps using real tools."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Building a Natural Language Processor From Scratch\n",
|
||||||
|
"In this section we'll use basic Python to build a rudimentary NLP system. We'll build a *corpus of documents* (two small text files), create a *vocabulary* from all the words in both documents, and then demonstrate a *Bag of Words* technique to extract features from each document.<br>\n",
|
||||||
|
"<div class=\"alert alert-info\" style=\"margin: 20px\">**This first section is for illustration only!**\n",
|
||||||
|
"<br>Don't bother memorizing the code - we'd never do this in real life.</div>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Start with some documents:\n",
|
||||||
|
"For simplicity we won't use any punctuation."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Overwriting 1.txt\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"%%writefile 1.txt\n",
|
||||||
|
"This is a story about cats\n",
|
||||||
|
"our feline pets\n",
|
||||||
|
"Cats are furry animals"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Overwriting 2.txt\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"%%writefile 2.txt\n",
|
||||||
|
"This story is about surfing\n",
|
||||||
|
"Catching waves is fun\n",
|
||||||
|
"Surfing is a popular water sport"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Build a vocabulary\n",
|
||||||
|
"The goal here is to build a numerical array from all the words that appear in every document. Later we'll create instances (vectors) for each individual document."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"{'this': 1, 'is': 2, 'a': 3, 'story': 4, 'about': 5, 'cats': 6, 'our': 7, 'feline': 8, 'pets': 9, 'are': 10, 'furry': 11, 'animals': 12}\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"vocab = {}\n",
|
||||||
|
"i = 1\n",
|
||||||
|
"\n",
|
||||||
|
"with open('1.txt') as f:\n",
|
||||||
|
" x = f.read().lower().split()\n",
|
||||||
|
"\n",
|
||||||
|
"for word in x:\n",
|
||||||
|
" if word in vocab:\n",
|
||||||
|
" continue\n",
|
||||||
|
" else:\n",
|
||||||
|
" vocab[word]=i\n",
|
||||||
|
" i+=1\n",
|
||||||
|
"\n",
|
||||||
|
"print(vocab)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"{'this': 1, 'is': 2, 'a': 3, 'story': 4, 'about': 5, 'cats': 6, 'our': 7, 'feline': 8, 'pets': 9, 'are': 10, 'furry': 11, 'animals': 12, 'surfing': 13, 'catching': 14, 'waves': 15, 'fun': 16, 'popular': 17, 'water': 18, 'sport': 19}\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"with open('2.txt') as f:\n",
|
||||||
|
" x = f.read().lower().split()\n",
|
||||||
|
"\n",
|
||||||
|
"for word in x:\n",
|
||||||
|
" if word in vocab:\n",
|
||||||
|
" continue\n",
|
||||||
|
" else:\n",
|
||||||
|
" vocab[word]=i\n",
|
||||||
|
" i+=1\n",
|
||||||
|
"\n",
|
||||||
|
"print(vocab)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Even though `2.txt` has 15 words, only 7 new words were added to the dictionary.\n",
|
||||||
|
"\n",
|
||||||
|
"## Feature Extraction\n",
|
||||||
|
"Now that we've encapsulated our \"entire language\" in a dictionary, let's perform *feature extraction* on each of our original documents:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['1.txt', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Create an empty vector with space for each word in the vocabulary:\n",
|
||||||
|
"one = ['1.txt']+[0]*len(vocab)\n",
|
||||||
|
"one"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['1.txt', 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# map the frequencies of each word in 1.txt to our vector:\n",
|
||||||
|
"with open('1.txt') as f:\n",
|
||||||
|
" x = f.read().lower().split()\n",
|
||||||
|
" \n",
|
||||||
|
"for word in x:\n",
|
||||||
|
" one[vocab[word]]+=1\n",
|
||||||
|
" \n",
|
||||||
|
"one"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>We can see that most of the words in 1.txt appear only once, although \"cats\" appears twice.</font>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Do the same for the second document:\n",
|
||||||
|
"two = ['2.txt']+[0]*len(vocab)\n",
|
||||||
|
"\n",
|
||||||
|
"with open('2.txt') as f:\n",
|
||||||
|
" x = f.read().lower().split()\n",
|
||||||
|
" \n",
|
||||||
|
"for word in x:\n",
|
||||||
|
" two[vocab[word]]+=1"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"['1.txt', 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]\n",
|
||||||
|
"['2.txt', 1, 3, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Compare the two vectors:\n",
|
||||||
|
"print(f'{one}\\n{two}')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"By comparing the vectors we see that some words are common to both, some appear only in `1.txt`, others only in `2.txt`. Extending this logic to tens of thousands of documents, we would see the vocabulary dictionary grow to hundreds of thousands of words. Vectors would contain mostly zero values, making them *sparse matrices*."
|
||||||
|
]
|
||||||
|
},
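To make that last point concrete, here is a small sketch (not part of the original lesson) that stores the two count vectors above in a SciPy sparse matrix, which records only the non-zero entries:

```python
import numpy as np
from scipy.sparse import csr_matrix

# Numeric parts of the two document vectors built above (the file-name labels dropped):
counts = np.array([[1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
                   [1, 3, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1]])

sparse_counts = csr_matrix(counts)
print(sparse_counts.shape)  # (2, 19)
print(sparse_counts.nnz)    # 24 stored values instead of 38 cells
```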
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Bag of Words and Tf-idf\n",
|
||||||
|
"In the above examples, each vector can be considered a *bag of words*. By itself these may not be helpful until we consider *term frequencies*, or how often individual words appear in documents. A simple way to calculate term frequencies is to divide the number of occurrences of a word by the total number of words in the document. In this way, the number of times a word appears in large documents can be compared to that of smaller documents.\n",
|
||||||
|
"\n",
|
||||||
|
"However, it may be hard to differentiate documents based on term frequency if a word shows up in a majority of documents. To handle this we also consider *inverse document frequency*, which is the total number of documents divided by the number of documents that contain the word. In practice we convert this value to a logarithmic scale, as described [here](https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Inverse_document_frequency).\n",
|
||||||
|
"\n",
|
||||||
|
"Together these terms become [**tf-idf**](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)."
|
||||||
|
]
|
||||||
|
},
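To make the arithmetic concrete, here is a minimal sketch (not from the original notebook) that computes a raw term frequency and a log-scaled inverse document frequency for the word "cats" in our two tiny documents. Note that scikit-learn's TfidfTransformer, used later, applies a slightly smoothed variant of the same idea.

```python
import math

docs = [
    'this is a story about cats our feline pets cats are furry animals'.split(),
    'this story is about surfing catching waves is fun surfing is a popular water sport'.split(),
]
word = 'cats'

tf = docs[0].count(word) / len(docs[0])   # 2 occurrences / 13 words in 1.txt
df = sum(1 for d in docs if word in d)    # "cats" appears in 1 of the 2 documents
idf = math.log(len(docs) / df)            # log(2 / 1)

print(f'tf = {tf:.3f}, idf = {idf:.3f}, tf-idf = {tf*idf:.3f}')
```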
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Stop Words and Word Stems\n",
|
||||||
|
"Some words like \"the\" and \"and\" appear so frequently, and in so many documents, that we needn't bother counting them. Also, it may make sense to only record the root of a word, say `cat` in place of both `cat` and `cats`. This will shrink our vocab array and improve performance."
|
||||||
|
]
|
||||||
|
},
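A minimal sketch of the stop-word idea (the word list below is purely illustrative, not one shipped with any particular library):

```python
stop_words = {'the', 'and', 'is', 'a', 'this', 'about', 'are', 'our'}  # illustrative only

words = 'This is a story about cats our feline pets Cats are furry animals'.lower().split()
filtered = [w for w in words if w not in stop_words]
print(filtered)   # ['story', 'cats', 'feline', 'pets', 'cats', 'furry', 'animals']
```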
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Tokenization and Tagging\n",
|
||||||
|
"When we created our vectors the first thing we did was split the incoming text on whitespace with `.split()`. This was a crude form of *tokenization* - that is, dividing a document into individual words. In this simple example we didn't worry about punctuation or different parts of speech. In the real world we rely on some fairly sophisticated *morphology* to parse text appropriately.\n",
|
||||||
|
"\n",
|
||||||
|
"Once the text is divided, we can go back and *tag* our tokens with information about parts of speech, grammatical dependencies, etc. This adds more dimensions to our data and enables a deeper understanding of the context of specific documents. For this reason, vectors become ***high dimensional sparse matrices***."
|
||||||
|
]
|
||||||
|
},
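For contrast with the crude `.split()` approach, here is a short sketch of what a real tokenizer and tagger return (it assumes spaCy and its `en_core_web_sm` model are installed, as in the earlier POS sections):

```python
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp("This is a story about cats, our feline pets.")

# Unlike .split(), the tokenizer separates punctuation, and every token carries
# part-of-speech and dependency information that can be used as extra features.
for token in doc:
    print(token.text, token.pos_, token.dep_)
```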
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<div class=\"alert alert-info\" style=\"margin: 20px\">**That's the end of the first section.**\n",
|
||||||
|
"<br>In the next section we'll use scikit-learn to perform a real-life analysis.</div>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"# Feature Extraction from Text\n",
|
||||||
|
"In the **Scikit-learn Primer** lecture we applied a simple SVC classification model to the SMSSpamCollection dataset. We tried to predict the ham/spam label based on message length and punctuation counts. In this section we'll actually look at the text of each message and try to perform a classification based on content. We'll take advantage of some of scikit-learn's [feature extraction](https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction) tools.\n",
|
||||||
|
"\n",
|
||||||
|
"## Load a dataset"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style>\n",
|
||||||
|
" .dataframe thead tr:only-child th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: left;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>label</th>\n",
|
||||||
|
" <th>message</th>\n",
|
||||||
|
" <th>length</th>\n",
|
||||||
|
" <th>punct</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>ham</td>\n",
|
||||||
|
" <td>Go until jurong point, crazy.. Available only ...</td>\n",
|
||||||
|
" <td>111</td>\n",
|
||||||
|
" <td>9</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>ham</td>\n",
|
||||||
|
" <td>Ok lar... Joking wif u oni...</td>\n",
|
||||||
|
" <td>29</td>\n",
|
||||||
|
" <td>6</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>spam</td>\n",
|
||||||
|
" <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n",
|
||||||
|
" <td>155</td>\n",
|
||||||
|
" <td>6</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>ham</td>\n",
|
||||||
|
" <td>U dun say so early hor... U c already then say...</td>\n",
|
||||||
|
" <td>49</td>\n",
|
||||||
|
" <td>6</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>ham</td>\n",
|
||||||
|
" <td>Nah I don't think he goes to usf, he lives aro...</td>\n",
|
||||||
|
" <td>61</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" label message length punct\n",
|
||||||
|
"0 ham Go until jurong point, crazy.. Available only ... 111 9\n",
|
||||||
|
"1 ham Ok lar... Joking wif u oni... 29 6\n",
|
||||||
|
"2 spam Free entry in 2 a wkly comp to win FA Cup fina... 155 6\n",
|
||||||
|
"3 ham U dun say so early hor... U c already then say... 49 6\n",
|
||||||
|
"4 ham Nah I don't think he goes to usf, he lives aro... 61 2"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Perform imports and load the dataset:\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"\n",
|
||||||
|
"df = pd.read_csv('../TextFiles/smsspamcollection.tsv', sep='\\t')\n",
|
||||||
|
"df.head()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Check for missing values:\n",
|
||||||
|
"Always a good practice."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"label 0\n",
|
||||||
|
"message 0\n",
|
||||||
|
"length 0\n",
|
||||||
|
"punct 0\n",
|
||||||
|
"dtype: int64"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df.isnull().sum()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Take a quick look at the *ham* and *spam* `label` column:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"ham 4825\n",
|
||||||
|
"spam 747\n",
|
||||||
|
"Name: label, dtype: int64"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df['label'].value_counts()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>4825 out of 5572 messages, or 86.6%, are ham. This means that any text classification model we create has to perform **better than 86.6%** to beat random chance.</font>"
|
||||||
|
]
|
||||||
|
},
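That baseline can be read straight off the label counts; a quick check (sketch only):

```python
# Accuracy of a model that always predicts "ham" (the majority class):
baseline = df['label'].value_counts(normalize=True)['ham']
print(f'{baseline:.3f}')   # roughly 0.866
```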
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Split the data into train & test sets:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
|
"\n",
|
||||||
|
"X = df['message'] # this time we want to look at the text\n",
|
||||||
|
"y = df['label']\n",
|
||||||
|
"\n",
|
||||||
|
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Scikit-learn's CountVectorizer\n",
|
||||||
|
"Text preprocessing, tokenizing and the ability to filter out stopwords are all included in [CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html), which builds a dictionary of features and transforms documents to feature vectors."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"(3733, 7082)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.feature_extraction.text import CountVectorizer\n",
|
||||||
|
"count_vect = CountVectorizer()\n",
|
||||||
|
"\n",
|
||||||
|
"X_train_counts = count_vect.fit_transform(X_train)\n",
|
||||||
|
"X_train_counts.shape"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>This shows that our training set is comprised of 3733 documents, and 7082 features.</font>"
|
||||||
|
]
|
||||||
|
},
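If you are curious which word each of the 7082 columns represents, the fitted CountVectorizer keeps a word-to-column mapping (a quick sketch; the exact pairs printed depend on the training split):

```python
# The fitted vectorizer stores a {word: column_index} dictionary:
print(len(count_vect.vocabulary_))               # 7082 -- one entry per feature/column
print(list(count_vect.vocabulary_.items())[:5])  # a few (word, column) pairs
```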
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Transform Counts to Frequencies with Tf-idf\n",
|
||||||
|
"While counting words is helpful, longer documents will have higher average count values than shorter documents, even though they might talk about the same topics.\n",
|
||||||
|
"\n",
|
||||||
|
"To avoid this we can simply divide the number of occurrences of each word in a document by the total number of words in the document: these new features are called **tf** for Term Frequencies.\n",
|
||||||
|
"\n",
|
||||||
|
"Another refinement on top of **tf** is to downscale weights for words that occur in many documents in the corpus and are therefore less informative than those that occur only in a smaller portion of the corpus.\n",
|
||||||
|
"\n",
|
||||||
|
"This downscaling is called **tf–idf** for “Term Frequency times Inverse Document Frequency”.\n",
|
||||||
|
"\n",
|
||||||
|
"Both tf and tf–idf can be computed as follows using [TfidfTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html):"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"(3733, 7082)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 14,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.feature_extraction.text import TfidfTransformer\n",
|
||||||
|
"tfidf_transformer = TfidfTransformer()\n",
|
||||||
|
"\n",
|
||||||
|
"X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)\n",
|
||||||
|
"X_train_tfidf.shape"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Note: the `fit_transform()` method actually performs two operations: it fits an estimator to the data and then transforms our count-matrix to a tf-idf representation."
|
||||||
|
]
|
||||||
|
},
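A practical consequence (sketch only, not a cell from the original notebook): the test messages must go through the already fitted objects with `transform()`, not `fit_transform()`, so they are mapped into the same 7082 columns learned from the training set.

```python
# Vectorize the test set with the vocabulary and idf weights learned on X_train:
X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
print(X_test_tfidf.shape)   # (1839, 7082) -- same feature space as X_train_tfidf
```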
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Combine Steps with TfidVectorizer\n",
|
||||||
|
"In the future, we can combine the CountVectorizer and TfidTransformer steps into one using [TfidVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html):"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 15,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"(3733, 7082)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 15,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||||
|
"vectorizer = TfidfVectorizer()\n",
|
||||||
|
"\n",
|
||||||
|
"X_train_tfidf = vectorizer.fit_transform(X_train) # remember to use the original X_train set\n",
|
||||||
|
"X_train_tfidf.shape"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Train a Classifier\n",
|
||||||
|
"Here we'll introduce an SVM classifier that's similar to SVC, called [LinearSVC](https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html). LinearSVC handles sparse input better, and scales well to large numbers of samples."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 16,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n",
|
||||||
|
" intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n",
|
||||||
|
" multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n",
|
||||||
|
" verbose=0)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 16,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.svm import LinearSVC\n",
|
||||||
|
"clf = LinearSVC()\n",
|
||||||
|
"clf.fit(X_train_tfidf,y_train)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>Earlier we named our SVC classifier **svc_model**. Here we're using the more generic name **clf** (for classifier).</font>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Build a Pipeline\n",
|
||||||
|
"Remember that only our training set has been vectorized into a full vocabulary. In order to perform an analysis on our test set we'll have to submit it to the same procedures. Fortunately scikit-learn offers a [**Pipeline**](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html) class that behaves like a compound classifier."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 17,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"Pipeline(memory=None,\n",
|
||||||
|
" steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
|
||||||
|
" dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',\n",
|
||||||
|
" lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
|
||||||
|
" ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,\n",
|
||||||
|
" multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n",
|
||||||
|
" verbose=0))])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 17,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.pipeline import Pipeline\n",
|
||||||
|
"# from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||||
|
"# from sklearn.svm import LinearSVC\n",
|
||||||
|
"\n",
|
||||||
|
"text_clf = Pipeline([('tfidf', TfidfVectorizer()),\n",
|
||||||
|
" ('clf', LinearSVC()),\n",
|
||||||
|
"])\n",
|
||||||
|
"\n",
|
||||||
|
"# Feed the training data through the pipeline\n",
|
||||||
|
"text_clf.fit(X_train, y_train) "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Test the classifier and display results"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 18,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Form a prediction set\n",
|
||||||
|
"predictions = text_clf.predict(X_test)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 19,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[[1586 7]\n",
|
||||||
|
" [ 12 234]]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Report the confusion matrix\n",
|
||||||
|
"from sklearn import metrics\n",
|
||||||
|
"print(metrics.confusion_matrix(y_test,predictions))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 20,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" precision recall f1-score support\n",
|
||||||
|
"\n",
|
||||||
|
" ham 0.99 1.00 0.99 1593\n",
|
||||||
|
" spam 0.97 0.95 0.96 246\n",
|
||||||
|
"\n",
|
||||||
|
" micro avg 0.99 0.99 0.99 1839\n",
|
||||||
|
" macro avg 0.98 0.97 0.98 1839\n",
|
||||||
|
"weighted avg 0.99 0.99 0.99 1839\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Print a classification report\n",
|
||||||
|
"print(metrics.classification_report(y_test,predictions))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 21,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"0.989668297988\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Print the overall accuracy\n",
|
||||||
|
"print(metrics.accuracy_score(y_test,predictions))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Using the text of the messages, our model performed exceedingly well; it correctly predicted spam **98.97%** of the time!<br>\n",
|
||||||
|
"Now let's apply what we've learned to a text classification project involving positive and negative movie reviews.\n",
|
||||||
|
"\n",
|
||||||
|
"## Next up: Text Classification Project"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.7"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
@ -0,0 +1,800 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"This unit is divided into two sections:\n",
|
||||||
|
"* First, we'll find out what what is necessary to build an NLP system that can turn a body of text into a numerical array of *features*.\n",
|
||||||
|
"* Next we'll show how to perform these steps using real tools."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Building a Natural Language Processor From Scratch\n",
|
||||||
|
"In this section we'll use basic Python to build a rudimentary NLP system. We'll build a *corpus of documents* (two small text files), create a *vocabulary* from all the words in both documents, and then demonstrate a *Bag of Words* technique to extract features from each document.<br>\n",
|
||||||
|
"<div class=\"alert alert-info\" style=\"margin: 20px\">**This first section is for illustration only!**\n",
|
||||||
|
"<br>Don't bother memorizing the code - we'd never do this in real life.</div>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Start with some documents:\n",
|
||||||
|
"For simplicity we won't use any punctuation."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Overwriting 1.txt\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"%%writefile 1.txt\n",
|
||||||
|
"This is a story about cats\n",
|
||||||
|
"our feline pets\n",
|
||||||
|
"Cats are furry animals"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Overwriting 2.txt\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"%%writefile 2.txt\n",
|
||||||
|
"This story is about surfing\n",
|
||||||
|
"Catching waves is fun\n",
|
||||||
|
"Surfing is a popular water sport"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Build a vocabulary\n",
|
||||||
|
"The goal here is to build a numerical array from all the words that appear in every document. Later we'll create instances (vectors) for each individual document."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"{'this': 1, 'is': 2, 'a': 3, 'story': 4, 'about': 5, 'cats': 6, 'our': 7, 'feline': 8, 'pets': 9, 'are': 10, 'furry': 11, 'animals': 12}\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"vocab = {}\n",
|
||||||
|
"i = 1\n",
|
||||||
|
"\n",
|
||||||
|
"with open('1.txt') as f:\n",
|
||||||
|
" x = f.read().lower().split()\n",
|
||||||
|
"\n",
|
||||||
|
"for word in x:\n",
|
||||||
|
" if word in vocab:\n",
|
||||||
|
" continue\n",
|
||||||
|
" else:\n",
|
||||||
|
" vocab[word]=i\n",
|
||||||
|
" i+=1\n",
|
||||||
|
"\n",
|
||||||
|
"print(vocab)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"{'this': 1, 'is': 2, 'a': 3, 'story': 4, 'about': 5, 'cats': 6, 'our': 7, 'feline': 8, 'pets': 9, 'are': 10, 'furry': 11, 'animals': 12, 'surfing': 13, 'catching': 14, 'waves': 15, 'fun': 16, 'popular': 17, 'water': 18, 'sport': 19}\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"with open('2.txt') as f:\n",
|
||||||
|
" x = f.read().lower().split()\n",
|
||||||
|
"\n",
|
||||||
|
"for word in x:\n",
|
||||||
|
" if word in vocab:\n",
|
||||||
|
" continue\n",
|
||||||
|
" else:\n",
|
||||||
|
" vocab[word]=i\n",
|
||||||
|
" i+=1\n",
|
||||||
|
"\n",
|
||||||
|
"print(vocab)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Even though `2.txt` has 15 words, only 7 new words were added to the dictionary.\n",
|
||||||
|
"\n",
|
||||||
|
"## Feature Extraction\n",
|
||||||
|
"Now that we've encapsulated our \"entire language\" in a dictionary, let's perform *feature extraction* on each of our original documents:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['1.txt', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Create an empty vector with space for each word in the vocabulary:\n",
|
||||||
|
"one = ['1.txt']+[0]*len(vocab)\n",
|
||||||
|
"one"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['1.txt', 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# map the frequencies of each word in 1.txt to our vector:\n",
|
||||||
|
"with open('1.txt') as f:\n",
|
||||||
|
" x = f.read().lower().split()\n",
|
||||||
|
" \n",
|
||||||
|
"for word in x:\n",
|
||||||
|
" one[vocab[word]]+=1\n",
|
||||||
|
" \n",
|
||||||
|
"one"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>We can see that most of the words in 1.txt appear only once, although \"cats\" appears twice.</font>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Do the same for the second document:\n",
|
||||||
|
"two = ['2.txt']+[0]*len(vocab)\n",
|
||||||
|
"\n",
|
||||||
|
"with open('2.txt') as f:\n",
|
||||||
|
" x = f.read().lower().split()\n",
|
||||||
|
" \n",
|
||||||
|
"for word in x:\n",
|
||||||
|
" two[vocab[word]]+=1"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"['1.txt', 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]\n",
|
||||||
|
"['2.txt', 1, 3, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Compare the two vectors:\n",
|
||||||
|
"print(f'{one}\\n{two}')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"By comparing the vectors we see that some words are common to both, some appear only in `1.txt`, others only in `2.txt`. Extending this logic to tens of thousands of documents, we would see the vocabulary dictionary grow to hundreds of thousands of words. Vectors would contain mostly zero values, making them *sparse matrices*."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Bag of Words and Tf-idf\n",
|
||||||
|
"In the above examples, each vector can be considered a *bag of words*. By itself these may not be helpful until we consider *term frequencies*, or how often individual words appear in documents. A simple way to calculate term frequencies is to divide the number of occurrences of a word by the total number of words in the document. In this way, the number of times a word appears in large documents can be compared to that of smaller documents.\n",
|
||||||
|
"\n",
|
||||||
|
"However, it may be hard to differentiate documents based on term frequency if a word shows up in a majority of documents. To handle this we also consider *inverse document frequency*, which is the total number of documents divided by the number of documents that contain the word. In practice we convert this value to a logarithmic scale, as described [here](https://en.wikipedia.org/wiki/Tf%E2%80%93idf#Inverse_document_frequency).\n",
|
||||||
|
"\n",
|
||||||
|
"Together these terms become [**tf-idf**](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)."
|
||||||
|
]
|
||||||
|
},
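The formulas above are easy to verify by hand on this two-document corpus. The following sketch is not part of the original notebook; it is a minimal illustration that reuses the `vocab`, `one` and `two` objects built earlier, and the helper names (`counts`, `df`, `tfidf`) are our own:

# Illustrative sketch: compute tf and tf-idf by hand for the two toy vectors.
from math import log

counts = {'1.txt': one[1:], '2.txt': two[1:]}   # drop the filename slot from each vector
N = len(counts)                                 # number of documents in the corpus

# document frequency: how many documents contain each vocabulary slot?
df = [sum(1 for vec in counts.values() if vec[i] > 0) for i in range(len(vocab))]

for name, vec in counts.items():
    total = sum(vec)                            # total words in this document
    tf = [c / total for c in vec]               # term frequency
    tfidf = [tf[i] * log(N / df[i]) if df[i] else 0.0 for i in range(len(vec))]
    print(name, [round(v, 3) for v in tfidf])

Words that occur in both documents end up with a tf-idf of zero here (log(2/2) = 0), while words unique to one document keep a positive weight, which is exactly the down-scaling effect described above.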
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Stop Words and Word Stems\n",
|
||||||
|
"Some words like \"the\" and \"and\" appear so frequently, and in so many documents, that we needn't bother counting them. Also, it may make sense to only record the root of a word, say `cat` in place of both `cat` and `cats`. This will shrink our vocab array and improve performance."
|
||||||
|
]
|
||||||
|
},
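As a rough illustration (our own addition, not part of the original notebook), here is what a naive stop-word filter and a crude "stemmer" might look like; the tiny stop-word set and the plural-stripping rule are our own simplifications, and real libraries use far more sophisticated rules:

# Rough sketch only: naive stop-word removal and crude "stemming".
stop_words = {'this', 'is', 'a', 'about', 'are', 'our'}   # tiny hand-picked set

def crude_stem(word):
    # strip a trailing 's' so 'cats' -> 'cat'; real stemmers are much smarter
    return word[:-1] if word.endswith('s') and len(word) > 3 else word

with open('1.txt') as f:
    tokens = f.read().lower().split()

filtered = [crude_stem(w) for w in tokens if w not in stop_words]
print(filtered)   # ['story', 'cat', 'feline', 'pet', 'cat', 'furry', 'animal']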
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Tokenization and Tagging\n",
|
||||||
|
"When we created our vectors the first thing we did was split the incoming text on whitespace with `.split()`. This was a crude form of *tokenization* - that is, dividing a document into individual words. In this simple example we didn't worry about punctuation or different parts of speech. In the real world we rely on some fairly sophisticated *morphology* to parse text appropriately.\n",
|
||||||
|
"\n",
|
||||||
|
"Once the text is divided, we can go back and *tag* our tokens with information about parts of speech, grammatical dependencies, etc. This adds more dimensions to our data and enables a deeper understanding of the context of specific documents. For this reason, vectors become ***high dimensional sparse matrices***."
|
||||||
|
]
|
||||||
|
},
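A quick illustration of why whitespace splitting is crude (this snippet is our own addition, not from the original notebook): punctuation stays glued to the neighbouring word, so the same term shows up in several forms.

# Naive whitespace tokenization leaves punctuation attached to words:
text = "Cats are furry animals. Cats, however, are not dogs!"
print(text.lower().split())
# ['cats', 'are', 'furry', 'animals.', 'cats,', 'however,', 'are', 'not', 'dogs!']
# 'animals.' and 'cats,' would become separate vocabulary entries from
# 'animals' and 'cats' - a real tokenizer handles this for us.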
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<div class=\"alert alert-info\" style=\"margin: 20px\">**That's the end of the first section.**\n",
|
||||||
|
"<br>In the next section we'll use scikit-learn to perform a real-life analysis.</div>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"# Feature Extraction from Text\n",
|
||||||
|
"In the **Scikit-learn Primer** lecture we applied a simple SVC classification model to the SMSSpamCollection dataset. We tried to predict the ham/spam label based on message length and punctuation counts. In this section we'll actually look at the text of each message and try to perform a classification based on content. We'll take advantage of some of scikit-learn's [feature extraction](https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction) tools.\n",
|
||||||
|
"\n",
|
||||||
|
"## Load a dataset"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style>\n",
|
||||||
|
" .dataframe thead tr:only-child th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: left;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>label</th>\n",
|
||||||
|
" <th>message</th>\n",
|
||||||
|
" <th>length</th>\n",
|
||||||
|
" <th>punct</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>ham</td>\n",
|
||||||
|
" <td>Go until jurong point, crazy.. Available only ...</td>\n",
|
||||||
|
" <td>111</td>\n",
|
||||||
|
" <td>9</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>ham</td>\n",
|
||||||
|
" <td>Ok lar... Joking wif u oni...</td>\n",
|
||||||
|
" <td>29</td>\n",
|
||||||
|
" <td>6</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>spam</td>\n",
|
||||||
|
" <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n",
|
||||||
|
" <td>155</td>\n",
|
||||||
|
" <td>6</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>ham</td>\n",
|
||||||
|
" <td>U dun say so early hor... U c already then say...</td>\n",
|
||||||
|
" <td>49</td>\n",
|
||||||
|
" <td>6</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>ham</td>\n",
|
||||||
|
" <td>Nah I don't think he goes to usf, he lives aro...</td>\n",
|
||||||
|
" <td>61</td>\n",
|
||||||
|
" <td>2</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" label message length punct\n",
|
||||||
|
"0 ham Go until jurong point, crazy.. Available only ... 111 9\n",
|
||||||
|
"1 ham Ok lar... Joking wif u oni... 29 6\n",
|
||||||
|
"2 spam Free entry in 2 a wkly comp to win FA Cup fina... 155 6\n",
|
||||||
|
"3 ham U dun say so early hor... U c already then say... 49 6\n",
|
||||||
|
"4 ham Nah I don't think he goes to usf, he lives aro... 61 2"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Perform imports and load the dataset:\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"\n",
|
||||||
|
"df = pd.read_csv('../TextFiles/smsspamcollection.tsv', sep='\\t')\n",
|
||||||
|
"df.head()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Check for missing values:\n",
|
||||||
|
"Always a good practice."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"label 0\n",
|
||||||
|
"message 0\n",
|
||||||
|
"length 0\n",
|
||||||
|
"punct 0\n",
|
||||||
|
"dtype: int64"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df.isnull().sum()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Take a quick look at the *ham* and *spam* `label` column:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"ham 4825\n",
|
||||||
|
"spam 747\n",
|
||||||
|
"Name: label, dtype: int64"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df['label'].value_counts()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>4825 out of 5572 messages, or 86.6%, are ham. This means that any text classification model we create has to perform **better than 86.6%** to beat random chance.</font>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Split the data into train & test sets:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
|
"\n",
|
||||||
|
"X = df['message'] # this time we want to look at the text\n",
|
||||||
|
"y = df['label']\n",
|
||||||
|
"\n",
|
||||||
|
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Scikit-learn's CountVectorizer\n",
|
||||||
|
"Text preprocessing, tokenizing and the ability to filter out stopwords are all included in [CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html), which builds a dictionary of features and transforms documents to feature vectors."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"(3733, 7082)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.feature_extraction.text import CountVectorizer\n",
|
||||||
|
"count_vect = CountVectorizer()\n",
|
||||||
|
"\n",
|
||||||
|
"X_train_counts = count_vect.fit_transform(X_train)\n",
|
||||||
|
"X_train_counts.shape"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>This shows that our training set is comprised of 3733 documents, and 7082 features.</font>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Transform Counts to Frequencies with Tf-idf\n",
|
||||||
|
"While counting words is helpful, longer documents will have higher average count values than shorter documents, even though they might talk about the same topics.\n",
|
||||||
|
"\n",
|
||||||
|
"To avoid this we can simply divide the number of occurrences of each word in a document by the total number of words in the document: these new features are called **tf** for Term Frequencies.\n",
|
||||||
|
"\n",
|
||||||
|
"Another refinement on top of **tf** is to downscale weights for words that occur in many documents in the corpus and are therefore less informative than those that occur only in a smaller portion of the corpus.\n",
|
||||||
|
"\n",
|
||||||
|
"This downscaling is called **tf–idf** for “Term Frequency times Inverse Document Frequency”.\n",
|
||||||
|
"\n",
|
||||||
|
"Both tf and tf–idf can be computed as follows using [TfidfTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html):"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"(3733, 7082)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 14,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.feature_extraction.text import TfidfTransformer\n",
|
||||||
|
"tfidf_transformer = TfidfTransformer()\n",
|
||||||
|
"\n",
|
||||||
|
"X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)\n",
|
||||||
|
"X_train_tfidf.shape"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Note: the `fit_transform()` method actually performs two operations: it fits an estimator to the data and then transforms our count-matrix to a tf-idf representation."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Combine Steps with TfidVectorizer\n",
|
||||||
|
"In the future, we can combine the CountVectorizer and TfidTransformer steps into one using [TfidVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html):"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 15,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"(3733, 7082)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 15,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||||
|
"vectorizer = TfidfVectorizer()\n",
|
||||||
|
"\n",
|
||||||
|
"X_train_tfidf = vectorizer.fit_transform(X_train) # remember to use the original X_train set\n",
|
||||||
|
"X_train_tfidf.shape"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Train a Classifier\n",
|
||||||
|
"Here we'll introduce an SVM classifier that's similar to SVC, called [LinearSVC](https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html). LinearSVC handles sparse input better, and scales well to large numbers of samples."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 16,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n",
|
||||||
|
" intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n",
|
||||||
|
" multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n",
|
||||||
|
" verbose=0)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 16,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.svm import LinearSVC\n",
|
||||||
|
"clf = LinearSVC()\n",
|
||||||
|
"clf.fit(X_train_tfidf,y_train)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>Earlier we named our SVC classifier **svc_model**. Here we're using the more generic name **clf** (for classifier).</font>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Build a Pipeline\n",
|
||||||
|
"Remember that only our training set has been vectorized into a full vocabulary. In order to perform an analysis on our test set we'll have to submit it to the same procedures. Fortunately scikit-learn offers a [**Pipeline**](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html) class that behaves like a compound classifier."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 17,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"Pipeline(memory=None,\n",
|
||||||
|
" steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
|
||||||
|
" dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',\n",
|
||||||
|
" lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
|
||||||
|
" ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,\n",
|
||||||
|
" multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n",
|
||||||
|
" verbose=0))])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 17,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.pipeline import Pipeline\n",
|
||||||
|
"# from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||||
|
"# from sklearn.svm import LinearSVC\n",
|
||||||
|
"\n",
|
||||||
|
"text_clf = Pipeline([('tfidf', TfidfVectorizer()),\n",
|
||||||
|
" ('clf', LinearSVC()),\n",
|
||||||
|
"])\n",
|
||||||
|
"\n",
|
||||||
|
"# Feed the training data through the pipeline\n",
|
||||||
|
"text_clf.fit(X_train, y_train) "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Test the classifier and display results"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 18,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Form a prediction set\n",
|
||||||
|
"predictions = text_clf.predict(X_test)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 19,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[[1586 7]\n",
|
||||||
|
" [ 12 234]]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Report the confusion matrix\n",
|
||||||
|
"from sklearn import metrics\n",
|
||||||
|
"print(metrics.confusion_matrix(y_test,predictions))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 20,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" precision recall f1-score support\n",
|
||||||
|
"\n",
|
||||||
|
" ham 0.99 1.00 0.99 1593\n",
|
||||||
|
" spam 0.97 0.95 0.96 246\n",
|
||||||
|
"\n",
|
||||||
|
" micro avg 0.99 0.99 0.99 1839\n",
|
||||||
|
" macro avg 0.98 0.97 0.98 1839\n",
|
||||||
|
"weighted avg 0.99 0.99 0.99 1839\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Print a classification report\n",
|
||||||
|
"print(metrics.classification_report(y_test,predictions))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 21,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"0.989668297988\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Print the overall accuracy\n",
|
||||||
|
"print(metrics.accuracy_score(y_test,predictions))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Using the text of the messages, our model performed exceedingly well; it correctly predicted spam **98.97%** of the time!<br>\n",
|
||||||
|
"Now let's apply what we've learned to a text classification project involving positive and negative movie reviews.\n",
|
||||||
|
"\n",
|
||||||
|
"## Next up: Text Classification Project"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.12.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 4
|
||||||
|
}
|
||||||
File diff suppressed because it is too large
@ -0,0 +1,229 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Text Classification Assessment\n",
|
||||||
|
"This assessment is very much like the Text Classification Project we just completed, and the dataset is very similar.\n",
|
||||||
|
"\n",
|
||||||
|
"The **moviereviews2.tsv** dataset contains the text of 6000 movie reviews. 3000 are positive, 3000 are negative, and the text has been preprocessed as a tab-delimited file. As before, labels are given as `pos` and `neg`.\n",
|
||||||
|
"\n",
|
||||||
|
"We've included 20 reviews that contain either `NaN` data, or have strings made up of whitespace.\n",
|
||||||
|
"\n",
|
||||||
|
"For more information on this dataset visit http://ai.stanford.edu/~amaas/data/sentiment/"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Task #1: Perform imports and load the dataset into a pandas DataFrame\n",
|
||||||
|
"For this exercise you can load the dataset from `'../TextFiles/moviereviews2.tsv'`."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Task #2: Check for missing values:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Check for NaN values:\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Check for whitespace strings (it's OK if there aren't any!):\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Task #3: Remove NaN values:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Task #4: Take a quick look at the `label` column:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Task #5: Split the data into train & test sets:\n",
|
||||||
|
"You may use whatever settings you like. To compare your results to the solution notebook, use `test_size=0.33, random_state=42`"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Task #6: Build a pipeline to vectorize the date, then train and fit a model\n",
|
||||||
|
"You may use whatever model you like. To compare your results to the solution notebook, use `LinearSVC`."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Task #7: Run predictions and analyze the results"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Form a prediction set\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Report the confusion matrix\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Print a classification report\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Print the overall accuracy\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Great job!"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
@ -0,0 +1,397 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Text Classification Assessment - Solution\n",
|
||||||
|
"This assessment is very much like the Text Classification Project we just completed, and the dataset is very similar.\n",
|
||||||
|
"\n",
|
||||||
|
"The **moviereviews2.tsv** dataset contains the text of 6000 movie reviews. 3000 are positive, 3000 are negative, and the text has been preprocessed as a tab-delimited file. As before, labels are given as `pos` and `neg`. \n",
|
||||||
|
"\n",
|
||||||
|
"We've included 20 reviews that contain either `NaN` data, or have strings made up of whitespace.\n",
|
||||||
|
"\n",
|
||||||
|
"For more information on this dataset visit http://ai.stanford.edu/~amaas/data/sentiment/"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Task #1: Perform imports and load the dataset into a pandas DataFrame\n",
|
||||||
|
"For this exercise you can load the dataset from `'../TextFiles/moviereviews2.tsv'`."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style>\n",
|
||||||
|
" .dataframe thead tr:only-child th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: left;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>label</th>\n",
|
||||||
|
" <th>review</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>I loved this movie and will watch it again. Or...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>A warm, touching movie that has a fantasy-like...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>I was not expecting the powerful filmmaking ex...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>neg</td>\n",
|
||||||
|
" <td>This so-called \"documentary\" tries to tell tha...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>This show has been my escape from reality for ...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" label review\n",
|
||||||
|
"0 pos I loved this movie and will watch it again. Or...\n",
|
||||||
|
"1 pos A warm, touching movie that has a fantasy-like...\n",
|
||||||
|
"2 pos I was not expecting the powerful filmmaking ex...\n",
|
||||||
|
"3 neg This so-called \"documentary\" tries to tell tha...\n",
|
||||||
|
"4 pos This show has been my escape from reality for ..."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"\n",
|
||||||
|
"df = pd.read_csv('../TextFiles/moviereviews2.tsv', sep='\\t')\n",
|
||||||
|
"df.head()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Task #2: Check for missing values:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"label 0\n",
|
||||||
|
"review 20\n",
|
||||||
|
"dtype: int64"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Check for NaN values:\n",
|
||||||
|
"df.isnull().sum()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"0"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Check for whitespace strings (it's OK if there aren't any!):\n",
|
||||||
|
"blanks = [] # start with an empty list\n",
|
||||||
|
"\n",
|
||||||
|
"for i,lb,rv in df.itertuples(): # iterate over the DataFrame\n",
|
||||||
|
" if type(rv)==str: # avoid NaN values\n",
|
||||||
|
" if rv.isspace(): # test 'review' for whitespace\n",
|
||||||
|
" blanks.append(i) # add matching index numbers to the list\n",
|
||||||
|
" \n",
|
||||||
|
"len(blanks)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Task #3: Remove NaN values:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"df.dropna(inplace=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Task #4: Take a quick look at the `label` column:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"pos 2990\n",
|
||||||
|
"neg 2990\n",
|
||||||
|
"Name: label, dtype: int64"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df['label'].value_counts()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Task #5: Split the data into train & test sets:\n",
|
||||||
|
"You may use whatever settings you like. To compare your results to the solution notebook, use `test_size=0.33, random_state=42`"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
|
"\n",
|
||||||
|
"X = df['review']\n",
|
||||||
|
"y = df['label']\n",
|
||||||
|
"\n",
|
||||||
|
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Task #6: Build a pipeline to vectorize the date, then train and fit a model\n",
|
||||||
|
"You may use whatever model you like. To compare your results to the solution notebook, use `LinearSVC`."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"Pipeline(memory=None,\n",
|
||||||
|
" steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
|
||||||
|
" dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',\n",
|
||||||
|
" lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
|
||||||
|
" ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,\n",
|
||||||
|
" multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n",
|
||||||
|
" verbose=0))])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.pipeline import Pipeline\n",
|
||||||
|
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||||
|
"from sklearn.svm import LinearSVC\n",
|
||||||
|
"\n",
|
||||||
|
"text_clf = Pipeline([('tfidf', TfidfVectorizer()),\n",
|
||||||
|
" ('clf', LinearSVC()),\n",
|
||||||
|
"])\n",
|
||||||
|
"\n",
|
||||||
|
"# Feed the training data through the pipeline\n",
|
||||||
|
"text_clf.fit(X_train, y_train) "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Task #7: Run predictions and analyze the results"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Form a prediction set\n",
|
||||||
|
"predictions = text_clf.predict(X_test)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[[900 91]\n",
|
||||||
|
" [ 63 920]]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Report the confusion matrix\n",
|
||||||
|
"from sklearn import metrics\n",
|
||||||
|
"print(metrics.confusion_matrix(y_test,predictions))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" precision recall f1-score support\n",
|
||||||
|
"\n",
|
||||||
|
" neg 0.93 0.91 0.92 991\n",
|
||||||
|
" pos 0.91 0.94 0.92 983\n",
|
||||||
|
"\n",
|
||||||
|
" micro avg 0.92 0.92 0.92 1974\n",
|
||||||
|
" macro avg 0.92 0.92 0.92 1974\n",
|
||||||
|
"weighted avg 0.92 0.92 0.92 1974\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Print a classification report\n",
|
||||||
|
"print(metrics.classification_report(y_test,predictions))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"0.921985815603\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Print the overall accuracy\n",
|
||||||
|
"print(metrics.accuracy_score(y_test,predictions))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Great job!"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
3
Praktikum Python Code/03-Text-Classification/1.txt
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
This is a story about cats
|
||||||
|
our feline pets
|
||||||
|
Cats are furry animals
|
||||||
3
Praktikum Python Code/03-Text-Classification/2.txt
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
This story is about surfing
|
||||||
|
Catching waves is fun
|
||||||
|
Surfing is a popular water sport
|
||||||
@ -0,0 +1,623 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Semantics and Word Vectors\n",
|
||||||
|
"Sometimes called \"opinion mining\", [Wikipedia](https://en.wikipedia.org/wiki/Sentiment_analysis) defines ***sentiment analysis*** as\n",
|
||||||
|
"<div class=\"alert alert-info\" style=\"margin: 20px\">\"the use of natural language processing ... to systematically identify, extract, quantify, and study affective states and subjective information.<br>\n",
|
||||||
|
"Generally speaking, sentiment analysis aims to determine the attitude of a speaker, writer, or other subject with respect to some topic or the overall contextual polarity or emotional reaction to a document, interaction, or event.\"</div>\n",
|
||||||
|
"\n",
|
||||||
|
"Up to now we've used the occurrence of specific words and word patterns to perform test classifications. In this section we'll take machine learning even further, and try to extract intended meanings from complex phrases. Some simple examples include:\n",
|
||||||
|
"* Python is relatively easy to learn.\n",
|
||||||
|
"* That was the worst movie I've ever seen.\n",
|
||||||
|
"\n",
|
||||||
|
"However, things get harder with phrases like:\n",
|
||||||
|
"* I do not dislike green eggs and ham. (requires negation handling)\n",
|
||||||
|
"\n",
|
||||||
|
"The way this is done is through complex machine learning algorithms like [word2vec](https://en.wikipedia.org/wiki/Word2vec). The idea is to create numerical arrays, or *word embeddings* for every word in a large corpus. Each word is assigned its own vector in such a way that words that frequently appear together in the same context are given vectors that are close together. The result is a model that may not know that a \"lion\" is an animal, but does know that \"lion\" is closer in context to \"cat\" than \"dandelion\".\n",
|
||||||
|
"\n",
|
||||||
|
"It is important to note that *building* useful models takes a long time - hours or days to train a large corpus - and that for our purposes it is best to import an existing model rather than take the time to train our own.\n"
|
||||||
|
]
|
||||||
|
},
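As a small, hedged illustration (our own sketch, not a cell from the original notebook), spaCy exposes these embeddings directly, so the "lion"/"cat"/"dandelion" intuition can be checked with the `similarity()` method once a vector-bearing model such as **en_core_web_md** is installed (installation is covered in the next section):

# Sketch: compare word-vector similarity (requires a model with vectors,
# e.g. en_core_web_md - see the installation notes below).
import spacy
nlp = spacy.load('en_core_web_md')

lion, cat, dandelion = nlp(u'lion cat dandelion')

print(lion.similarity(cat))        # relatively high - the words share contexts
print(lion.similarity(dandelion))  # much lower - the words rarely co-occur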
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"# Installing Larger spaCy Models\n",
|
||||||
|
"Up to now we've been using spaCy's smallest English language model, [**en_core_web_sm**](https://spacy.io/models/en#en_core_web_sm) (35MB), which provides vocabulary, syntax, and entities, but not vectors. To take advantage of built-in word vectors we'll need a larger library. We have a few options:\n",
|
||||||
|
"> [**en_core_web_md**](https://spacy.io/models/en#en_core_web_md) (116MB) Vectors: 685k keys, 20k unique vectors (300 dimensions)\n",
|
||||||
|
"> <br>or<br>\n",
|
||||||
|
"> [**en_core_web_lg**](https://spacy.io/models/en#en_core_web_lg) (812MB) Vectors: 685k keys, 685k unique vectors (300 dimensions)\n",
|
||||||
|
"\n",
|
||||||
|
"If you plan to rely heavily on word vectors, consider using spaCy's largest vector library containing over one million unique vectors:\n",
|
||||||
|
"> [**en_vectors_web_lg**](https://spacy.io/models/en#en_vectors_web_lg) (631MB) Vectors: 1.1m keys, 1.1m unique vectors (300 dimensions)\n",
|
||||||
|
"\n",
|
||||||
|
"For our purposes **en_core_web_md** should suffice.\n",
|
||||||
|
"\n",
|
||||||
|
"### From the command line (you must run this as admin or use sudo):\n",
|
||||||
|
"\n",
|
||||||
|
"> `activate spacyenv` *if using a virtual environment* \n",
|
||||||
|
"> \n",
|
||||||
|
"> `python -m spacy download en_core_web_md` \n",
|
||||||
|
"> `python -m spacy download en_core_web_lg`   *optional library* \n",
|
||||||
|
"> `python -m spacy download en_vectors_web_lg` *optional library* \n",
|
||||||
|
"\n",
|
||||||
|
"> ### If successful, you should see a message like: \n",
|
||||||
|
"> <tt><br>\n",
|
||||||
|
"> **Linking successful**<br>\n",
|
||||||
|
"> C:\\Anaconda3\\envs\\spacyenv\\lib\\site-packages\\en_core_web_md --><br>\n",
|
||||||
|
"> C:\\Anaconda3\\envs\\spacyenv\\lib\\site-packages\\spacy\\data\\en_core_web_md<br>\n",
|
||||||
|
"> <br>\n",
|
||||||
|
"> You can now load the model via spacy.load('en_core_web_md')</tt>\n",
|
||||||
|
"\n",
|
||||||
|
"<font color=green>Of course, we have a third option, and that is to train our own vectors from a large corpus of documents. Unfortunately this would take a prohibitively large amount of time and processing power.</font> "
|
||||||
|
]
|
||||||
|
},
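As an aside, the model can also be fetched from within Python rather than the command line. This is only a sketch, not part of the original notebook, and it assumes spaCy's `spacy.cli.download` helper is available (it may require restarting the kernel after the first download):

# Load en_core_web_md, downloading it first if it isn't installed yet
import spacy

try:
    nlp = spacy.load('en_core_web_md')
except OSError:
    from spacy.cli import download
    download('en_core_web_md')   # fetches and links the model package
    nlp = spacy.load('en_core_web_md')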
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"# Word Vectors\n",
|
||||||
|
"Word vectors - also called *word embeddings* - are mathematical descriptions of individual words such that words that appear frequently together in the language will have similar values. In this way we can mathematically derive *context*. As mentioned above, the word vector for \"lion\" will be closer in value to \"cat\" than to \"dandelion\"."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Vector values\n",
|
||||||
|
"So what does a word vector look like? Since spaCy employs 300 dimensions, word vectors are stored as 300-item arrays.\n",
|
||||||
|
"\n",
|
||||||
|
"Note that we would see the same set of values with **en_core_web_md** and **en_core_web_lg**, as both were trained using the [word2vec](https://en.wikipedia.org/wiki/Word2vec) family of algorithms."
|
||||||
|
]
|
||||||
|
},
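As a quick check of the 300-dimension claim above, here is a minimal sketch (assuming `nlp` is the loaded **en_core_web_md** model, as in the next cell):

# Confirm that a single token's vector is a 300-item float array
vec = nlp(u'lion').vector
print(vec.shape)   # (300,)
print(vec.dtype)   # float32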
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Import spaCy and load the language library\n",
|
||||||
|
"import spacy\n",
|
||||||
|
"nlp = spacy.load('en_core_web_md') # make sure to use a larger model!"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {
|
||||||
|
"scrolled": true
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"array([ 1.89630002e-01, -4.03090000e-01, 3.53500009e-01,\n",
|
||||||
|
" -4.79070008e-01, -4.33109999e-01, 2.38570005e-01,\n",
|
||||||
|
" 2.69620001e-01, 6.43320009e-02, 3.07669997e-01,\n",
|
||||||
|
" 1.37119997e+00, -3.75820011e-01, -2.27129996e-01,\n",
|
||||||
|
" -3.56570005e-01, -2.53549993e-01, 1.75429992e-02,\n",
|
||||||
|
" 3.39619994e-01, 7.47229978e-02, 5.12260020e-01,\n",
|
||||||
|
" -3.97590011e-01, 5.13330009e-03, -3.09289992e-01,\n",
|
||||||
|
" 4.89110015e-02, -1.86100006e-01, -4.17019993e-01,\n",
|
||||||
|
" -8.16389978e-01, -1.69080004e-01, -2.62459993e-01,\n",
|
||||||
|
" -1.59830004e-02, 1.24789998e-01, -3.72759998e-02,\n",
|
||||||
|
" -5.71250021e-01, -1.62959993e-01, 1.23760000e-01,\n",
|
||||||
|
" -5.54639995e-02, 1.32440001e-01, 2.75190007e-02,\n",
|
||||||
|
" 1.25919998e-01, -3.27219993e-01, -4.91649985e-01,\n",
|
||||||
|
" -3.55589986e-01, -3.06300014e-01, 6.11849986e-02,\n",
|
||||||
|
" -1.69320002e-01, -6.24050014e-02, 6.57630026e-01,\n",
|
||||||
|
" -2.79249996e-01, -3.04499990e-03, -2.23999992e-02,\n",
|
||||||
|
" -2.80149996e-01, -2.19750002e-01, -4.31879997e-01,\n",
|
||||||
|
" 3.98639999e-02, -2.21019998e-01, -4.26930003e-02,\n",
|
||||||
|
" 5.27479984e-02, 2.87259996e-01, 1.23149998e-01,\n",
|
||||||
|
" -2.86619999e-02, 7.82940015e-02, 4.67539996e-01,\n",
|
||||||
|
" -2.45890006e-01, -1.10639997e-01, 7.22500011e-02,\n",
|
||||||
|
" -9.49800014e-02, -2.75480002e-01, -5.40970027e-01,\n",
|
||||||
|
" 1.28230006e-01, -8.24080035e-02, 3.10350001e-01,\n",
|
||||||
|
" -6.33940026e-02, -7.37550020e-01, -5.49920022e-01,\n",
|
||||||
|
" 9.99990031e-02, -2.07580000e-01, -3.96739990e-02,\n",
|
||||||
|
" 2.06640005e-01, -9.75570008e-02, -3.70920002e-01,\n",
|
||||||
|
" 2.79009998e-01, -6.22179985e-01, -1.02799997e-01,\n",
|
||||||
|
" 2.32710004e-01, 4.38380003e-01, 3.24449986e-02,\n",
|
||||||
|
" -2.98660010e-01, -7.36109987e-02, 7.15939999e-01,\n",
|
||||||
|
" 1.42409995e-01, 2.77700007e-01, -3.98920000e-01,\n",
|
||||||
|
" 3.66559997e-02, 1.57590002e-01, 8.20140019e-02,\n",
|
||||||
|
" -5.73430002e-01, 3.54570001e-01, 2.24910006e-01,\n",
|
||||||
|
" -6.26990020e-01, -8.81059989e-02, 2.43609995e-01,\n",
|
||||||
|
" 3.85329992e-01, -1.40829995e-01, 1.76909998e-01,\n",
|
||||||
|
" 7.08969980e-02, 1.79509997e-01, -4.59069997e-01,\n",
|
||||||
|
" -8.21200013e-01, -2.66309995e-02, 6.25490025e-02,\n",
|
||||||
|
" 4.24149990e-01, -8.96300003e-02, -2.46539995e-01,\n",
|
||||||
|
" 1.41560003e-01, 4.01870012e-01, -4.12319988e-01,\n",
|
||||||
|
" 8.45159963e-02, -1.06260002e-01, 7.31450021e-01,\n",
|
||||||
|
" 1.92169994e-01, 1.42399997e-01, 2.85109997e-01,\n",
|
||||||
|
" -2.94539988e-01, -2.19479993e-01, 9.04600024e-01,\n",
|
||||||
|
" -1.90980002e-01, -1.03400004e+00, -1.57539994e-01,\n",
|
||||||
|
" -1.19640000e-01, 4.98879999e-01, -1.06239998e+00,\n",
|
||||||
|
" -3.28200012e-01, -1.12319998e-02, -7.94820011e-01,\n",
|
||||||
|
" 3.72750014e-01, -6.87099993e-03, -2.57719994e-01,\n",
|
||||||
|
" -4.70050007e-01, -4.13870007e-01, -6.40890002e-02,\n",
|
||||||
|
" -2.80330002e-01, -4.07779999e-02, -2.48659992e+00,\n",
|
||||||
|
" 6.24939986e-03, -1.02100000e-02, 1.27519995e-01,\n",
|
||||||
|
" 3.49649996e-01, -1.25709996e-01, 3.15699995e-01,\n",
|
||||||
|
" 4.19259995e-01, 2.00560004e-01, -5.59840024e-01,\n",
|
||||||
|
" -2.28009999e-01, 1.20119996e-01, -2.05180002e-03,\n",
|
||||||
|
" -8.97639990e-02, -8.03729966e-02, 1.19690001e-02,\n",
|
||||||
|
" -2.69780010e-01, 3.48289996e-01, 7.36640021e-03,\n",
|
||||||
|
" -1.11369997e-01, 6.34100020e-01, 3.84490013e-01,\n",
|
||||||
|
" -6.22479975e-01, 4.11450006e-02, 2.59220004e-01,\n",
|
||||||
|
" 6.58110023e-01, -4.95480001e-01, -1.30300000e-01,\n",
|
||||||
|
" -3.82789999e-01, 1.11560002e-01, -4.30849999e-01,\n",
|
||||||
|
" 3.44729990e-01, 2.71090008e-02, -2.51080006e-01,\n",
|
||||||
|
" -2.80110002e-01, 2.16619998e-01, 3.26599985e-01,\n",
|
||||||
|
" 5.58950007e-02, 7.60769993e-02, -5.24800010e-02,\n",
|
||||||
|
" 4.59280014e-02, -2.52660006e-01, 5.28450012e-01,\n",
|
||||||
|
" -1.31449997e-01, -1.24530002e-01, 4.05559987e-01,\n",
|
||||||
|
" 3.18769991e-01, 2.44149994e-02, -2.26199999e-01,\n",
|
||||||
|
" -6.19599998e-01, -4.08859998e-01, -3.55339982e-02,\n",
|
||||||
|
" -5.51229995e-03, 2.34380007e-01, 8.78539979e-01,\n",
|
||||||
|
" -2.51610011e-01, 4.05999988e-01, -4.42840010e-01,\n",
|
||||||
|
" 3.49339992e-01, -5.64289987e-01, -2.36760005e-01,\n",
|
||||||
|
" 6.21990025e-01, -2.81749994e-01, 4.20240015e-01,\n",
|
||||||
|
" 1.00429997e-01, -1.47200003e-01, 4.95929986e-01,\n",
|
||||||
|
" -3.58500004e-01, -1.39980003e-01, -2.74940014e-01,\n",
|
||||||
|
" 2.38270000e-01, 5.72679996e-01, 7.90250003e-02,\n",
|
||||||
|
" 1.78720001e-02, -2.18290001e-01, 5.50500005e-02,\n",
|
||||||
|
" -5.41999996e-01, 1.67879999e-01, 3.90650004e-01,\n",
|
||||||
|
" 3.02089989e-01, 2.30399996e-01, -3.93510014e-02,\n",
|
||||||
|
" -2.10779995e-01, -2.72240013e-01, 1.69070005e-01,\n",
|
||||||
|
" 5.48189998e-01, 9.48880017e-02, 7.97980011e-01,\n",
|
||||||
|
" -6.61579967e-02, 1.98440000e-01, 2.03070000e-01,\n",
|
||||||
|
" 4.48080003e-02, -1.02399997e-01, -6.99089989e-02,\n",
|
||||||
|
" -3.67560014e-02, 9.51590016e-02, -2.78299987e-01,\n",
|
||||||
|
" -1.05970003e-01, -1.62760004e-01, -1.82109997e-01,\n",
|
||||||
|
" -3.18969995e-01, -2.16330007e-01, 1.49939999e-01,\n",
|
||||||
|
" -7.20570013e-02, 2.22639993e-01, -4.55509990e-01,\n",
|
||||||
|
" 3.03409994e-01, 1.84310004e-01, 2.16810003e-01,\n",
|
||||||
|
" -3.19400012e-01, 2.64259994e-01, 5.81059992e-01,\n",
|
||||||
|
" 5.46349995e-02, 6.32380009e-01, 4.31690007e-01,\n",
|
||||||
|
" 9.03429985e-02, 1.94940001e-01, 3.54829997e-01,\n",
|
||||||
|
" -2.07059998e-02, -7.31169999e-01, 1.29409999e-01,\n",
|
||||||
|
" 1.74180001e-01, -1.50649995e-01, 5.33550009e-02,\n",
|
||||||
|
" 4.47940007e-02, -1.65999994e-01, 2.20070004e-01,\n",
|
||||||
|
" -5.39699972e-01, -2.49679998e-01, -2.64640003e-01,\n",
|
||||||
|
" -5.55149972e-01, 5.82419991e-01, 2.22949997e-01,\n",
|
||||||
|
" 2.44330004e-01, 4.52749997e-01, 3.46929997e-01,\n",
|
||||||
|
" 1.22550003e-01, -3.90589982e-02, -3.27490002e-01,\n",
|
||||||
|
" -2.78910011e-01, 1.37659997e-01, 3.83920014e-01,\n",
|
||||||
|
" 1.05430000e-03, -1.02420002e-02, 4.92049992e-01,\n",
|
||||||
|
" -1.79220006e-01, 4.12149988e-02, 1.35470003e-01,\n",
|
||||||
|
" -2.05980003e-01, -2.31940001e-01, -7.77010024e-01,\n",
|
||||||
|
" -3.82369995e-01, -7.63830006e-01, 1.94179997e-01,\n",
|
||||||
|
" -1.54410005e-01, 8.97400022e-01, 3.06259990e-01,\n",
|
||||||
|
" 4.03759986e-01, 2.17380002e-01, -3.80499989e-01], dtype=float32)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"nlp(u'lion').vector"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"What's interesting is that Doc and Span objects themselves have vectors, derived from the averages of individual token vectors. <br>This makes it possible to compare similarities between whole documents."
|
||||||
|
]
|
||||||
|
},
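The next cell prints a document vector; as a small added illustration (not in the original notebook), whole documents can also be compared directly with `.similarity()`, again assuming `nlp` is the loaded model:

# Doc objects average their token vectors, so entire documents can be compared
doc1 = nlp(u'I enjoy reading about cats and other pets.')
doc2 = nlp(u'Felines make wonderful companions.')
doc3 = nlp(u'Quarterly revenue exceeded expectations.')

print(doc1.similarity(doc2))   # expected to be relatively high - related topics
print(doc1.similarity(doc3))   # expected to be noticeably lower - unrelated topics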
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"array([ -1.96635887e-01, -2.32740352e-03, -5.36607020e-02,\n",
|
||||||
|
" -6.10564947e-02, -4.08843048e-02, 1.45266443e-01,\n",
|
||||||
|
" -1.08268000e-01, -6.27789786e-03, 1.48455709e-01,\n",
|
||||||
|
" 1.90697408e+00, -2.57692993e-01, -1.95818534e-03,\n",
|
||||||
|
" -1.16141019e-02, -1.62858292e-01, -1.62938282e-01,\n",
|
||||||
|
" 1.18210977e-02, 5.12646027e-02, 1.00078702e+00,\n",
|
||||||
|
" -2.01447997e-02, -2.54611671e-01, -1.28316596e-01,\n",
|
||||||
|
" -1.97198763e-02, -2.89733019e-02, -1.94347113e-01,\n",
|
||||||
|
" 1.26644447e-01, -8.69869068e-02, -2.20812604e-01,\n",
|
||||||
|
" -1.58452198e-01, 9.86308008e-02, -1.79210991e-01,\n",
|
||||||
|
" -1.55290633e-01, 1.95643142e-01, 2.66436003e-02,\n",
|
||||||
|
" -1.64984968e-02, 1.18824698e-01, -1.17830629e-03,\n",
|
||||||
|
" 4.99809943e-02, -4.23077159e-02, -3.86111848e-02,\n",
|
||||||
|
" -7.47400150e-03, 1.23448208e-01, 9.60620027e-03,\n",
|
||||||
|
" -3.32463719e-02, -1.77848607e-01, 1.19390726e-01,\n",
|
||||||
|
" 1.87545009e-02, -1.84173390e-01, 6.91781715e-02,\n",
|
||||||
|
" 1.28520593e-01, 1.48827005e-02, -1.78013414e-01,\n",
|
||||||
|
" 1.10003807e-01, -3.35464999e-02, -1.52476998e-02,\n",
|
||||||
|
" -9.41195935e-02, 1.58633105e-02, -1.29811959e-02,\n",
|
||||||
|
" 1.40140295e-01, -1.47720069e-01, -3.81718054e-02,\n",
|
||||||
|
" 4.66808230e-02, 3.31423879e-02, 7.97965974e-02,\n",
|
||||||
|
" 1.60014004e-01, 8.90410226e-03, -1.01237908e-01,\n",
|
||||||
|
" 7.39663988e-02, 2.47380026e-02, 4.26153988e-02,\n",
|
||||||
|
" 9.66729969e-02, 2.87616011e-02, 7.22841993e-02,\n",
|
||||||
|
" 1.76565602e-01, 7.55538046e-02, 1.10501610e-01,\n",
|
||||||
|
" -1.02358103e-01, -5.43345436e-02, -4.12176028e-02,\n",
|
||||||
|
" 3.98623049e-02, -2.98339734e-03, -5.32988012e-02,\n",
|
||||||
|
" 1.90624595e-01, -6.42587021e-02, -1.76225007e-02,\n",
|
||||||
|
" 3.94165330e-02, -1.14773512e-01, 4.25241649e-01,\n",
|
||||||
|
" 2.07243040e-01, 2.60730416e-01, 1.31226778e-01,\n",
|
||||||
|
" -8.00508037e-02, 6.88939020e-02, 7.05293044e-02,\n",
|
||||||
|
" -1.10744104e-01, 4.14580032e-02, 5.13269613e-03,\n",
|
||||||
|
" -1.29179001e-01, -5.84542975e-02, 9.13560018e-02,\n",
|
||||||
|
" -1.75975591e-01, 9.52741057e-02, 1.37699964e-02,\n",
|
||||||
|
" -1.30865201e-01, -4.76420000e-02, 1.61670998e-01,\n",
|
||||||
|
" -6.76959991e-01, 2.68619388e-01, -7.94106945e-02,\n",
|
||||||
|
" 8.56394917e-02, -5.94138019e-02, 7.44821057e-02,\n",
|
||||||
|
" -1.67490095e-01, 1.97447598e-01, -2.71580786e-01,\n",
|
||||||
|
" 1.51915969e-02, 1.12019002e-01, -4.32585999e-02,\n",
|
||||||
|
" -1.03554968e-02, 6.33272156e-02, 5.20200143e-03,\n",
|
||||||
|
" 4.94491048e-02, -1.07016601e-01, -6.45387918e-02,\n",
|
||||||
|
" -1.76269561e-01, -1.98135704e-01, 4.17800918e-02,\n",
|
||||||
|
" 1.23686995e-02, -1.13280594e-01, -4.03523073e-02,\n",
|
||||||
|
" -4.21132054e-03, -9.65667963e-02, -7.12300017e-02,\n",
|
||||||
|
" -2.19088510e-01, 6.41715974e-02, 1.11634992e-01,\n",
|
||||||
|
" -7.12868944e-02, -8.27060193e-02, 1.53889004e-02,\n",
|
||||||
|
" 6.84699565e-02, -5.50561920e-02, -1.84788990e+00,\n",
|
||||||
|
" -4.75010052e-02, 1.31487206e-01, 1.03359401e-01,\n",
|
||||||
|
" 1.80857688e-01, -8.03041980e-02, 2.27739997e-02,\n",
|
||||||
|
" 5.56868985e-02, 9.20986086e-02, 6.22248054e-02,\n",
|
||||||
|
" 4.86670025e-02, -4.06427011e-02, 3.83703932e-02,\n",
|
||||||
|
" -4.05869968e-02, -2.26339817e-01, 3.69174965e-02,\n",
|
||||||
|
" -1.30066186e-01, 1.27621710e-01, 2.76701003e-02,\n",
|
||||||
|
" -1.39992401e-01, -3.75526994e-02, -8.11104029e-02,\n",
|
||||||
|
" -1.78196102e-01, -1.21652998e-01, -5.88919744e-02,\n",
|
||||||
|
" -1.06128812e-01, -4.72390745e-03, -1.14130601e-01,\n",
|
||||||
|
" -7.60087445e-02, -9.48704034e-02, 1.68780401e-01,\n",
|
||||||
|
" 3.82669941e-02, -1.68303996e-01, -1.30991384e-01,\n",
|
||||||
|
" -2.46409744e-01, 1.42855030e-02, 1.23633012e-01,\n",
|
||||||
|
" 7.95699935e-03, -3.22283022e-02, 3.75844017e-02,\n",
|
||||||
|
" -4.48104031e-02, -2.00578898e-01, -2.86081016e-01,\n",
|
||||||
|
" -1.83181003e-01, -5.46899159e-04, 6.52990937e-02,\n",
|
||||||
|
" 2.34263036e-02, -7.60660022e-02, 1.13897599e-01,\n",
|
||||||
|
" -7.05380812e-02, 1.30277812e-01, 2.83973999e-02,\n",
|
||||||
|
" 1.73887815e-02, -1.71358977e-02, 1.78455990e-02,\n",
|
||||||
|
" 1.86773703e-01, 1.83613986e-01, -4.05438878e-02,\n",
|
||||||
|
" 1.28929759e-03, -3.71900201e-03, -1.97373003e-01,\n",
|
||||||
|
" 4.78463694e-02, -2.21408010e-01, 2.68826094e-02,\n",
|
||||||
|
" 2.40951017e-01, 7.42616802e-02, 7.53984973e-02,\n",
|
||||||
|
" -7.67349079e-02, -5.37766796e-03, -8.06540065e-03,\n",
|
||||||
|
" 1.88790001e-02, 8.31135064e-02, -5.20760007e-02,\n",
|
||||||
|
" 1.29393607e-01, 4.09864075e-02, 7.31946975e-02,\n",
|
||||||
|
" -1.64099425e-01, 1.17529690e-01, -6.96440935e-02,\n",
|
||||||
|
" 1.91028208e-01, 1.01721905e-01, 6.34808987e-02,\n",
|
||||||
|
" -8.29815865e-02, -6.95784390e-03, -1.69757873e-01,\n",
|
||||||
|
" -2.02478573e-01, 3.65395918e-02, 1.32345587e-01,\n",
|
||||||
|
" 3.53013016e-02, 2.27603033e-01, -1.52753398e-01,\n",
|
||||||
|
" 7.80210178e-03, 2.06879750e-02, -8.63540452e-03,\n",
|
||||||
|
" 9.85722095e-02, -2.91380938e-02, -1.42988954e-02,\n",
|
||||||
|
" -9.39018354e-02, 1.43968105e-01, 7.82396942e-02,\n",
|
||||||
|
" -1.93540990e-01, -9.36544985e-02, -8.23533013e-02,\n",
|
||||||
|
" 4.40272018e-02, -2.22195080e-03, -1.29856914e-01,\n",
|
||||||
|
" -1.53841600e-01, -1.55329984e-02, -2.55266696e-01,\n",
|
||||||
|
" 1.14425398e-01, -1.03161987e-02, -4.66439016e-02,\n",
|
||||||
|
" -5.69390282e-02, 7.72780031e-02, 1.28908500e-01,\n",
|
||||||
|
" 1.61679000e-01, 1.50837511e-01, 6.18334934e-02,\n",
|
||||||
|
" -9.06937942e-02, -3.52137014e-02, 1.35956988e-01,\n",
|
||||||
|
" 7.52059072e-02, 5.73905036e-02, -1.65402606e-01,\n",
|
||||||
|
" 1.68419987e-01, -1.83722824e-01, 5.91069926e-03,\n",
|
||||||
|
" -1.25354990e-01, 3.95771042e-02, 5.67352995e-02,\n",
|
||||||
|
" -5.63519308e-03, 1.53597593e-01, -6.84822723e-02,\n",
|
||||||
|
" -1.40976995e-01, -3.62732522e-02, -2.61475928e-02,\n",
|
||||||
|
" 2.50091963e-02, 1.18994810e-01, -2.66857035e-02,\n",
|
||||||
|
" 7.50442073e-02, 2.04583794e-01, 4.37736101e-02,\n",
|
||||||
|
" -8.17096978e-02, 6.80228025e-02, 5.50465994e-02,\n",
|
||||||
|
" -2.39979066e-02, 7.68290013e-02, -5.76773956e-02,\n",
|
||||||
|
" 8.30340981e-02, 3.63199934e-02, -1.65820405e-01,\n",
|
||||||
|
" 2.55408939e-02, -5.30679002e-02, -1.35961995e-01,\n",
|
||||||
|
" -1.03501797e-01, 1.36406809e-01, 9.66293067e-02,\n",
|
||||||
|
" 7.33902007e-02, -1.83055893e-01, -2.73141060e-02], dtype=float32)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"doc = nlp(u'The quick brown fox jumped over the lazy dogs.')\n",
|
||||||
|
"\n",
|
||||||
|
"doc.vector"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Identifying similar vectors\n",
|
||||||
|
"The best way to expose vector relationships is through the `.similarity()` method of Doc tokens."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {
|
||||||
|
"scrolled": true
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"lion lion 1.0\n",
|
||||||
|
"lion cat 0.526544\n",
|
||||||
|
"lion pet 0.399238\n",
|
||||||
|
"cat lion 0.526544\n",
|
||||||
|
"cat cat 1.0\n",
|
||||||
|
"cat pet 0.750546\n",
|
||||||
|
"pet lion 0.399238\n",
|
||||||
|
"pet cat 0.750546\n",
|
||||||
|
"pet pet 1.0\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Create a three-token Doc object:\n",
|
||||||
|
"tokens = nlp(u'lion cat pet')\n",
|
||||||
|
"\n",
|
||||||
|
"# Iterate through token combinations:\n",
|
||||||
|
"for token1 in tokens:\n",
|
||||||
|
" for token2 in tokens:\n",
|
||||||
|
" print(token1.text, token2.text, token1.similarity(token2))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<font color=green>Note that order doesn't matter. `token1.similarity(token2)` has the same value as `token2.similarity(token1)`.</font>\n",
|
||||||
|
"#### To view this as a table:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/markdown": [
|
||||||
|
"<table><tr><th></th><th>lion</th><th>cat</th><th>pet</th></tr><tr><td>**lion**</td><td>1.0</td><td>0.5265</td><td>0.3992</td></tr><tr><td>**cat**</td><td>0.5265</td><td>1.0</td><td>0.7505</td></tr><tr><td>**pet**</td><td>0.3992</td><td>0.7505</td><td>1.0</td></tr>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
"<IPython.core.display.Markdown object>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "display_data"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# For brevity, assign each token a name\n",
|
||||||
|
"a,b,c = tokens\n",
|
||||||
|
"\n",
|
||||||
|
"# Display as a Markdown table (this only works in Jupyter!)\n",
|
||||||
|
"from IPython.display import Markdown, display\n",
|
||||||
|
"display(Markdown(f'<table><tr><th></th><th>{a.text}</th><th>{b.text}</th><th>{c.text}</th></tr>\\\n",
|
||||||
|
"<tr><td>**{a.text}**</td><td>{a.similarity(a):{.4}}</td><td>{b.similarity(a):{.4}}</td><td>{c.similarity(a):{.4}}</td></tr>\\\n",
|
||||||
|
"<tr><td>**{b.text}**</td><td>{a.similarity(b):{.4}}</td><td>{b.similarity(b):{.4}}</td><td>{c.similarity(b):{.4}}</td></tr>\\\n",
|
||||||
|
"<tr><td>**{c.text}**</td><td>{a.similarity(c):{.4}}</td><td>{b.similarity(c):{.4}}</td><td>{c.similarity(c):{.4}}</td></tr>'))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"As expected, we see the strongest similarity between \"cat\" and \"pet\", the weakest between \"lion\" and \"pet\", and some similarity between \"lion\" and \"cat\". A word will have a perfect (1.0) similarity with itself.\n",
|
||||||
|
"\n",
|
||||||
|
"If you're curious, the similarity between \"lion\" and \"dandelion\" is very small:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"0.18064451829601527"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"nlp(u'lion').similarity(nlp(u'dandelion'))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Opposites are not necessarily different\n",
|
||||||
|
"Words that have opposite meaning, but that often appear in the same *context* may have similar vectors."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"like like 1.0\n",
|
||||||
|
"like love 0.657904\n",
|
||||||
|
"like hate 0.657465\n",
|
||||||
|
"love like 0.657904\n",
|
||||||
|
"love love 1.0\n",
|
||||||
|
"love hate 0.63931\n",
|
||||||
|
"hate like 0.657465\n",
|
||||||
|
"hate love 0.63931\n",
|
||||||
|
"hate hate 1.0\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Create a three-token Doc object:\n",
|
||||||
|
"tokens = nlp(u'like love hate')\n",
|
||||||
|
"\n",
|
||||||
|
"# Iterate through token combinations:\n",
|
||||||
|
"for token1 in tokens:\n",
|
||||||
|
" for token2 in tokens:\n",
|
||||||
|
" print(token1.text, token2.text, token1.similarity(token2))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Vector norms\n",
|
||||||
|
"It's sometimes helpful to aggregate 300 dimensions into a [Euclidian (L2) norm](https://en.wikipedia.org/wiki/Norm_%28mathematics%29#Euclidean_norm), computed as the square root of the sum-of-squared-vectors. This is accessible as the `.vector_norm` token attribute. Other helpful attributes include `.has_vector` and `.is_oov` or *out of vocabulary*.\n",
|
||||||
|
"\n",
|
||||||
|
"For example, our 685k vector library may not have the word \"[nargle](https://en.wikibooks.org/wiki/Muggles%27_Guide_to_Harry_Potter/Magic/Nargle)\". To test this:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"dog True 7.03367 False\n",
|
||||||
|
"cat True 6.68082 False\n",
|
||||||
|
"nargle False 0.0 True\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"tokens = nlp(u'dog cat nargle')\n",
|
||||||
|
"\n",
|
||||||
|
"for token in tokens:\n",
|
||||||
|
" print(token.text, token.has_vector, token.vector_norm, token.is_oov)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Indeed we see that \"nargle\" does not have a vector, so the vector_norm value is zero, and it identifies as *out of vocabulary*."
|
||||||
|
]
|
||||||
|
},
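To tie this back to the definition above, here is a small verification sketch (not in the original notebook) showing that `.vector_norm` is simply the L2 norm of the 300-dimensional vector:

import numpy as np

token = nlp(u'dog')[0]                       # the Token object for "dog"
manual_norm = np.sqrt((token.vector ** 2).sum())

print(token.vector_norm)   # e.g. 7.03367, as shown in the output above
print(manual_norm)         # should match to floating-point precision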
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Vector arithmetic\n",
|
||||||
|
"Believe it or not, we can actually calculate new vectors by adding & subtracting related vectors. A famous example suggests\n",
|
||||||
|
"<pre>\"king\" - \"man\" + \"woman\" = \"queen\"</pre>\n",
|
||||||
|
"Let's try it out!"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"['king', 'queen', 'commoner', 'highness', 'prince', 'sultan', 'maharajas', 'princes', 'kumbia', 'kings']\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from scipy import spatial\n",
|
||||||
|
"\n",
|
||||||
|
"cosine_similarity = lambda x, y: 1 - spatial.distance.cosine(x, y)\n",
|
||||||
|
"\n",
|
||||||
|
"king = nlp.vocab['king'].vector\n",
|
||||||
|
"man = nlp.vocab['man'].vector\n",
|
||||||
|
"woman = nlp.vocab['woman'].vector\n",
|
||||||
|
"\n",
|
||||||
|
"# Now we find the closest vector in the vocabulary to the result of \"man\" - \"woman\" + \"queen\"\n",
|
||||||
|
"new_vector = king - man + woman\n",
|
||||||
|
"computed_similarities = []\n",
|
||||||
|
"\n",
|
||||||
|
"for word in nlp.vocab:\n",
|
||||||
|
" # Ignore words without vectors and mixed-case words:\n",
|
||||||
|
" if word.has_vector:\n",
|
||||||
|
" if word.is_lower:\n",
|
||||||
|
" if word.is_alpha:\n",
|
||||||
|
" similarity = cosine_similarity(new_vector, word.vector)\n",
|
||||||
|
" computed_similarities.append((word, similarity))\n",
|
||||||
|
"\n",
|
||||||
|
"computed_similarities = sorted(computed_similarities, key=lambda item: -item[1])\n",
|
||||||
|
"\n",
|
||||||
|
"print([w[0].text for w in computed_similarities[:10]])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"So in this case, \"king\" was still closer than \"queen\" to our calculated vector, although \"queen\" did show up!"
|
||||||
|
]
|
||||||
|
},
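For reuse, the analogy test above can be wrapped in a small helper. This is only a sketch built from the same pieces defined in the cell above (`nlp` and `cosine_similarity`), not part of the original lesson:

def vector_analogy(a, b, c, topn=10):
    """Return the topn vocabulary words closest to vector(a) - vector(b) + vector(c)."""
    new_vector = nlp.vocab[a].vector - nlp.vocab[b].vector + nlp.vocab[c].vector
    sims = []
    for word in nlp.vocab:
        if word.has_vector and word.is_lower and word.is_alpha:
            sims.append((word.text, cosine_similarity(new_vector, word.vector)))
    return [w for w, s in sorted(sims, key=lambda item: -item[1])[:topn]]

# e.g. vector_analogy('king', 'man', 'woman') should again rank 'king' and 'queen' highly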
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Next up: Sentiment Analysis"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.7"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
@ -0,0 +1,821 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Sentiment Analysis\n",
|
||||||
|
"Now that we've seen word vectors we can start to investigate sentiment analysis. The goal is to find commonalities between documents, with the understanding that similarly *combined* vectors should correspond to similar sentiments.\n",
|
||||||
|
"\n",
|
||||||
|
"While the scope of sentiment analysis is very broad, we will focus our work in two ways.\n",
|
||||||
|
"\n",
|
||||||
|
"### 1. Polarity classification\n",
|
||||||
|
"We won't try to determine if a sentence is objective or subjective, fact or opinion. Rather, we care only if the text expresses a *positive*, *negative* or *neutral* opinion.\n",
|
||||||
|
"### 2. Document level scope\n",
|
||||||
|
"We'll also try to aggregate all of the sentences in a document or paragraph, to arrive at an overall opinion.\n",
|
||||||
|
"### 3. Coarse analysis\n",
|
||||||
|
"We won't try to perform a fine-grained analysis that would determine the degree of positivity/negativity. That is, we're not trying to guess how many stars a reviewer awarded, just whether the review was positive or negative."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Broad Steps:\n",
|
||||||
|
"* First, consider the text being analyzed. A model trained on paragraph-long movie reviews might not be effective on tweets. Make sure to use an appropriate model for the task at hand.\n",
|
||||||
|
"* Next, decide the type of analysis to perform. In the previous section on text classification we used a bag-of-words technique that considered only single tokens, or *unigrams*. Some rudimentary sentiment analysis models go one step further, and consider two-word combinations, or *bigrams*. In this section, we'd like to work with complete sentences, and for this we're going to import a trained NLTK lexicon called *VADER*."
|
||||||
|
]
|
||||||
|
},
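To make the unigram/bigram distinction concrete, here is a brief illustrative sketch using scikit-learn's CountVectorizer (already used earlier in the course); it is not part of the VADER workflow itself:

from sklearn.feature_extraction.text import CountVectorizer

docs = ["this movie was not good", "this movie was good"]

unigrams = CountVectorizer(ngram_range=(1, 1)).fit(docs)
bigrams  = CountVectorizer(ngram_range=(1, 2)).fit(docs)

print(unigrams.get_feature_names())  # single tokens only
print(bigrams.get_feature_names())   # also includes pairs such as 'not good'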
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## NLTK's VADER module\n",
|
||||||
|
"VADER is an NLTK module that provides sentiment scores based on words used (\"completely\" boosts a score, while \"slightly\" reduces it), on capitalization & punctuation (\"GREAT!!!\" is stronger than \"great.\"), and negations (words like \"isn't\" and \"doesn't\" affect the outcome).\n",
|
||||||
|
"<br>To view the source code visit https://www.nltk.org/_modules/nltk/sentiment/vader.html"
|
||||||
|
]
|
||||||
|
},
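A self-contained sketch (not in the original notebook) illustrating the boosting, capitalization/punctuation, and negation effects described above; it assumes the vader_lexicon has already been downloaded, which the next cells take care of:

from nltk.sentiment.vader import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

for text in ['The movie was great.',
             'The movie was GREAT!!!',
             "The movie wasn't great."]:
    print(text, '->', sia.polarity_scores(text)['compound'])

# The capitalized, punctuated version typically scores higher than the plain one,
# and the negated version drops toward the negative side.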
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**Download the VADER lexicon.** You only need to do this once."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[nltk_data] Downloading package vader_lexicon to\n",
|
||||||
|
"[nltk_data] C:\\Users\\Mike\\AppData\\Roaming\\nltk_data...\n",
|
||||||
|
"[nltk_data] Package vader_lexicon is already up-to-date!\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"True"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import nltk\n",
|
||||||
|
"nltk.download('vader_lexicon')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"<div class=\"alert alert-danger\">NOTE: At the time of this writing there's a <a href='https://github.com/nltk/nltk/issues/2053'>known issue</a> with SentimentIntensityAnalyzer that raises a harmless warning on loading<br>\n",
|
||||||
|
"<tt><font color=black> UserWarning: The twython library has not been installed.<br> Some functionality from the twitter package will not be available.</tt>\n",
|
||||||
|
"\n",
|
||||||
|
"This is due to be fixed in an upcoming NLTK release. For now, if you want to avoid it you can (optionally) install the NLTK twitter library with<br>\n",
|
||||||
|
"<tt><font color=black> conda install nltk[twitter]</tt><br>or<br>\n",
|
||||||
|
"<tt><font color=black> pip3 install -U nltk[twitter]</tt></div>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
|
||||||
|
"\n",
|
||||||
|
"sid = SentimentIntensityAnalyzer()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"VADER's `SentimentIntensityAnalyzer()` takes in a string and returns a dictionary of scores in each of four categories:\n",
|
||||||
|
"* negative\n",
|
||||||
|
"* neutral\n",
|
||||||
|
"* positive\n",
|
||||||
|
"* compound *(computed by normalizing the scores above)*"
|
||||||
|
]
|
||||||
|
},
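A short note on the compound score, based on the VADER implementation in NLTK (treat the exact constant as an implementation detail rather than part of this lesson): the summed valence scores x are squashed into the range (-1, 1) roughly as x / sqrt(x^2 + alpha), with alpha = 15. A minimal sketch of that step:

import math

def normalize_compound(summed_valence, alpha=15):
    # Mirrors VADER's normalization: maps an unbounded valence sum into (-1, 1)
    return summed_valence / math.sqrt(summed_valence * summed_valence + alpha)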
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"a = 'This was a good movie.'\n",
|
||||||
|
"sid.polarity_scores(a)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'neg': 0.0, 'neu': 0.425, 'pos': 0.575, 'compound': 0.8877}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"a = 'This was the best, most awesome movie EVER MADE!!!'\n",
|
||||||
|
"sid.polarity_scores(a)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'neg': 0.477, 'neu': 0.523, 'pos': 0.0, 'compound': -0.8074}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"a = 'This was the worst film to ever disgrace the screen.'\n",
|
||||||
|
"sid.polarity_scores(a)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Use VADER to analyze Amazon Reviews\n",
|
||||||
|
"For this exercise we're going to apply `SentimentIntensityAnalyzer` to a dataset of 10,000 Amazon reviews. Like our movie reviews datasets, these are labeled as either \"pos\" or \"neg\". At the end we'll determine the accuracy of our sentiment analysis with VADER."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>label</th>\n",
|
||||||
|
" <th>review</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>Stuning even for the non-gamer: This sound tra...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>The best soundtrack ever to anything.: I'm rea...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>Amazing!: This soundtrack is my favorite music...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>Excellent Soundtrack: I truly like this soundt...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>Remember, Pull Your Jaw Off The Floor After He...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" label review\n",
|
||||||
|
"0 pos Stuning even for the non-gamer: This sound tra...\n",
|
||||||
|
"1 pos The best soundtrack ever to anything.: I'm rea...\n",
|
||||||
|
"2 pos Amazing!: This soundtrack is my favorite music...\n",
|
||||||
|
"3 pos Excellent Soundtrack: I truly like this soundt...\n",
|
||||||
|
"4 pos Remember, Pull Your Jaw Off The Floor After He..."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"\n",
|
||||||
|
"df = pd.read_csv('../TextFiles/amazonreviews.tsv', sep='\\t')\n",
|
||||||
|
"df.head()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"neg 5097\n",
|
||||||
|
"pos 4903\n",
|
||||||
|
"Name: label, dtype: int64"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df['label'].value_counts()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Clean the data (optional):\n",
|
||||||
|
"Recall that our moviereviews.tsv file contained empty records. Let's check to see if any exist in amazonreviews.tsv."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# REMOVE NaN VALUES AND EMPTY STRINGS:\n",
|
||||||
|
"df.dropna(inplace=True)\n",
|
||||||
|
"\n",
|
||||||
|
"blanks = [] # start with an empty list\n",
|
||||||
|
"\n",
|
||||||
|
"for i,lb,rv in df.itertuples(): # iterate over the DataFrame\n",
|
||||||
|
" if type(rv)==str: # avoid NaN values\n",
|
||||||
|
" if rv.isspace(): # test 'review' for whitespace\n",
|
||||||
|
" blanks.append(i) # add matching index numbers to the list\n",
|
||||||
|
"\n",
|
||||||
|
"df.drop(blanks, inplace=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"neg 5097\n",
|
||||||
|
"pos 4903\n",
|
||||||
|
"Name: label, dtype: int64"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df['label'].value_counts()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"In this case there were no empty records. Good!"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Let's run the first review through VADER"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'compound': 0.9454}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"sid.polarity_scores(df.loc[0]['review'])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'pos'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df.loc[0]['label']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Great! Our first review was labeled \"positive\", and earned a positive compound score."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Adding Scores and Labels to the DataFrame\n",
|
||||||
|
"In this next section we'll add columns to the original DataFrame to store polarity_score dictionaries, extracted compound scores, and new \"pos/neg\" labels derived from the compound score. We'll use this last column to perform an accuracy test."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>label</th>\n",
|
||||||
|
" <th>review</th>\n",
|
||||||
|
" <th>scores</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>Stuning even for the non-gamer: This sound tra...</td>\n",
|
||||||
|
" <td>{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>The best soundtrack ever to anything.: I'm rea...</td>\n",
|
||||||
|
" <td>{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>Amazing!: This soundtrack is my favorite music...</td>\n",
|
||||||
|
" <td>{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>Excellent Soundtrack: I truly like this soundt...</td>\n",
|
||||||
|
" <td>{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>Remember, Pull Your Jaw Off The Floor After He...</td>\n",
|
||||||
|
" <td>{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" label review \\\n",
|
||||||
|
"0 pos Stuning even for the non-gamer: This sound tra... \n",
|
||||||
|
"1 pos The best soundtrack ever to anything.: I'm rea... \n",
|
||||||
|
"2 pos Amazing!: This soundtrack is my favorite music... \n",
|
||||||
|
"3 pos Excellent Soundtrack: I truly like this soundt... \n",
|
||||||
|
"4 pos Remember, Pull Your Jaw Off The Floor After He... \n",
|
||||||
|
"\n",
|
||||||
|
" scores \n",
|
||||||
|
"0 {'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co... \n",
|
||||||
|
"1 {'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co... \n",
|
||||||
|
"2 {'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com... \n",
|
||||||
|
"3 {'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com... \n",
|
||||||
|
"4 {'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp... "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df['scores'] = df['review'].apply(lambda review: sid.polarity_scores(review))\n",
|
||||||
|
"\n",
|
||||||
|
"df.head()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>label</th>\n",
|
||||||
|
" <th>review</th>\n",
|
||||||
|
" <th>scores</th>\n",
|
||||||
|
" <th>compound</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>Stuning even for the non-gamer: This sound tra...</td>\n",
|
||||||
|
" <td>{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...</td>\n",
|
||||||
|
" <td>0.9454</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>The best soundtrack ever to anything.: I'm rea...</td>\n",
|
||||||
|
" <td>{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...</td>\n",
|
||||||
|
" <td>0.8957</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>Amazing!: This soundtrack is my favorite music...</td>\n",
|
||||||
|
" <td>{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...</td>\n",
|
||||||
|
" <td>0.9858</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>Excellent Soundtrack: I truly like this soundt...</td>\n",
|
||||||
|
" <td>{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...</td>\n",
|
||||||
|
" <td>0.9814</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>Remember, Pull Your Jaw Off The Floor After He...</td>\n",
|
||||||
|
" <td>{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...</td>\n",
|
||||||
|
" <td>0.9781</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" label review \\\n",
|
||||||
|
"0 pos Stuning even for the non-gamer: This sound tra... \n",
|
||||||
|
"1 pos The best soundtrack ever to anything.: I'm rea... \n",
|
||||||
|
"2 pos Amazing!: This soundtrack is my favorite music... \n",
|
||||||
|
"3 pos Excellent Soundtrack: I truly like this soundt... \n",
|
||||||
|
"4 pos Remember, Pull Your Jaw Off The Floor After He... \n",
|
||||||
|
"\n",
|
||||||
|
" scores compound \n",
|
||||||
|
"0 {'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co... 0.9454 \n",
|
||||||
|
"1 {'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co... 0.8957 \n",
|
||||||
|
"2 {'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com... 0.9858 \n",
|
||||||
|
"3 {'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com... 0.9814 \n",
|
||||||
|
"4 {'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp... 0.9781 "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df['compound'] = df['scores'].apply(lambda score_dict: score_dict['compound'])\n",
|
||||||
|
"\n",
|
||||||
|
"df.head()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>label</th>\n",
|
||||||
|
" <th>review</th>\n",
|
||||||
|
" <th>scores</th>\n",
|
||||||
|
" <th>compound</th>\n",
|
||||||
|
" <th>comp_score</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>Stuning even for the non-gamer: This sound tra...</td>\n",
|
||||||
|
" <td>{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...</td>\n",
|
||||||
|
" <td>0.9454</td>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>The best soundtrack ever to anything.: I'm rea...</td>\n",
|
||||||
|
" <td>{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...</td>\n",
|
||||||
|
" <td>0.8957</td>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>Amazing!: This soundtrack is my favorite music...</td>\n",
|
||||||
|
" <td>{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...</td>\n",
|
||||||
|
" <td>0.9858</td>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>Excellent Soundtrack: I truly like this soundt...</td>\n",
|
||||||
|
" <td>{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...</td>\n",
|
||||||
|
" <td>0.9814</td>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>Remember, Pull Your Jaw Off The Floor After He...</td>\n",
|
||||||
|
" <td>{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...</td>\n",
|
||||||
|
" <td>0.9781</td>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" label review \\\n",
|
||||||
|
"0 pos Stuning even for the non-gamer: This sound tra... \n",
|
||||||
|
"1 pos The best soundtrack ever to anything.: I'm rea... \n",
|
||||||
|
"2 pos Amazing!: This soundtrack is my favorite music... \n",
|
||||||
|
"3 pos Excellent Soundtrack: I truly like this soundt... \n",
|
||||||
|
"4 pos Remember, Pull Your Jaw Off The Floor After He... \n",
|
||||||
|
"\n",
|
||||||
|
" scores compound comp_score \n",
|
||||||
|
"0 {'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co... 0.9454 pos \n",
|
||||||
|
"1 {'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co... 0.8957 pos \n",
|
||||||
|
"2 {'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com... 0.9858 pos \n",
|
||||||
|
"3 {'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com... 0.9814 pos \n",
|
||||||
|
"4 {'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp... 0.9781 pos "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 14,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df['comp_score'] = df['compound'].apply(lambda c: 'pos' if c >=0 else 'neg')\n",
|
||||||
|
"\n",
|
||||||
|
"df.head()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Report on Accuracy\n",
|
||||||
|
"Finally, we'll use scikit-learn to determine how close VADER came to our original 10,000 labels."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 15,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.metrics import accuracy_score,classification_report,confusion_matrix"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 16,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"0.7091"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 16,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"accuracy_score(df['label'],df['comp_score'])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 17,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" precision recall f1-score support\n",
|
||||||
|
"\n",
|
||||||
|
" neg 0.86 0.51 0.64 5097\n",
|
||||||
|
" pos 0.64 0.91 0.75 4903\n",
|
||||||
|
"\n",
|
||||||
|
" micro avg 0.71 0.71 0.71 10000\n",
|
||||||
|
" macro avg 0.75 0.71 0.70 10000\n",
|
||||||
|
"weighted avg 0.75 0.71 0.70 10000\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(classification_report(df['label'],df['comp_score']))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 18,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[[2623 2474]\n",
|
||||||
|
" [ 435 4468]]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(confusion_matrix(df['label'],df['comp_score']))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"This tells us that VADER correctly identified an Amazon review as \"positive\" or \"negative\" roughly 71% of the time.\n",
|
||||||
|
"## Up Next: Sentiment Analysis Project"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.7"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
@ -0,0 +1,407 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Sentiment Analysis Project\n",
|
||||||
|
"For this project, we'll perform the same type of NLTK VADER sentiment analysis, this time on our movie reviews dataset.\n",
|
||||||
|
"\n",
|
||||||
|
"The 2,000 record IMDb movie review database is accessible through NLTK directly with\n",
|
||||||
|
"<pre>from nltk.corpus import movie_reviews</pre>\n",
|
||||||
|
"\n",
|
||||||
|
"However, since we already have it in a tab-delimited file we'll use that instead."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Load the Data"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>label</th>\n",
|
||||||
|
" <th>review</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>neg</td>\n",
|
||||||
|
" <td>how do films like mouse hunt get into theatres...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>neg</td>\n",
|
||||||
|
" <td>some talented actresses are blessed with a dem...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>this has been an extraordinary year for austra...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>according to hollywood movies made in last few...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>neg</td>\n",
|
||||||
|
" <td>my first press screening of 1998 and already i...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" label review\n",
|
||||||
|
"0 neg how do films like mouse hunt get into theatres...\n",
|
||||||
|
"1 neg some talented actresses are blessed with a dem...\n",
|
||||||
|
"2 pos this has been an extraordinary year for austra...\n",
|
||||||
|
"3 pos according to hollywood movies made in last few...\n",
|
||||||
|
"4 neg my first press screening of 1998 and already i..."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"\n",
|
||||||
|
"df = pd.read_csv('../TextFiles/moviereviews.tsv', sep='\\t')\n",
|
||||||
|
"df.head()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Remove Blank Records (optional)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# REMOVE NaN VALUES AND EMPTY STRINGS:\n",
|
||||||
|
"df.dropna(inplace=True)\n",
|
||||||
|
"\n",
|
||||||
|
"blanks = [] # start with an empty list\n",
|
||||||
|
"\n",
|
||||||
|
"for i,lb,rv in df.itertuples(): # iterate over the DataFrame\n",
|
||||||
|
" if type(rv)==str: # avoid NaN values\n",
|
||||||
|
" if rv.isspace(): # test 'review' for whitespace\n",
|
||||||
|
" blanks.append(i) # add matching index numbers to the list\n",
|
||||||
|
"\n",
|
||||||
|
"df.drop(blanks, inplace=True)"
|
||||||
|
]
|
||||||
|
},
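  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*Added note:* a minimal vectorized alternative to the itertuples loop above, shown only as a sketch; it keeps rows whose review text is non-empty after stripping whitespace."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Equivalent pandas one-liner (commented out so the original approach stays in effect):\n",
    "# df = df[df['review'].str.strip().astype(bool)]"
   ]
  },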
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"pos 969\n",
|
||||||
|
"neg 969\n",
|
||||||
|
"Name: label, dtype: int64"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df['label'].value_counts()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Import `SentimentIntensityAnalyzer` and create an sid object\n",
|
||||||
|
"This assumes that the VADER lexicon has been downloaded."
|
||||||
|
]
|
||||||
|
},
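  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Added note: if the VADER lexicon is not yet available on this machine, it can be\n",
    "# fetched once with NLTK's downloader (uncomment to run):\n",
    "# import nltk\n",
    "# nltk.download('vader_lexicon')"
   ]
  },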
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
|
||||||
|
"\n",
|
||||||
|
"sid = SentimentIntensityAnalyzer()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Use sid to append a `comp_score` to the dataset"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>label</th>\n",
|
||||||
|
" <th>review</th>\n",
|
||||||
|
" <th>scores</th>\n",
|
||||||
|
" <th>compound</th>\n",
|
||||||
|
" <th>comp_score</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>neg</td>\n",
|
||||||
|
" <td>how do films like mouse hunt get into theatres...</td>\n",
|
||||||
|
" <td>{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...</td>\n",
|
||||||
|
" <td>-0.9125</td>\n",
|
||||||
|
" <td>neg</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>neg</td>\n",
|
||||||
|
" <td>some talented actresses are blessed with a dem...</td>\n",
|
||||||
|
" <td>{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...</td>\n",
|
||||||
|
" <td>-0.8618</td>\n",
|
||||||
|
" <td>neg</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>this has been an extraordinary year for austra...</td>\n",
|
||||||
|
" <td>{'neg': 0.067, 'neu': 0.783, 'pos': 0.15, 'com...</td>\n",
|
||||||
|
" <td>0.9953</td>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>according to hollywood movies made in last few...</td>\n",
|
||||||
|
" <td>{'neg': 0.069, 'neu': 0.786, 'pos': 0.145, 'co...</td>\n",
|
||||||
|
" <td>0.9972</td>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>neg</td>\n",
|
||||||
|
" <td>my first press screening of 1998 and already i...</td>\n",
|
||||||
|
" <td>{'neg': 0.09, 'neu': 0.822, 'pos': 0.088, 'com...</td>\n",
|
||||||
|
" <td>-0.7264</td>\n",
|
||||||
|
" <td>neg</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" label review \\\n",
|
||||||
|
"0 neg how do films like mouse hunt get into theatres... \n",
|
||||||
|
"1 neg some talented actresses are blessed with a dem... \n",
|
||||||
|
"2 pos this has been an extraordinary year for austra... \n",
|
||||||
|
"3 pos according to hollywood movies made in last few... \n",
|
||||||
|
"4 neg my first press screening of 1998 and already i... \n",
|
||||||
|
"\n",
|
||||||
|
" scores compound comp_score \n",
|
||||||
|
"0 {'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co... -0.9125 neg \n",
|
||||||
|
"1 {'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com... -0.8618 neg \n",
|
||||||
|
"2 {'neg': 0.067, 'neu': 0.783, 'pos': 0.15, 'com... 0.9953 pos \n",
|
||||||
|
"3 {'neg': 0.069, 'neu': 0.786, 'pos': 0.145, 'co... 0.9972 pos \n",
|
||||||
|
"4 {'neg': 0.09, 'neu': 0.822, 'pos': 0.088, 'com... -0.7264 neg "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df['scores'] = df['review'].apply(lambda review: sid.polarity_scores(review))\n",
|
||||||
|
"\n",
|
||||||
|
"df['compound'] = df['scores'].apply(lambda score_dict: score_dict['compound'])\n",
|
||||||
|
"\n",
|
||||||
|
"df['comp_score'] = df['compound'].apply(lambda c: 'pos' if c >=0 else 'neg')\n",
|
||||||
|
"\n",
|
||||||
|
"df.head()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Perform a comparison analysis between the original `label` and `comp_score`"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.metrics import accuracy_score,classification_report,confusion_matrix"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"0.6367389060887513"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"accuracy_score(df['label'],df['comp_score'])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" precision recall f1-score support\n",
|
||||||
|
"\n",
|
||||||
|
" neg 0.72 0.44 0.55 969\n",
|
||||||
|
" pos 0.60 0.83 0.70 969\n",
|
||||||
|
"\n",
|
||||||
|
" micro avg 0.64 0.64 0.64 1938\n",
|
||||||
|
" macro avg 0.66 0.64 0.62 1938\n",
|
||||||
|
"weighted avg 0.66 0.64 0.62 1938\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(classification_report(df['label'],df['comp_score']))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[[427 542]\n",
|
||||||
|
" [162 807]]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(confusion_matrix(df['label'],df['comp_score']))"
|
||||||
|
]
|
||||||
|
},
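  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*Added illustration (hypothetical review, not part of the original lesson):* a review can sound positive sentence by sentence and still be negative overall, which is hard for a lexicon-based scorer like VADER to catch."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A made-up review that praises the film throughout and only pans it at the end;\n",
    "# VADER's word-level scoring tends to rate text like this as positive overall.\n",
    "tricky = 'The cast is wonderful, the cinematography is gorgeous, and the score is beautiful. Sadly, the film is a failure.'\n",
    "sid.polarity_scores(tricky)"
   ]
  },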
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"So, it looks like VADER couldn't judge the movie reviews very accurately. This demonstrates one of the biggest challenges in sentiment analysis - understanding human semantics. Many of the reviews had positive things to say about a movie, reserving final judgement to the last sentence.\n",
|
||||||
|
"## Great Job!"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.7"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
@ -0,0 +1,227 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Sentiment Analysis Assessment - Solution\n",
|
||||||
|
"\n",
|
||||||
|
"## Task #1: Perform vector arithmetic on your own words\n",
|
||||||
|
"Write code that evaluates vector arithmetic on your own set of related words. The goal is to come as close to an expected word as possible. Please feel free to share success stories in the Q&A Forum for this section!"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Import spaCy and load the language library. Remember to use a larger model!\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Choose the words you wish to compare, and obtain their vectors\n",
|
||||||
|
"\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Import spatial and define a cosine_similarity function\n",
|
||||||
|
"\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Write an expression for vector arithmetic\n",
|
||||||
|
"# For example: new_vector = word1 - word2 + word3\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# List the top ten closest vectors in the vocabulary to the result of the expression above\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"#### CHALLENGE: Write a function that takes in 3 strings, performs a-b+c arithmetic, and returns a top-ten result"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def vector_math(a,b,c):\n",
|
||||||
|
" \n",
|
||||||
|
" \n",
|
||||||
|
" \n",
|
||||||
|
" \n",
|
||||||
|
" \n",
|
||||||
|
" \n",
|
||||||
|
" \n",
|
||||||
|
" \n",
|
||||||
|
" \n",
|
||||||
|
" \n",
|
||||||
|
" \n",
|
||||||
|
" \n",
|
||||||
|
" "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Test the function on known words:\n",
|
||||||
|
"vector_math('king','man','woman')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Task #2: Perform VADER Sentiment Analysis on your own review\n",
|
||||||
|
"Write code that returns a set of SentimentIntensityAnalyzer polarity scores based on your own written review."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Import SentimentIntensityAnalyzer and create an sid object\n",
|
||||||
|
"\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Write a review as one continuous string (multiple sentences are ok)\n",
|
||||||
|
"review = ''"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Obtain the sid scores for your review\n",
|
||||||
|
"sid.polarity_scores(review)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### CHALLENGE: Write a function that takes in a review and returns a score of \"Positive\", \"Negative\" or \"Neutral\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def review_rating(string):\n",
|
||||||
|
" \n",
|
||||||
|
" \n",
|
||||||
|
" \n",
|
||||||
|
" \n",
|
||||||
|
" \n",
|
||||||
|
" \n",
|
||||||
|
" "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Test the function on your review above:\n",
|
||||||
|
"review_rating(review)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Great job!"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.7"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
@ -0,0 +1,283 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Sentiment Analysis Assessment - Solution\n",
|
||||||
|
"\n",
|
||||||
|
"## Task #1: Perform vector arithmetic on your own words\n",
|
||||||
|
"Write code that evaluates vector arithmetic on your own set of related words. The goal is to come as close to an expected word as possible. Please feel free to share success stories in the Q&A Forum for this section!"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Import spaCy and load the language library. Remember to use a larger model!\n",
|
||||||
|
"import spacy\n",
|
||||||
|
"nlp = spacy.load('en_core_web_md')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Choose the words you wish to compare, and obtain their vectors\n",
|
||||||
|
"word1 = nlp.vocab['wolf'].vector\n",
|
||||||
|
"word2 = nlp.vocab['dog'].vector\n",
|
||||||
|
"word3 = nlp.vocab['cat'].vector"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Import spatial and define a cosine_similarity function\n",
|
||||||
|
"from scipy import spatial\n",
|
||||||
|
"\n",
|
||||||
|
"cosine_similarity = lambda x, y: 1 - spatial.distance.cosine(x, y)"
|
||||||
|
]
|
||||||
|
},
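  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*Added quick check (sketch only, output not shown):* the helper can be sanity-checked on the vectors chosen above; identical vectors score 1.0 and unrelated ones fall toward 0."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Uncomment to inspect how similar two of the chosen word vectors are:\n",
    "# cosine_similarity(word1, word2)"
   ]
  },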
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Write an expression for vector arithmetic\n",
|
||||||
|
"# For example: new_vector = word1 - word2 + word3\n",
|
||||||
|
"new_vector = word1 - word2 + word3"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"['maned', 'wolfs', 'wolf', 'lynx', 'wolve', 'yotes', 'canids', 'boars', 'foxes', 'wolfdogs']\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# List the top ten closest vectors in the vocabulary to the result of the expression above\n",
|
||||||
|
"computed_similarities = []\n",
|
||||||
|
"\n",
|
||||||
|
"for word in nlp.vocab:\n",
|
||||||
|
" if word.has_vector:\n",
|
||||||
|
" if word.is_lower:\n",
|
||||||
|
" if word.is_alpha:\n",
|
||||||
|
" similarity = cosine_similarity(new_vector, word.vector)\n",
|
||||||
|
" computed_similarities.append((word, similarity))\n",
|
||||||
|
"\n",
|
||||||
|
"computed_similarities = sorted(computed_similarities, key=lambda item: -item[1])\n",
|
||||||
|
"\n",
|
||||||
|
"print([w[0].text for w in computed_similarities[:10]])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"#### CHALLENGE: Write a function that takes in 3 strings, performs a-b+c arithmetic, and returns a top-ten result"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def vector_math(a,b,c):\n",
|
||||||
|
" new_vector = nlp.vocab[a].vector - nlp.vocab[b].vector + nlp.vocab[c].vector\n",
|
||||||
|
" computed_similarities = []\n",
|
||||||
|
"\n",
|
||||||
|
" for word in nlp.vocab:\n",
|
||||||
|
" if word.has_vector:\n",
|
||||||
|
" if word.is_lower:\n",
|
||||||
|
" if word.is_alpha:\n",
|
||||||
|
" similarity = cosine_similarity(new_vector, word.vector)\n",
|
||||||
|
" computed_similarities.append((word, similarity))\n",
|
||||||
|
"\n",
|
||||||
|
" computed_similarities = sorted(computed_similarities, key=lambda item: -item[1])\n",
|
||||||
|
"\n",
|
||||||
|
" return [w[0].text for w in computed_similarities[:10]]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['king',\n",
|
||||||
|
" 'queen',\n",
|
||||||
|
" 'commoner',\n",
|
||||||
|
" 'highness',\n",
|
||||||
|
" 'prince',\n",
|
||||||
|
" 'sultan',\n",
|
||||||
|
" 'maharajas',\n",
|
||||||
|
" 'princes',\n",
|
||||||
|
" 'kumbia',\n",
|
||||||
|
" 'kings']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Test the function on known words:\n",
|
||||||
|
"vector_math('king','man','woman')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Task #2: Perform VADER Sentiment Analysis on your own review\n",
|
||||||
|
"Write code that returns a set of SentimentIntensityAnalyzer polarity scores based on your own written review."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Import SentimentIntensityAnalyzer and create an sid object\n",
|
||||||
|
"from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
|
||||||
|
"\n",
|
||||||
|
"sid = SentimentIntensityAnalyzer()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Write a review as one continuous string (multiple sentences are ok)\n",
|
||||||
|
"review = 'This movie portrayed real people, and was based on actual events.'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Obtain the sid scores for your review\n",
|
||||||
|
"sid.polarity_scores(review)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### CHALLENGE: Write a function that takes in a review and returns a score of \"Positive\", \"Negative\" or \"Neutral\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def review_rating(string):\n",
|
||||||
|
" scores = sid.polarity_scores(string)\n",
|
||||||
|
" if scores['compound'] == 0:\n",
|
||||||
|
" return 'Neutral'\n",
|
||||||
|
" elif scores['compound'] > 0:\n",
|
||||||
|
" return 'Positive'\n",
|
||||||
|
" else:\n",
|
||||||
|
" return 'Negative'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'Neutral'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# Test the function on your review above:\n",
|
||||||
|
"review_rating(review)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Great job!"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.7"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
@ -0,0 +1,564 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Possible Approach\n",
|
||||||
|
"\n",
|
||||||
|
"## Hypothesis, does adding Pos, Neg, and Neu values from Sentiment Analysis improve the original model??"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 24,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>label</th>\n",
|
||||||
|
" <th>review</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>neg</td>\n",
|
||||||
|
" <td>how do films like mouse hunt get into theatres...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>neg</td>\n",
|
||||||
|
" <td>some talented actresses are blessed with a dem...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>this has been an extraordinary year for austra...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>according to hollywood movies made in last few...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>neg</td>\n",
|
||||||
|
" <td>my first press screening of 1998 and already i...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" label review\n",
|
||||||
|
"0 neg how do films like mouse hunt get into theatres...\n",
|
||||||
|
"1 neg some talented actresses are blessed with a dem...\n",
|
||||||
|
"2 pos this has been an extraordinary year for austra...\n",
|
||||||
|
"3 pos according to hollywood movies made in last few...\n",
|
||||||
|
"4 neg my first press screening of 1998 and already i..."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 24,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import numpy as np\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"\n",
|
||||||
|
"df = pd.read_csv('../TextFiles/moviereviews.tsv', sep='\\t')\n",
|
||||||
|
"df.head()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 25,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# REMOVE NaN VALUES AND EMPTY STRINGS:\n",
|
||||||
|
"df.dropna(inplace=True)\n",
|
||||||
|
"\n",
|
||||||
|
"blanks = [] # start with an empty list\n",
|
||||||
|
"\n",
|
||||||
|
"for i,lb,rv in df.itertuples(): # iterate over the DataFrame\n",
|
||||||
|
" if type(rv)==str: # avoid NaN values\n",
|
||||||
|
" if rv.isspace(): # test 'review' for whitespace\n",
|
||||||
|
" blanks.append(i) # add matching index numbers to the list\n",
|
||||||
|
"\n",
|
||||||
|
"df.drop(blanks, inplace=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 26,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
|
||||||
|
"\n",
|
||||||
|
"sid = SentimentIntensityAnalyzer()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 27,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>label</th>\n",
|
||||||
|
" <th>review</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>neg</td>\n",
|
||||||
|
" <td>how do films like mouse hunt get into theatres...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>neg</td>\n",
|
||||||
|
" <td>some talented actresses are blessed with a dem...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>this has been an extraordinary year for austra...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>according to hollywood movies made in last few...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>neg</td>\n",
|
||||||
|
" <td>my first press screening of 1998 and already i...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" label review\n",
|
||||||
|
"0 neg how do films like mouse hunt get into theatres...\n",
|
||||||
|
"1 neg some talented actresses are blessed with a dem...\n",
|
||||||
|
"2 pos this has been an extraordinary year for austra...\n",
|
||||||
|
"3 pos according to hollywood movies made in last few...\n",
|
||||||
|
"4 neg my first press screening of 1998 and already i..."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 27,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df.head()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 28,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"df['scores'] = df['review'].apply(lambda review: sid.polarity_scores(review))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 29,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>label</th>\n",
|
||||||
|
" <th>review</th>\n",
|
||||||
|
" <th>scores</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>neg</td>\n",
|
||||||
|
" <td>how do films like mouse hunt get into theatres...</td>\n",
|
||||||
|
" <td>{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>neg</td>\n",
|
||||||
|
" <td>some talented actresses are blessed with a dem...</td>\n",
|
||||||
|
" <td>{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>this has been an extraordinary year for austra...</td>\n",
|
||||||
|
" <td>{'neg': 0.067, 'neu': 0.783, 'pos': 0.15, 'com...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>according to hollywood movies made in last few...</td>\n",
|
||||||
|
" <td>{'neg': 0.069, 'neu': 0.786, 'pos': 0.145, 'co...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>neg</td>\n",
|
||||||
|
" <td>my first press screening of 1998 and already i...</td>\n",
|
||||||
|
" <td>{'neg': 0.09, 'neu': 0.822, 'pos': 0.088, 'com...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" label review \\\n",
|
||||||
|
"0 neg how do films like mouse hunt get into theatres... \n",
|
||||||
|
"1 neg some talented actresses are blessed with a dem... \n",
|
||||||
|
"2 pos this has been an extraordinary year for austra... \n",
|
||||||
|
"3 pos according to hollywood movies made in last few... \n",
|
||||||
|
"4 neg my first press screening of 1998 and already i... \n",
|
||||||
|
"\n",
|
||||||
|
" scores \n",
|
||||||
|
"0 {'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co... \n",
|
||||||
|
"1 {'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com... \n",
|
||||||
|
"2 {'neg': 0.067, 'neu': 0.783, 'pos': 0.15, 'com... \n",
|
||||||
|
"3 {'neg': 0.069, 'neu': 0.786, 'pos': 0.145, 'co... \n",
|
||||||
|
"4 {'neg': 0.09, 'neu': 0.822, 'pos': 0.088, 'com... "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 29,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df.head()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 30,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"df['positive'] = df['scores'].apply(lambda score_dict: score_dict['pos'])\n",
|
||||||
|
"df['negative'] = df['scores'].apply(lambda score_dict: score_dict['neg'])\n",
|
||||||
|
"df['neutral'] = df['scores'].apply(lambda score_dict: score_dict['neu'])\n",
|
||||||
|
"df['compound'] =df['scores'].apply(lambda score_dict: score_dict['compound'])"
|
||||||
|
]
|
||||||
|
},
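  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*Added sketch (not executed here):* one way to actually test the hypothesis above is to train a text classifier twice, once on TF-IDF features alone and once on TF-IDF features concatenated with these VADER columns, and compare accuracies. The classifier choice and parameter values below are assumptions for illustration only."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Minimal sketch using standard scikit-learn/scipy tools; not part of the original notebook.\n",
    "# from sklearn.model_selection import train_test_split\n",
    "# from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "# from sklearn.svm import LinearSVC\n",
    "# from sklearn.metrics import accuracy_score\n",
    "# from scipy.sparse import hstack\n",
    "#\n",
    "# vader_cols = ['positive', 'negative', 'neutral', 'compound']\n",
    "# X_train, X_test, y_train, y_test = train_test_split(\n",
    "#     df[['review'] + vader_cols], df['label'], test_size=0.33, random_state=42)\n",
    "#\n",
    "# tfidf = TfidfVectorizer()\n",
    "# X_train_text = tfidf.fit_transform(X_train['review'])\n",
    "# X_test_text = tfidf.transform(X_test['review'])\n",
    "#\n",
    "# # Text-only baseline vs. text plus the VADER score columns:\n",
    "# X_train_both = hstack([X_train_text, X_train[vader_cols].values])\n",
    "# X_test_both = hstack([X_test_text, X_test[vader_cols].values])\n",
    "#\n",
    "# for Xtr, Xte in [(X_train_text, X_test_text), (X_train_both, X_test_both)]:\n",
    "#     clf = LinearSVC().fit(Xtr, y_train)\n",
    "#     print(accuracy_score(y_test, clf.predict(Xte)))"
   ]
  },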
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 31,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>label</th>\n",
|
||||||
|
" <th>review</th>\n",
|
||||||
|
" <th>scores</th>\n",
|
||||||
|
" <th>positive</th>\n",
|
||||||
|
" <th>negative</th>\n",
|
||||||
|
" <th>neutral</th>\n",
|
||||||
|
" <th>compound</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>neg</td>\n",
|
||||||
|
" <td>how do films like mouse hunt get into theatres...</td>\n",
|
||||||
|
" <td>{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...</td>\n",
|
||||||
|
" <td>0.101</td>\n",
|
||||||
|
" <td>0.121</td>\n",
|
||||||
|
" <td>0.778</td>\n",
|
||||||
|
" <td>-0.9125</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>neg</td>\n",
|
||||||
|
" <td>some talented actresses are blessed with a dem...</td>\n",
|
||||||
|
" <td>{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...</td>\n",
|
||||||
|
" <td>0.105</td>\n",
|
||||||
|
" <td>0.120</td>\n",
|
||||||
|
" <td>0.775</td>\n",
|
||||||
|
" <td>-0.8618</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>this has been an extraordinary year for austra...</td>\n",
|
||||||
|
" <td>{'neg': 0.067, 'neu': 0.783, 'pos': 0.15, 'com...</td>\n",
|
||||||
|
" <td>0.150</td>\n",
|
||||||
|
" <td>0.067</td>\n",
|
||||||
|
" <td>0.783</td>\n",
|
||||||
|
" <td>0.9953</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>pos</td>\n",
|
||||||
|
" <td>according to hollywood movies made in last few...</td>\n",
|
||||||
|
" <td>{'neg': 0.069, 'neu': 0.786, 'pos': 0.145, 'co...</td>\n",
|
||||||
|
" <td>0.145</td>\n",
|
||||||
|
" <td>0.069</td>\n",
|
||||||
|
" <td>0.786</td>\n",
|
||||||
|
" <td>0.9972</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>neg</td>\n",
|
||||||
|
" <td>my first press screening of 1998 and already i...</td>\n",
|
||||||
|
" <td>{'neg': 0.09, 'neu': 0.822, 'pos': 0.088, 'com...</td>\n",
|
||||||
|
" <td>0.088</td>\n",
|
||||||
|
" <td>0.090</td>\n",
|
||||||
|
" <td>0.822</td>\n",
|
||||||
|
" <td>-0.7264</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" label review \\\n",
|
||||||
|
"0 neg how do films like mouse hunt get into theatres... \n",
|
||||||
|
"1 neg some talented actresses are blessed with a dem... \n",
|
||||||
|
"2 pos this has been an extraordinary year for austra... \n",
|
||||||
|
"3 pos according to hollywood movies made in last few... \n",
|
||||||
|
"4 neg my first press screening of 1998 and already i... \n",
|
||||||
|
"\n",
|
||||||
|
" scores positive negative \\\n",
|
||||||
|
"0 {'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co... 0.101 0.121 \n",
|
||||||
|
"1 {'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com... 0.105 0.120 \n",
|
||||||
|
"2 {'neg': 0.067, 'neu': 0.783, 'pos': 0.15, 'com... 0.150 0.067 \n",
|
||||||
|
"3 {'neg': 0.069, 'neu': 0.786, 'pos': 0.145, 'co... 0.145 0.069 \n",
|
||||||
|
"4 {'neg': 0.09, 'neu': 0.822, 'pos': 0.088, 'com... 0.088 0.090 \n",
|
||||||
|
"\n",
|
||||||
|
" neutral compound \n",
|
||||||
|
"0 0.778 -0.9125 \n",
|
||||||
|
"1 0.775 -0.8618 \n",
|
||||||
|
"2 0.783 0.9953 \n",
|
||||||
|
"3 0.786 0.9972 \n",
|
||||||
|
"4 0.822 -0.7264 "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 31,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df.head()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 33,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"here's a rarity : a children's film that attempts to tackle a weighty subject , is there a god ? \r\n",
|
||||||
|
"done well , it could have been a gem among the wasteland of modern children's cinema . \r\n",
|
||||||
|
"unfortunately , it isn't . \r\n",
|
||||||
|
"with jumbled messages , and an unclear audience , wide awake was better left asleep . \r\n",
|
||||||
|
"fifth grader joshua beal ( joseph cross ) is in the middle of a moral crisis . \r\n",
|
||||||
|
"his beloved grandfather ( robert loggia ) has died , and joshua has begun a quest . \r\n",
|
||||||
|
"he wants to find god , to discover why bad things happen . \r\n",
|
||||||
|
"this religious quest is slightly disturbing for his parents ( dana delany and denis leary ) , but they do their best to cope with their son as he explores different religious faiths . \r\n",
|
||||||
|
"at his catholic school , his favorite teacher , sister terry ( rosie o'donnell ) , tries to give him guidance , but this is a journey he must make on his own . \r\n",
|
||||||
|
"meanwhile , he is having the most momentous year of his life . \r\n",
|
||||||
|
"he has several adventures with his daredevil best friend dave ( timothy reifsnyder ) , he gets his first crush , and begins to wake up to the world around him while he is on his spiritual journey . \r\n",
|
||||||
|
"it is somewhat confusing as to what the real audience for wide awake is expected to be . \r\n",
|
||||||
|
"on its surface , it appears to be a kid's film . \r\n",
|
||||||
|
"however , it deals with serious issues , and is likely to be boring for today's instant-gratification kids . \r\n",
|
||||||
|
"and while it might seem heartening to see that someone is trying to produce something thoughtful for the kidvid audience , wide awake asks serious questions , but only delivers a cheap gimmick for an answer . \r\n",
|
||||||
|
"if there were a bit more meat in the story , adults on a nostalgic bent might get a kick out of the movie . \r\n",
|
||||||
|
"the actors who might have created a great cast ( o'donnell , leary and delany ) are wasted in roles that amount to little more than cameos . \r\n",
|
||||||
|
"the nostalgic elements ( best friend , favorite teacher , first crush , etc . ) have been done much better in other movies , and actually seem more like filler here . \r\n",
|
||||||
|
"the film's strongest scenes are some touching flashbacks depicting joshua's relationship with his grandfather . \r\n",
|
||||||
|
"they show more depth than is present anywhere else in the movie . \r\n",
|
||||||
|
"maybe the film would have been better if , instead of playing the relationship through flashbacks , it were set entirely during joshua's last year with his grandpa . \r\n",
|
||||||
|
"it certainly would have been more entertaining . \r\n",
|
||||||
|
"wide awake can best be described as a failed experiment . \r\n",
|
||||||
|
"it starts out with noble aspirations , but never delivers on its promise . \r\n",
|
||||||
|
"parents who do take their children to see this one ought to be prepared to answer some tough questions . . . that is if their kids aren't bored to death first . \r\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(df.iloc[15]['review'])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 16,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.metrics import accuracy_score,classification_report,confusion_matrix"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 17,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"0.6367389060887513"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 17,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"accuracy_score(df['label'],df['comp_score'])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 18,
|
||||||
|
"metadata": {
|
||||||
|
"scrolled": true
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" precision recall f1-score support\n",
|
||||||
|
"\n",
|
||||||
|
" neg 0.72 0.44 0.55 969\n",
|
||||||
|
" pos 0.60 0.83 0.70 969\n",
|
||||||
|
"\n",
|
||||||
|
" micro avg 0.64 0.64 0.64 1938\n",
|
||||||
|
" macro avg 0.66 0.64 0.62 1938\n",
|
||||||
|
"weighted avg 0.66 0.64 0.62 1938\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(classification_report(df['label'],df['comp_score']))"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.7"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
File diff suppressed because it is too large
File diff suppressed because it is too large
@ -0,0 +1,545 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"# Topic Modeling Assessment Project\n",
|
||||||
|
"\n",
|
||||||
|
"Welcome to your Topic Modeling Assessment! For this project you will be working with a dataset of over 400,000 quora questions that have no labeled cateogry, and attempting to find 20 cateogries to assign these questions to. The .csv file of these text questions can be found underneath the Topic-Modeling folder.\n",
|
||||||
|
"\n",
|
||||||
|
"Remember you can always check the solutions notebook and video lecture for any questions."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"#### Task: Import pandas and read in the quora_questions.csv file."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 52,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 53,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>Question</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>What is the step by step guide to invest in sh...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>What is the story of Kohinoor (Koh-i-Noor) Dia...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>How can I increase the speed of my internet co...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>Why am I mentally very lonely? How can I solve...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>Which one dissolve in water quikly sugar, salt...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" Question\n",
|
||||||
|
"0 What is the step by step guide to invest in sh...\n",
|
||||||
|
"1 What is the story of Kohinoor (Koh-i-Noor) Dia...\n",
|
||||||
|
"2 How can I increase the speed of my internet co...\n",
|
||||||
|
"3 Why am I mentally very lonely? How can I solve...\n",
|
||||||
|
"4 Which one dissolve in water quikly sugar, salt..."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 53,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Preprocessing\n",
|
||||||
|
"\n",
|
||||||
|
"#### Task: Use TF-IDF Vectorization to create a vectorized document term matrix. You may want to explore the max_df and min_df parameters."
|
||||||
|
]
|
||||||
|
},
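  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*Added hint (a sketch only; the exact parameter values in the course solution may differ):* TfidfVectorizer can drop very common terms with max_df, very rare terms with min_df, and English stop words."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative configuration (uncomment and adapt):\n",
    "# from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "# tfidf = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')\n",
    "# dtm = tfidf.fit_transform(df['Question'])"
   ]
  },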
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 40,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 41,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 42,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 43,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"<404289x38669 sparse matrix of type '<class 'numpy.float64'>'\n",
|
||||||
|
"\twith 2002912 stored elements in Compressed Sparse Row format>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 43,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Non-negative Matrix Factorization\n",
|
||||||
|
"\n",
|
||||||
|
"#### TASK: Using Scikit-Learn create an instance of NMF with 20 expected components. (Use random_state=42).."
|
||||||
|
]
|
||||||
|
},
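  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*Added hint (sketch only, assuming the document term matrix from the previous step is named dtm):* the requested instance is simply NMF with n_components=20 and random_state=42, fit to that matrix."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Uncomment and adapt:\n",
    "# from sklearn.decomposition import NMF\n",
    "# nmf_model = NMF(n_components=20, random_state=42)\n",
    "# nmf_model.fit(dtm)"
   ]
  },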
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 44,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 48,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 49,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,\n",
|
||||||
|
" n_components=20, random_state=42, shuffle=False, solver='cd', tol=0.0001,\n",
|
||||||
|
" verbose=0)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 49,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"#### TASK: Print our the top 15 most common words for each of the 20 topics."
|
||||||
|
]
|
||||||
|
},
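  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*Added hint (sketch only, assuming the tfidf and nmf_model names from the sketches above):* each row of nmf_model.components_ scores every vocabulary term for one topic, so the 15 largest entries per row give that topic's top words."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Typical top-words loop (uncomment and adapt):\n",
    "# for index, topic in enumerate(nmf_model.components_):\n",
    "#     print(f'THE TOP 15 WORDS FOR TOPIC #{index}')\n",
    "#     print([tfidf.get_feature_names()[i] for i in topic.argsort()[-15:]])\n",
    "#     print('\\n')"
   ]
  },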
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 50,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #0\n",
|
||||||
|
"['thing', 'read', 'place', 'visit', 'places', 'phone', 'buy', 'laptop', 'movie', 'ways', '2016', 'books', 'book', 'movies', 'best']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #1\n",
|
||||||
|
"['majors', 'recruit', 'sex', 'looking', 'differ', 'use', 'exist', 'really', 'compare', 'cost', 'long', 'feel', 'work', 'mean', 'does']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #2\n",
|
||||||
|
"['add', 'answered', 'needing', 'post', 'easily', 'improvement', 'delete', 'asked', 'google', 'answers', 'answer', 'ask', 'question', 'questions', 'quora']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #3\n",
|
||||||
|
"['using', 'website', 'investment', 'friends', 'black', 'internet', 'free', 'home', 'easy', 'youtube', 'ways', 'earn', 'online', 'make', 'money']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #4\n",
|
||||||
|
"['balance', 'earth', 'day', 'death', 'changed', 'live', 'want', 'change', 'moment', 'real', 'important', 'thing', 'meaning', 'purpose', 'life']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #5\n",
|
||||||
|
"['reservation', 'engineering', 'minister', 'president', 'company', 'china', 'business', 'country', 'olympics', 'available', 'job', 'spotify', 'war', 'pakistan', 'india']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #6\n",
|
||||||
|
"['beginners', 'online', 'english', 'book', 'did', 'hacking', 'want', 'python', 'languages', 'java', 'learning', 'start', 'language', 'programming', 'learn']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #7\n",
|
||||||
|
"['happen', 'presidency', 'think', 'presidential', '2016', 'vote', 'better', 'election', 'did', 'win', 'hillary', 'president', 'clinton', 'donald', 'trump']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #8\n",
|
||||||
|
"['russia', 'business', 'win', 'coming', 'countries', 'place', 'pakistan', 'happen', 'end', 'country', 'iii', 'start', 'did', 'war', 'world']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #9\n",
|
||||||
|
"['indian', 'companies', 'don', 'guy', 'men', 'culture', 'women', 'work', 'girls', 'live', 'girl', 'look', 'sex', 'feel', 'like']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #10\n",
|
||||||
|
"['ca', 'departments', 'positions', 'movies', 'songs', 'business', 'read', 'start', 'job', 'work', 'engineering', 'ways', 'bad', 'books', 'good']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #11\n",
|
||||||
|
"['money', 'modi', 'currency', 'economy', 'think', 'government', 'ban', 'banning', 'black', 'indian', 'rupee', 'rs', '1000', 'notes', '500']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #12\n",
|
||||||
|
"['blowing', 'resolutions', 'resolution', 'mind', 'likes', 'girl', '2017', 'year', 'don', 'employees', 'going', 'day', 'things', 'new', 'know']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #13\n",
|
||||||
|
"['aspects', 'fluent', 'skill', 'spoken', 'ways', 'language', 'fluently', 'speak', 'communication', 'pronunciation', 'speaking', 'writing', 'skills', 'improve', 'english']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #14\n",
|
||||||
|
"['diet', 'help', 'healthy', 'exercise', 'month', 'pounds', 'reduce', 'quickly', 'loss', 'fast', 'fat', 'ways', 'gain', 'lose', 'weight']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #15\n",
|
||||||
|
"['having', 'feel', 'long', 'spend', 'did', 'person', 'machine', 'movies', 'favorite', 'job', 'home', 'sex', 'possible', 'travel', 'time']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #16\n",
|
||||||
|
"['marriage', 'make', 'did', 'girlfriend', 'feel', 'tell', 'forget', 'really', 'friend', 'true', 'know', 'person', 'girl', 'fall', 'love']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #17\n",
|
||||||
|
"['easy', 'hack', 'prepare', 'quickest', 'facebook', 'increase', 'painless', 'instagram', 'account', 'best', 'commit', 'fastest', 'suicide', 'easiest', 'way']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #18\n",
|
||||||
|
"['web', 'java', 'scripting', 'phone', 'mechanical', 'better', 'job', 'use', 'account', 'data', 'software', 'science', 'computer', 'engineering', 'difference']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #19\n",
|
||||||
|
"['earth', 'blowing', 'stop', 'use', 'easily', 'mind', 'google', 'flat', 'questions', 'hate', 'believe', 'ask', 'don', 'think', 'people']\n",
|
||||||
|
"\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"#### TASK: Add a new column to the original quora dataframe that labels each question into one of the 20 topic categories."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 54,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>Question</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>What is the step by step guide to invest in sh...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>What is the story of Kohinoor (Koh-i-Noor) Dia...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>How can I increase the speed of my internet co...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>Why am I mentally very lonely? How can I solve...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>Which one dissolve in water quikly sugar, salt...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" Question\n",
|
||||||
|
"0 What is the step by step guide to invest in sh...\n",
|
||||||
|
"1 What is the story of Kohinoor (Koh-i-Noor) Dia...\n",
|
||||||
|
"2 How can I increase the speed of my internet co...\n",
|
||||||
|
"3 Why am I mentally very lonely? How can I solve...\n",
|
||||||
|
"4 Which one dissolve in water quikly sugar, salt..."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 54,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 55,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 56,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>Question</th>\n",
|
||||||
|
" <th>Topic</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>What is the step by step guide to invest in sh...</td>\n",
|
||||||
|
" <td>5</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>What is the story of Kohinoor (Koh-i-Noor) Dia...</td>\n",
|
||||||
|
" <td>16</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>How can I increase the speed of my internet co...</td>\n",
|
||||||
|
" <td>17</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>Why am I mentally very lonely? How can I solve...</td>\n",
|
||||||
|
" <td>11</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>Which one dissolve in water quikly sugar, salt...</td>\n",
|
||||||
|
" <td>14</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>5</th>\n",
|
||||||
|
" <td>Astrology: I am a Capricorn Sun Cap moon and c...</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>6</th>\n",
|
||||||
|
" <td>Should I buy tiago?</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>7</th>\n",
|
||||||
|
" <td>How can I be a good geologist?</td>\n",
|
||||||
|
" <td>10</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>8</th>\n",
|
||||||
|
" <td>When do you use シ instead of し?</td>\n",
|
||||||
|
" <td>19</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>9</th>\n",
|
||||||
|
" <td>Motorola (company): Can I hack my Charter Moto...</td>\n",
|
||||||
|
" <td>17</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" Question Topic\n",
|
||||||
|
"0 What is the step by step guide to invest in sh... 5\n",
|
||||||
|
"1 What is the story of Kohinoor (Koh-i-Noor) Dia... 16\n",
|
||||||
|
"2 How can I increase the speed of my internet co... 17\n",
|
||||||
|
"3 Why am I mentally very lonely? How can I solve... 11\n",
|
||||||
|
"4 Which one dissolve in water quikly sugar, salt... 14\n",
|
||||||
|
"5 Astrology: I am a Capricorn Sun Cap moon and c... 1\n",
|
||||||
|
"6 Should I buy tiago? 0\n",
|
||||||
|
"7 How can I be a good geologist? 10\n",
|
||||||
|
"8 When do you use シ instead of し? 19\n",
|
||||||
|
"9 Motorola (company): Can I hack my Charter Moto... 17"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 56,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Great job!"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.6"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
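The assessment notebook above ends with its code cells left blank; the solutions notebook later in this commit fills them in. For reference, a minimal standalone sketch of the same pipeline is shown below. It assumes quora_questions.csv sits in the working directory with a 'Question' column as in the head() output above, and scikit-learn >= 1.0 for get_feature_names_out (older versions use get_feature_names instead); parameter choices such as max_df=0.95 and min_df=2 are just one reasonable setting.

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# Read the questions and build a TF-IDF document term matrix,
# dropping very common terms, very rare terms, and English stop words
quora = pd.read_csv('quora_questions.csv')
tfidf = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = tfidf.fit_transform(quora['Question'])

# Factorize the matrix into 20 topics
nmf_model = NMF(n_components=20, random_state=42)
nmf_model.fit(dtm)

# Print the 15 highest-weighted words for each topic
feature_names = tfidf.get_feature_names_out()
for index, topic in enumerate(nmf_model.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([feature_names[i] for i in topic.argsort()[-15:]])
    print('\n')

# Label every question with its dominant topic
topic_results = nmf_model.transform(dtm)
quora['Topic'] = topic_results.argmax(axis=1)
print(quora.head(10))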
@ -0,0 +1,576 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"# Topic Modeling Assessment Project"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"#### Task: Import pandas and read in the quora_questions.csv file."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import pandas as pd"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 52,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"quora = pd.read_csv('quora_questions.csv')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 53,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>Question</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>What is the step by step guide to invest in sh...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>What is the story of Kohinoor (Koh-i-Noor) Dia...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>How can I increase the speed of my internet co...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>Why am I mentally very lonely? How can I solve...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>Which one dissolve in water quikly sugar, salt...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" Question\n",
|
||||||
|
"0 What is the step by step guide to invest in sh...\n",
|
||||||
|
"1 What is the story of Kohinoor (Koh-i-Noor) Dia...\n",
|
||||||
|
"2 How can I increase the speed of my internet co...\n",
|
||||||
|
"3 Why am I mentally very lonely? How can I solve...\n",
|
||||||
|
"4 Which one dissolve in water quikly sugar, salt..."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 53,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"quora.head()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Preprocessing\n",
|
||||||
|
"\n",
|
||||||
|
"#### Task: Use TF-IDF Vectorization to create a vectorized document term matrix. You may want to explore the max_df and min_df parameters."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 40,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.feature_extraction.text import TfidfVectorizer"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 41,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"tfidf = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 42,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"dtm = tfidf.fit_transform(quora['Question'])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 43,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"<404289x38669 sparse matrix of type '<class 'numpy.float64'>'\n",
|
||||||
|
"\twith 2002912 stored elements in Compressed Sparse Row format>"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 43,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"dtm"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Non-negative Matrix Factorization\n",
|
||||||
|
"\n",
|
||||||
|
"#### TASK: Using Scikit-Learn create an instance of NMF with 20 expected components. (Use random_state=42).."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 44,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.decomposition import NMF"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 48,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"nmf_model = NMF(n_components=20,random_state=42)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 49,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,\n",
|
||||||
|
" n_components=20, random_state=42, shuffle=False, solver='cd', tol=0.0001,\n",
|
||||||
|
" verbose=0)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 49,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"nmf_model.fit(dtm)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"#### TASK: Print our the top 15 most common words for each of the 20 topics."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 50,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #0\n",
|
||||||
|
"['thing', 'read', 'place', 'visit', 'places', 'phone', 'buy', 'laptop', 'movie', 'ways', '2016', 'books', 'book', 'movies', 'best']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #1\n",
|
||||||
|
"['majors', 'recruit', 'sex', 'looking', 'differ', 'use', 'exist', 'really', 'compare', 'cost', 'long', 'feel', 'work', 'mean', 'does']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #2\n",
|
||||||
|
"['add', 'answered', 'needing', 'post', 'easily', 'improvement', 'delete', 'asked', 'google', 'answers', 'answer', 'ask', 'question', 'questions', 'quora']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #3\n",
|
||||||
|
"['using', 'website', 'investment', 'friends', 'black', 'internet', 'free', 'home', 'easy', 'youtube', 'ways', 'earn', 'online', 'make', 'money']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #4\n",
|
||||||
|
"['balance', 'earth', 'day', 'death', 'changed', 'live', 'want', 'change', 'moment', 'real', 'important', 'thing', 'meaning', 'purpose', 'life']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #5\n",
|
||||||
|
"['reservation', 'engineering', 'minister', 'president', 'company', 'china', 'business', 'country', 'olympics', 'available', 'job', 'spotify', 'war', 'pakistan', 'india']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #6\n",
|
||||||
|
"['beginners', 'online', 'english', 'book', 'did', 'hacking', 'want', 'python', 'languages', 'java', 'learning', 'start', 'language', 'programming', 'learn']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #7\n",
|
||||||
|
"['happen', 'presidency', 'think', 'presidential', '2016', 'vote', 'better', 'election', 'did', 'win', 'hillary', 'president', 'clinton', 'donald', 'trump']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #8\n",
|
||||||
|
"['russia', 'business', 'win', 'coming', 'countries', 'place', 'pakistan', 'happen', 'end', 'country', 'iii', 'start', 'did', 'war', 'world']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #9\n",
|
||||||
|
"['indian', 'companies', 'don', 'guy', 'men', 'culture', 'women', 'work', 'girls', 'live', 'girl', 'look', 'sex', 'feel', 'like']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #10\n",
|
||||||
|
"['ca', 'departments', 'positions', 'movies', 'songs', 'business', 'read', 'start', 'job', 'work', 'engineering', 'ways', 'bad', 'books', 'good']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #11\n",
|
||||||
|
"['money', 'modi', 'currency', 'economy', 'think', 'government', 'ban', 'banning', 'black', 'indian', 'rupee', 'rs', '1000', 'notes', '500']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #12\n",
|
||||||
|
"['blowing', 'resolutions', 'resolution', 'mind', 'likes', 'girl', '2017', 'year', 'don', 'employees', 'going', 'day', 'things', 'new', 'know']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #13\n",
|
||||||
|
"['aspects', 'fluent', 'skill', 'spoken', 'ways', 'language', 'fluently', 'speak', 'communication', 'pronunciation', 'speaking', 'writing', 'skills', 'improve', 'english']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #14\n",
|
||||||
|
"['diet', 'help', 'healthy', 'exercise', 'month', 'pounds', 'reduce', 'quickly', 'loss', 'fast', 'fat', 'ways', 'gain', 'lose', 'weight']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #15\n",
|
||||||
|
"['having', 'feel', 'long', 'spend', 'did', 'person', 'machine', 'movies', 'favorite', 'job', 'home', 'sex', 'possible', 'travel', 'time']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #16\n",
|
||||||
|
"['marriage', 'make', 'did', 'girlfriend', 'feel', 'tell', 'forget', 'really', 'friend', 'true', 'know', 'person', 'girl', 'fall', 'love']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #17\n",
|
||||||
|
"['easy', 'hack', 'prepare', 'quickest', 'facebook', 'increase', 'painless', 'instagram', 'account', 'best', 'commit', 'fastest', 'suicide', 'easiest', 'way']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #18\n",
|
||||||
|
"['web', 'java', 'scripting', 'phone', 'mechanical', 'better', 'job', 'use', 'account', 'data', 'software', 'science', 'computer', 'engineering', 'difference']\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"THE TOP 15 WORDS FOR TOPIC #19\n",
|
||||||
|
"['earth', 'blowing', 'stop', 'use', 'easily', 'mind', 'google', 'flat', 'questions', 'hate', 'believe', 'ask', 'don', 'think', 'people']\n",
|
||||||
|
"\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"for index,topic in enumerate(nmf_model.components_):\n",
|
||||||
|
" print(f'THE TOP 15 WORDS FOR TOPIC #{index}')\n",
|
||||||
|
" print([tfidf.get_feature_names()[i] for i in topic.argsort()[-15:]])\n",
|
||||||
|
" print('\\n')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"#### TASK: Add a new column to the original quora dataframe that labels each question into one of the 20 topic categories."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 54,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>Question</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>What is the step by step guide to invest in sh...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>What is the story of Kohinoor (Koh-i-Noor) Dia...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>How can I increase the speed of my internet co...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>Why am I mentally very lonely? How can I solve...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>Which one dissolve in water quikly sugar, salt...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" Question\n",
|
||||||
|
"0 What is the step by step guide to invest in sh...\n",
|
||||||
|
"1 What is the story of Kohinoor (Koh-i-Noor) Dia...\n",
|
||||||
|
"2 How can I increase the speed of my internet co...\n",
|
||||||
|
"3 Why am I mentally very lonely? How can I solve...\n",
|
||||||
|
"4 Which one dissolve in water quikly sugar, salt..."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 54,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"quora.head()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 55,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"topic_results = nmf_model.transform(dtm)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 56,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>Question</th>\n",
|
||||||
|
" <th>Topic</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>What is the step by step guide to invest in sh...</td>\n",
|
||||||
|
" <td>5</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>What is the story of Kohinoor (Koh-i-Noor) Dia...</td>\n",
|
||||||
|
" <td>16</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>How can I increase the speed of my internet co...</td>\n",
|
||||||
|
" <td>17</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>Why am I mentally very lonely? How can I solve...</td>\n",
|
||||||
|
" <td>11</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>Which one dissolve in water quikly sugar, salt...</td>\n",
|
||||||
|
" <td>14</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>5</th>\n",
|
||||||
|
" <td>Astrology: I am a Capricorn Sun Cap moon and c...</td>\n",
|
||||||
|
" <td>1</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>6</th>\n",
|
||||||
|
" <td>Should I buy tiago?</td>\n",
|
||||||
|
" <td>0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>7</th>\n",
|
||||||
|
" <td>How can I be a good geologist?</td>\n",
|
||||||
|
" <td>10</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>8</th>\n",
|
||||||
|
" <td>When do you use シ instead of し?</td>\n",
|
||||||
|
" <td>19</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>9</th>\n",
|
||||||
|
" <td>Motorola (company): Can I hack my Charter Moto...</td>\n",
|
||||||
|
" <td>17</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" Question Topic\n",
|
||||||
|
"0 What is the step by step guide to invest in sh... 5\n",
|
||||||
|
"1 What is the story of Kohinoor (Koh-i-Noor) Dia... 16\n",
|
||||||
|
"2 How can I increase the speed of my internet co... 17\n",
|
||||||
|
"3 Why am I mentally very lonely? How can I solve... 11\n",
|
||||||
|
"4 Which one dissolve in water quikly sugar, salt... 14\n",
|
||||||
|
"5 Astrology: I am a Capricorn Sun Cap moon and c... 1\n",
|
||||||
|
"6 Should I buy tiago? 0\n",
|
||||||
|
"7 How can I be a good geologist? 10\n",
|
||||||
|
"8 When do you use シ instead of し? 19\n",
|
||||||
|
"9 Motorola (company): Can I hack my Charter Moto... 17"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 56,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"topic_results.argmax(axis=1)\n",
|
||||||
|
"\n",
|
||||||
|
"quora['Topic'] = topic_results.argmax(axis=1)\n",
|
||||||
|
"\n",
|
||||||
|
"quora.head(10)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Great job!"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.6"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
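Once each question carries a numeric Topic label, it can also help to attach short human-readable names. This step is not part of the notebook above; the topic_label dictionary below is a hypothetical mapping read off the top-15 word lists printed earlier, and the snippet continues from the quora dataframe and Topic column created there.

# Hypothetical names for a few topics, based on the printed top-15 word lists
topic_label = {0: 'recommendations / best of', 3: 'making money online',
               7: '2016 US election', 11: 'Indian demonetization',
               14: 'weight loss', 16: 'love and relationships'}

# Unnamed topics fall back to their numeric index
quora['Topic Label'] = quora['Topic'].map(topic_label).fillna(quora['Topic'].astype(str))
print(quora[['Question', 'Topic', 'Topic Label']].head(10))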
11993
Praktikum Python Code/05-Topic-Modeling/npr.csv
Normal file
File diff suppressed because one or more lines are too long
404294
Praktikum Python Code/05-Topic-Modeling/quora_questions.csv
Normal file
File diff suppressed because it is too large
2137
Praktikum Python Code/06-Deep-Learning/00-Keras-Basics.ipynb
Normal file
File diff suppressed because it is too large
@ -0,0 +1,773 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"___\n",
|
||||||
|
"\n",
|
||||||
|
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
|
||||||
|
"___\n",
|
||||||
|
"# Text Generation with Neural Networks"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Functions for Processing Text\n",
|
||||||
|
"\n",
|
||||||
|
"### Reading in files as a string text"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def read_file(filepath):\n",
|
||||||
|
" \n",
|
||||||
|
" with open(filepath) as f:\n",
|
||||||
|
" str_text = f.read()\n",
|
||||||
|
" \n",
|
||||||
|
" return str_text"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"read_file('moby_dick_four_chapters.txt')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"### Tokenize and Clean Text"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import spacy\n",
|
||||||
|
"nlp = spacy.load('en',disable=['parser', 'tagger','ner'])\n",
|
||||||
|
"\n",
|
||||||
|
"nlp.max_length = 1198623"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def separate_punc(doc_text):\n",
|
||||||
|
" return [token.text.lower() for token in nlp(doc_text) if token.text not in '\\n\\n \\n\\n\\n!\"-#$%&()--.*+,-/:;<=>?@[\\\\]^_`{|}~\\t\\n ']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"d = read_file('melville-moby_dick.txt')\n",
|
||||||
|
"tokens = separate_punc(d)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"tokens"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"len(tokens)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"4431/25"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Create Sequences of Tokens"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# organize into sequences of tokens\n",
|
||||||
|
"train_len = 25+1 # 50 training words , then one target word\n",
|
||||||
|
"\n",
|
||||||
|
"# Empty list of sequences\n",
|
||||||
|
"text_sequences = []\n",
|
||||||
|
"\n",
|
||||||
|
"for i in range(train_len, len(tokens)):\n",
|
||||||
|
" \n",
|
||||||
|
" # Grab train_len# amount of characters\n",
|
||||||
|
" seq = tokens[i-train_len:i]\n",
|
||||||
|
" \n",
|
||||||
|
" # Add to list of sequences\n",
|
||||||
|
" text_sequences.append(seq)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"' '.join(text_sequences[0])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"' '.join(text_sequences[1])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"' '.join(text_sequences[2])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"len(text_sequences)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Keras"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Keras Tokenization"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from keras.preprocessing.text import Tokenizer"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# integer encode sequences of words\n",
|
||||||
|
"tokenizer = Tokenizer()\n",
|
||||||
|
"tokenizer.fit_on_texts(text_sequences)\n",
|
||||||
|
"sequences = tokenizer.texts_to_sequences(text_sequences)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"sequences[0]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"tokenizer.index_word"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"for i in sequences[0]:\n",
|
||||||
|
" print(f'{i} : {tokenizer.index_word[i]}')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"tokenizer.word_counts"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"vocabulary_size = len(tokenizer.word_counts)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Convert to Numpy Matrix"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import numpy as np"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"sequences = np.array(sequences)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"sequences"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Creating an LSTM based model"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import keras\n",
|
||||||
|
"from keras.models import Sequential\n",
|
||||||
|
"from keras.layers import Dense,LSTM,Embedding"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def create_model(vocabulary_size, seq_len):\n",
|
||||||
|
" model = Sequential()\n",
|
||||||
|
" model.add(Embedding(vocabulary_size, 25, input_length=seq_len))\n",
|
||||||
|
" model.add(LSTM(150, return_sequences=True))\n",
|
||||||
|
" model.add(LSTM(150))\n",
|
||||||
|
" model.add(Dense(150, activation='relu'))\n",
|
||||||
|
"\n",
|
||||||
|
" model.add(Dense(vocabulary_size, activation='softmax'))\n",
|
||||||
|
" \n",
|
||||||
|
" model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
|
||||||
|
" \n",
|
||||||
|
" model.summary()\n",
|
||||||
|
" \n",
|
||||||
|
" return model"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Train / Test Split"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from keras.utils import to_categorical"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"sequences"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# First 49 words\n",
|
||||||
|
"sequences[:,:-1]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# last Word\n",
|
||||||
|
"sequences[:,-1]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"X = sequences[:,:-1]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"y = sequences[:,-1]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"y = to_categorical(y, num_classes=vocabulary_size+1)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"seq_len = X.shape[1]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"seq_len"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Training the Model"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# define model\n",
|
||||||
|
"model = create_model(vocabulary_size+1, seq_len)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"---\n",
|
||||||
|
"\n",
|
||||||
|
"----"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from pickle import dump,load"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# fit model\n",
|
||||||
|
"model.fit(X, y, batch_size=128, epochs=300,verbose=1)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true,
|
||||||
|
"scrolled": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# save the model to file\n",
|
||||||
|
"model.save('epochBIG.h5')\n",
|
||||||
|
"# save the tokenizer\n",
|
||||||
|
"dump(tokenizer, open('epochBIG', 'wb'))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Generating New Text"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from random import randint\n",
|
||||||
|
"from pickle import load\n",
|
||||||
|
"from keras.models import load_model\n",
|
||||||
|
"from keras.preprocessing.sequence import pad_sequences"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):\n",
|
||||||
|
" '''\n",
|
||||||
|
" INPUTS:\n",
|
||||||
|
" model : model that was trained on text data\n",
|
||||||
|
" tokenizer : tokenizer that was fit on text data\n",
|
||||||
|
" seq_len : length of training sequence\n",
|
||||||
|
" seed_text : raw string text to serve as the seed\n",
|
||||||
|
" num_gen_words : number of words to be generated by model\n",
|
||||||
|
" '''\n",
|
||||||
|
" \n",
|
||||||
|
" # Final Output\n",
|
||||||
|
" output_text = []\n",
|
||||||
|
" \n",
|
||||||
|
" # Intial Seed Sequence\n",
|
||||||
|
" input_text = seed_text\n",
|
||||||
|
" \n",
|
||||||
|
" # Create num_gen_words\n",
|
||||||
|
" for i in range(num_gen_words):\n",
|
||||||
|
" \n",
|
||||||
|
" # Take the input text string and encode it to a sequence\n",
|
||||||
|
" encoded_text = tokenizer.texts_to_sequences([input_text])[0]\n",
|
||||||
|
" \n",
|
||||||
|
" # Pad sequences to our trained rate (50 words in the video)\n",
|
||||||
|
" pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')\n",
|
||||||
|
" \n",
|
||||||
|
" # Predict Class Probabilities for each word\n",
|
||||||
|
" pred_word_ind = model.predict_classes(pad_encoded, verbose=0)[0]\n",
|
||||||
|
" \n",
|
||||||
|
" # Grab word\n",
|
||||||
|
" pred_word = tokenizer.index_word[pred_word_ind] \n",
|
||||||
|
" \n",
|
||||||
|
" # Update the sequence of input text (shifting one over with the new word)\n",
|
||||||
|
" input_text += ' ' + pred_word\n",
|
||||||
|
" \n",
|
||||||
|
" output_text.append(pred_word)\n",
|
||||||
|
" \n",
|
||||||
|
" # Make it look like a sentence.\n",
|
||||||
|
" return ' '.join(output_text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Grab a random seed sequence"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"text_sequences[0]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import random\n",
|
||||||
|
"random.seed(101)\n",
|
||||||
|
"random_pick = random.randint(0,len(text_sequences))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"random_seed_text = text_sequences[random_pick]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"random_seed_text"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"seed_text = ' '.join(random_seed_text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"seed_text"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=50)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Exploring Generated Sequence"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"full_text = read_file('moby_dick_four_chapters.txt')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"for i,word in enumerate(full_text.split()):\n",
|
||||||
|
" if word == 'inkling':\n",
|
||||||
|
" print(' '.join(full_text.split()[i-20:i+20]))\n",
|
||||||
|
" print('\\n')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Great Job!"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.6.6"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
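One portability note on the text generation notebook above: generate_text calls model.predict_classes, which only exists in older standalone Keras / TensorFlow 1.x-era builds and has been removed from recent tf.keras releases. On newer versions the same prediction step can be written with numpy, keeping the rest of the function unchanged (variable names follow the notebook):

import numpy as np

# Equivalent of: pred_word_ind = model.predict_classes(pad_encoded, verbose=0)[0]
pred_probs = model.predict(pad_encoded, verbose=0)       # shape: (1, vocabulary_size + 1)
pred_word_ind = int(np.argmax(pred_probs, axis=-1)[0])   # index of the most likely next word
pred_word = tokenizer.index_word[pred_word_ind]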
1938
Praktikum Python Code/06-Deep-Learning/02-Chat-Bots.ipynb
Normal file
File diff suppressed because one or more lines are too long
BIN
Praktikum Python Code/06-Deep-Learning/chatbot_10.h5
Normal file
Binary file not shown.
BIN
Praktikum Python Code/06-Deep-Learning/chatbot_120_epochs.h5
Normal file
Binary file not shown.
BIN
Praktikum Python Code/06-Deep-Learning/epoch250
Normal file
Binary file not shown.
BIN
Praktikum Python Code/06-Deep-Learning/epoch250.h5
Normal file
Binary file not shown.
BIN
Praktikum Python Code/06-Deep-Learning/epochBIG
Normal file
Binary file not shown.
BIN
Praktikum Python Code/06-Deep-Learning/epochBIG.h5
Normal file
Binary file not shown.
151
Praktikum Python Code/06-Deep-Learning/iris.csv
Normal file
@ -0,0 +1,151 @@
|
|||||||
|
sepal_length,sepal_width,petal_length,petal_width,species
5.1,3.5,1.4,0.2,setosa
4.9,3,1.4,0.2,setosa
4.7,3.2,1.3,0.2,setosa
4.6,3.1,1.5,0.2,setosa
5,3.6,1.4,0.2,setosa
5.4,3.9,1.7,0.4,setosa
4.6,3.4,1.4,0.3,setosa
5,3.4,1.5,0.2,setosa
4.4,2.9,1.4,0.2,setosa
4.9,3.1,1.5,0.1,setosa
5.4,3.7,1.5,0.2,setosa
4.8,3.4,1.6,0.2,setosa
4.8,3,1.4,0.1,setosa
4.3,3,1.1,0.1,setosa
5.8,4,1.2,0.2,setosa
5.7,4.4,1.5,0.4,setosa
5.4,3.9,1.3,0.4,setosa
5.1,3.5,1.4,0.3,setosa
5.7,3.8,1.7,0.3,setosa
5.1,3.8,1.5,0.3,setosa
5.4,3.4,1.7,0.2,setosa
5.1,3.7,1.5,0.4,setosa
4.6,3.6,1,0.2,setosa
5.1,3.3,1.7,0.5,setosa
4.8,3.4,1.9,0.2,setosa
5,3,1.6,0.2,setosa
5,3.4,1.6,0.4,setosa
5.2,3.5,1.5,0.2,setosa
5.2,3.4,1.4,0.2,setosa
4.7,3.2,1.6,0.2,setosa
4.8,3.1,1.6,0.2,setosa
5.4,3.4,1.5,0.4,setosa
5.2,4.1,1.5,0.1,setosa
5.5,4.2,1.4,0.2,setosa
4.9,3.1,1.5,0.1,setosa
5,3.2,1.2,0.2,setosa
5.5,3.5,1.3,0.2,setosa
4.9,3.1,1.5,0.1,setosa
4.4,3,1.3,0.2,setosa
5.1,3.4,1.5,0.2,setosa
5,3.5,1.3,0.3,setosa
4.5,2.3,1.3,0.3,setosa
4.4,3.2,1.3,0.2,setosa
5,3.5,1.6,0.6,setosa
5.1,3.8,1.9,0.4,setosa
4.8,3,1.4,0.3,setosa
5.1,3.8,1.6,0.2,setosa
4.6,3.2,1.4,0.2,setosa
5.3,3.7,1.5,0.2,setosa
5,3.3,1.4,0.2,setosa
7,3.2,4.7,1.4,versicolor
6.4,3.2,4.5,1.5,versicolor
6.9,3.1,4.9,1.5,versicolor
5.5,2.3,4,1.3,versicolor
6.5,2.8,4.6,1.5,versicolor
5.7,2.8,4.5,1.3,versicolor
6.3,3.3,4.7,1.6,versicolor
4.9,2.4,3.3,1,versicolor
6.6,2.9,4.6,1.3,versicolor
5.2,2.7,3.9,1.4,versicolor
5,2,3.5,1,versicolor
5.9,3,4.2,1.5,versicolor
6,2.2,4,1,versicolor
6.1,2.9,4.7,1.4,versicolor
5.6,2.9,3.6,1.3,versicolor
6.7,3.1,4.4,1.4,versicolor
5.6,3,4.5,1.5,versicolor
5.8,2.7,4.1,1,versicolor
6.2,2.2,4.5,1.5,versicolor
5.6,2.5,3.9,1.1,versicolor
5.9,3.2,4.8,1.8,versicolor
6.1,2.8,4,1.3,versicolor
6.3,2.5,4.9,1.5,versicolor
6.1,2.8,4.7,1.2,versicolor
6.4,2.9,4.3,1.3,versicolor
6.6,3,4.4,1.4,versicolor
6.8,2.8,4.8,1.4,versicolor
6.7,3,5,1.7,versicolor
6,2.9,4.5,1.5,versicolor
5.7,2.6,3.5,1,versicolor
5.5,2.4,3.8,1.1,versicolor
5.5,2.4,3.7,1,versicolor
5.8,2.7,3.9,1.2,versicolor
6,2.7,5.1,1.6,versicolor
5.4,3,4.5,1.5,versicolor
6,3.4,4.5,1.6,versicolor
6.7,3.1,4.7,1.5,versicolor
6.3,2.3,4.4,1.3,versicolor
5.6,3,4.1,1.3,versicolor
5.5,2.5,4,1.3,versicolor
5.5,2.6,4.4,1.2,versicolor
6.1,3,4.6,1.4,versicolor
5.8,2.6,4,1.2,versicolor
5,2.3,3.3,1,versicolor
5.6,2.7,4.2,1.3,versicolor
5.7,3,4.2,1.2,versicolor
5.7,2.9,4.2,1.3,versicolor
6.2,2.9,4.3,1.3,versicolor
5.1,2.5,3,1.1,versicolor
5.7,2.8,4.1,1.3,versicolor
6.3,3.3,6,2.5,virginica
5.8,2.7,5.1,1.9,virginica
7.1,3,5.9,2.1,virginica
6.3,2.9,5.6,1.8,virginica
6.5,3,5.8,2.2,virginica
7.6,3,6.6,2.1,virginica
4.9,2.5,4.5,1.7,virginica
7.3,2.9,6.3,1.8,virginica
6.7,2.5,5.8,1.8,virginica
7.2,3.6,6.1,2.5,virginica
6.5,3.2,5.1,2,virginica
6.4,2.7,5.3,1.9,virginica
6.8,3,5.5,2.1,virginica
5.7,2.5,5,2,virginica
5.8,2.8,5.1,2.4,virginica
6.4,3.2,5.3,2.3,virginica
6.5,3,5.5,1.8,virginica
7.7,3.8,6.7,2.2,virginica
7.7,2.6,6.9,2.3,virginica
6,2.2,5,1.5,virginica
6.9,3.2,5.7,2.3,virginica
5.6,2.8,4.9,2,virginica
7.7,2.8,6.7,2,virginica
6.3,2.7,4.9,1.8,virginica
6.7,3.3,5.7,2.1,virginica
7.2,3.2,6,1.8,virginica
6.2,2.8,4.8,1.8,virginica
6.1,3,4.9,1.8,virginica
6.4,2.8,5.6,2.1,virginica
7.2,3,5.8,1.6,virginica
7.4,2.8,6.1,1.9,virginica
7.9,3.8,6.4,2,virginica
6.4,2.8,5.6,2.2,virginica
6.3,2.8,5.1,1.5,virginica
6.1,2.6,5.6,1.4,virginica
7.7,3,6.1,2.3,virginica
6.3,3.4,5.6,2.4,virginica
6.4,3.1,5.5,1.8,virginica
6,3,4.8,1.8,virginica
6.9,3.1,5.4,2.1,virginica
6.7,3.1,5.6,2.4,virginica
6.9,3.1,5.1,2.3,virginica
5.8,2.7,5.1,1.9,virginica
6.8,3.2,5.9,2.3,virginica
6.7,3.3,5.7,2.5,virginica
6.7,3,5.2,2.3,virginica
6.3,2.5,5,1.9,virginica
6.5,3,5.2,2,virginica
6.2,3.4,5.4,2.3,virginica
5.9,3,5.1,1.8,virginica
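The rows above are the standard three-class iris measurements used by the deep-learning exercises in this folder. As a quick, hedged orientation (this snippet is not part of the committed notebooks, and the relative path 'iris.csv' is an assumption about where the script is run), loading the file with pandas and one-hot encoding the species label could look like:

# Minimal sketch, not part of the commit: load iris.csv and split features from labels.
import pandas as pd

df = pd.read_csv('iris.csv')                    # assumed relative path to the file above
X = df.drop('species', axis=1).values           # the four numeric measurements
y = pd.get_dummies(df['species']).values        # one-hot labels: setosa / versicolor / virginica
print(df.shape, X.shape, y.shape)               # expected: (150, 5) (150, 4) (150, 3)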
22423  Praktikum Python Code/06-Deep-Learning/melville-moby_dick.txt  Normal file
File diff suppressed because it is too large
1072  Praktikum Python Code/06-Deep-Learning/moby_dick_four_chapters.txt  Normal file
File diff suppressed because it is too large
BIN  Praktikum Python Code/06-Deep-Learning/model6.h5  Normal file
Binary file not shown.
BIN  Praktikum Python Code/06-Deep-Learning/myfirstmodel.h5  Normal file
Binary file not shown.
BIN  Praktikum Python Code/06-Deep-Learning/simple  Normal file
Binary file not shown.
BIN  Praktikum Python Code/06-Deep-Learning/simple.h5  Normal file
Binary file not shown.
BIN  Praktikum Python Code/06-Deep-Learning/test_qa.txt  Normal file
Binary file not shown.
BIN  Praktikum Python Code/06-Deep-Learning/train_qa.txt  Normal file
Binary file not shown.
File diff suppressed because it is too large
@ -0,0 +1,242 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"___\n",
"\n",
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
"___"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Working with Stop Words\n",
"In the previous sections we've seen how to access spaCy's 305 built-in stopwords, as well as the 318 stopword set available in scikit-learn. In this section we've outlined three tools (spaCy, NLTK and scikit-learn). Feel free to apply them individually or in combinations when cleaning your own text data."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# spaCy's built-in stopwords"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"305"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import spacy\n",
"nlp = spacy.load('en_core_web_sm')\n",
"stopwords = nlp.Defaults.stop_words\n",
"len(stopwords)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"set"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(stopwords)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amount', 'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around', 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between', 'beyond', 'both', 'bottom', 'but', 'by', 'ca', 'call', 'can', 'cannot', 'could', 'did', 'do', 'does', 'doing', 'done', 'down', 'due', 'during', 'each', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty', 'first', 'five', 'for', 'former', 'formerly', 'forty', 'four', 'from', 'front', 'full', 'further', 'get', 'give', 'go', 'had', 'has', 'have', 'he', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred', 'i', 'if', 'in', 'indeed', 'into', 'is', 'it', 'its', 'itself', 'just', 'keep', 'last', 'latter', 'latterly', 'least', 'less', 'made', 'make', 'many', 'may', 'me', 'meanwhile', 'might', 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must', 'my', 'myself', 'name', 'namely', 'neither', 'never', 'nevertheless', 'next', 'nine', 'no', 'nobody', 'none', 'noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of', 'off', 'often', 'on', 'once', 'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'part', 'per', 'perhaps', 'please', 'put', 'quite', 'rather', 're', 'really', 'regarding', 'same', 'say', 'see', 'seem', 'seemed', 'seeming', 'seems', 'serious', 'several', 'she', 'should', 'show', 'side', 'since', 'six', 'sixty', 'so', 'some', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhere', 'still', 'such', 'take', 'ten', 'than', 'that', 'the', 'their', 'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these', 'they', 'third', 'this', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'top', 'toward', 'towards', 'twelve', 'twenty', 'two', 'under', 'unless', 'until', 'up', 'upon', 'us', 'used', 'using', 'various', 'very', 'via', 'was', 'we', 'well', 'were', 'what', 'whatever', 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with', 'within', 'without', 'would', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves']\n"
]
}
],
"source": [
"print(sorted(list(stopwords)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# NLTK's built-in stopwords"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"179"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from nltk.corpus import stopwords\n",
"stopwords = stopwords.words('english')\n",
"len(stopwords)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"list"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(stopwords)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', \"aren't\", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', \"couldn't\", 'd', 'did', 'didn', \"didn't\", 'do', 'does', 'doesn', \"doesn't\", 'doing', 'don', \"don't\", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', \"hadn't\", 'has', 'hasn', \"hasn't\", 'have', 'haven', \"haven't\", 'having', 'he', 'her', 'here', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in', 'into', 'is', 'isn', \"isn't\", 'it', \"it's\", 'its', 'itself', 'just', 'll', 'm', 'ma', 'me', 'mightn', \"mightn't\", 'more', 'most', 'mustn', \"mustn't\", 'my', 'myself', 'needn', \"needn't\", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', \"shan't\", 'she', \"she's\", 'should', \"should've\", 'shouldn', \"shouldn't\", 'so', 'some', 'such', 't', 'than', 'that', \"that'll\", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there', 'these', 'they', 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 've', 'very', 'was', 'wasn', \"wasn't\", 'we', 'were', 'weren', \"weren't\", 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'won', \"won't\", 'wouldn', \"wouldn't\", 'y', 'you', \"you'd\", \"you'll\", \"you're\", \"you've\", 'your', 'yours', 'yourself', 'yourselves']\n"
]
}
],
"source": [
"print(sorted(stopwords))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Scikit-learn's built-in stopwords"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"318"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.feature_extraction import text\n",
"stopwords = text.ENGLISH_STOP_WORDS\n",
"len(stopwords)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"frozenset"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(stopwords)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around', 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both', 'bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant', 'co', 'con', 'could', 'couldnt', 'cry', 'de', 'describe', 'detail', 'do', 'done', 'down', 'due', 'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly', 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'get', 'give', 'go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed', 'interest', 'into', 'is', 'it', 'its', 'itself', 'keep', 'last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made', 'many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must', 'my', 'myself', 'name', 'namely', 'neither', 'never', 'nevertheless', 'next', 'nine', 'no', 'nobody', 'none', 'noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of', 'off', 'often', 'on', 'once', 'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'part', 'per', 'perhaps', 'please', 'put', 'rather', 're', 'same', 'see', 'seem', 'seemed', 'seeming', 'seems', 'serious', 'several', 'she', 'should', 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhere', 'still', 'such', 'system', 'take', 'ten', 'than', 'that', 'the', 'their', 'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these', 'they', 'thick', 'thin', 'third', 'this', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'top', 'toward', 'towards', 'twelve', 'twenty', 'two', 'un', 'under', 'until', 'up', 'upon', 'us', 'very', 'via', 'was', 'we', 'well', 'were', 'what', 'whatever', 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with', 'within', 'without', 'would', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves']\n"
]
}
],
"source": [
"print(sorted(list(stopwords)))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
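The notebook above only loads, counts, and prints the three stopword collections. As a hedged sketch (not part of the commit; it assumes the en_core_web_sm model and the NLTK stopwords corpus are already downloaded), the same three sources could be applied to actually filter a sentence:

# Minimal sketch, not part of the committed notebook: use each stopword source to clean text.
import spacy
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction import text as sklearn_text

sample = "This is a story about cats and our feline pets"

# spaCy: every token carries an is_stop flag
nlp = spacy.load('en_core_web_sm')
spacy_filtered = [t.text for t in nlp(sample) if not t.is_stop]

# NLTK: a plain Python list of 179 English stopwords
nltk_set = set(nltk_stopwords.words('english'))
nltk_filtered = [w for w in sample.lower().split() if w not in nltk_set]

# scikit-learn: a frozenset of 318 stopwords; CountVectorizer(stop_words='english') applies the same set
sk_filtered = [w for w in sample.lower().split() if w not in sklearn_text.ENGLISH_STOP_WORDS]

print(spacy_filtered)
print(nltk_filtered)
print(sk_filtered)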
BIN  Praktikum Python Code/Pierian_Data_Logo.png  Normal file (Size: 5.8 KiB)
Binary file not shown.
10001  Praktikum Python Code/TextFiles/amazonreviews.tsv  Normal file
File diff suppressed because it is too large
2819  Praktikum Python Code/TextFiles/elon_tweets.txt  Normal file
File diff suppressed because it is too large
2820  Praktikum Python Code/TextFiles/elon_tweets2.txt  Normal file
File diff suppressed because it is too large
11465  Praktikum Python Code/TextFiles/huckfinn.txt  Normal file
File diff suppressed because it is too large
64313  Praktikum Python Code/TextFiles/moviereviews.tsv  Normal file
File diff suppressed because it is too large
6001  Praktikum Python Code/TextFiles/moviereviews2.tsv  Normal file
File diff suppressed because it is too large
375  Praktikum Python Code/TextFiles/owlcreek.txt  Normal file
@ -0,0 +1,375 @@
AN OCCURRENCE AT OWL CREEK BRIDGE

by Ambrose Bierce

I

A man stood upon a railroad bridge in northern Alabama, looking down
into the swift water twenty feet below. The man's hands were behind
his back, the wrists bound with a cord. A rope closely encircled his
neck. It was attached to a stout cross-timber above his head and the
slack fell to the level of his knees. Some loose boards laid upon the
ties supporting the rails of the railway supplied a footing for him
and his executioners--two private soldiers of the Federal army,
directed by a sergeant who in civil life may have been a deputy
sheriff. At a short remove upon the same temporary platform was an
officer in the uniform of his rank, armed. He was a captain. A
sentinel at each end of the bridge stood with his rifle in the
position known as "support," that is to say, vertical in front of the
left shoulder, the hammer resting on the forearm thrown straight
across the chest--a formal and unnatural position, enforcing an erect
carriage of the body. It did not appear to be the duty of these two
men to know what was occurring at the center of the bridge; they
merely blockaded the two ends of the foot planking that traversed it.

Beyond one of the sentinels nobody was in sight; the railroad ran
straight away into a forest for a hundred yards, then, curving, was
lost to view. Doubtless there was an outpost farther along. The
other bank of the stream was open ground--a gentle slope topped with
a stockade of vertical tree trunks, loopholed for rifles, with a
single embrasure through which protruded the muzzle of a brass cannon
commanding the bridge. Midway up the slope between the bridge and
fort were the spectators--a single company of infantry in line, at
"parade rest," the butts of their rifles on the ground, the barrels
inclining slightly backward against the right shoulder, the hands
crossed upon the stock. A lieutenant stood at the right of the line,
the point of his sword upon the ground, his left hand resting upon his
right. Excepting the group of four at the center of the bridge, not a
man moved. The company faced the bridge, staring stonily, motionless.
The sentinels, facing the banks of the stream, might have been statues
to adorn the bridge. The captain stood with folded arms, silent,
observing the work of his subordinates, but making no sign. Death is a
dignitary who when he comes announced is to be received with formal
manifestations of respect, even by those most familiar with him. In
the code of military etiquette silence and fixity are forms of
deference.

The man who was engaged in being hanged was apparently about
thirty-five years of age. He was a civilian, if one might judge from
his habit, which was that of a planter. His features were good--a
straight nose, firm mouth, broad forehead, from which his long, dark
hair was combed straight back, falling behind his ears to the collar
of his well fitting frock coat. He wore a moustache and pointed
beard, but no whiskers; his eyes were large and dark gray, and had a
kindly expression which one would hardly have expected in one whose
neck was in the hemp. Evidently this was no vulgar assassin. The
liberal military code makes provision for hanging many kinds of
persons, and gentlemen are not excluded.

The preparations being complete, the two private soldiers stepped
aside and each drew away the plank upon which he had been standing.
The sergeant turned to the captain, saluted and placed himself
immediately behind that officer, who in turn moved apart one pace.
These movements left the condemned man and the sergeant standing on
the two ends of the same plank, which spanned three of the cross-ties
of the bridge. The end upon which the civilian stood almost, but not
quite, reached a fourth. This plank had been held in place by the
weight of the captain; it was now held by that of the sergeant. At a
signal from the former the latter would step aside, the plank would
tilt and the condemned man go down between two ties. The arrangement
commended itself to his judgement as simple and effective. His face
had not been covered nor his eyes bandaged. He looked a moment at his
"unsteadfast footing," then let his gaze wander to the swirling water
of the stream racing madly beneath his feet. A piece of dancing
driftwood caught his attention and his eyes followed it down the
current. How slowly it appeared to move! What a sluggish stream!

He closed his eyes in order to fix his last thoughts upon his wife and
children. The water, touched to gold by the early sun, the brooding
mists under the banks at some distance down the stream, the fort, the
soldiers, the piece of drift--all had distracted him. And now he
became conscious of a new disturbance. Striking through the thought
of his dear ones was sound which he could neither ignore nor
understand, a sharp, distinct, metallic percussion like the stroke of
a blacksmith's hammer upon the anvil; it had the same ringing quality.
He wondered what it was, and whether immeasurably distant or near by--
it seemed both. Its recurrence was regular, but as slow as the
tolling of a death knell. He awaited each new stroke with impatience
and--he knew not why--apprehension. The intervals of silence grew
progressively longer; the delays became maddening. With their greater
infrequency the sounds increased in strength and sharpness. They hurt
his ear like the thrust of a knife; he feared he would shriek. What he
heard was the ticking of his watch.

He unclosed his eyes and saw again the water below him. "If I could
free my hands," he thought, "I might throw off the noose and spring
into the stream. By diving I could evade the bullets and, swimming
vigorously, reach the bank, take to the woods and get away home. My
home, thank God, is as yet outside their lines; my wife and little
ones are still beyond the invader's farthest advance."

As these thoughts, which have here to be set down in words, were
flashed into the doomed man's brain rather than evolved from it the
captain nodded to the sergeant. The sergeant stepped aside.

II

Peyton Farquhar was a well to do planter, of an old and highly
respected Alabama family. Being a slave owner and like other slave
owners a politician, he was naturally an original secessionist and
ardently devoted to the Southern cause. Circumstances of an imperious
nature, which it is unnecessary to relate here, had prevented him from
taking service with that gallant army which had fought the disastrous
campaigns ending with the fall of Corinth, and he chafed under the
inglorious restraint, longing for the release of his energies, the
larger life of the soldier, the opportunity for distinction. That
opportunity, he felt, would come, as it comes to all in wartime.
Meanwhile he did what he could. No service was too humble for him to
perform in the aid of the South, no adventure too perilous for him to
undertake if consistent with the character of a civilian who was at
heart a soldier, and who in good faith and without too much
qualification assented to at least a part of the frankly villainous
dictum that all is fair in love and war.

One evening while Farquhar and his wife were sitting on a rustic bench
near the entrance to his grounds, a gray-clad soldier rode up to the
gate and asked for a drink of water. Mrs. Farquhar was only too happy
to serve him with her own white hands. While she was fetching the
water her husband approached the dusty horseman and inquired eagerly
for news from the front.

"The Yanks are repairing the railroads," said the man, "and are
getting ready for another advance. They have reached the Owl Creek
bridge, put it in order and built a stockade on the north bank. The
commandant has issued an order, which is posted everywhere, declaring
that any civilian caught interfering with the railroad, its bridges,
tunnels, or trains will be summarily hanged. I saw the order."

"How far is it to the Owl Creek bridge?" Farquhar asked.

"About thirty miles."

"Is there no force on this side of the creek?"

"Only a picket post half a mile out, on the railroad, and a single
sentinel at this end of the bridge."

"Suppose a man--a civilian and student of hanging--should elude the
picket post and perhaps get the better of the sentinel," said
Farquhar, smiling, "what could he accomplish?"

The soldier reflected. "I was there a month ago," he replied. "I
observed that the flood of last winter had lodged a great quantity of
driftwood against the wooden pier at this end of the bridge. It is
now dry and would burn like tinder."

The lady had now brought the water, which the soldier drank. He
thanked her ceremoniously, bowed to her husband and rode away. An
hour later, after nightfall, he repassed the plantation, going
northward in the direction from which he had come. He was a Federal
scout.

III

As Peyton Farquhar fell straight downward through the bridge he lost
consciousness and was as one already dead. From this state he was
awakened--ages later, it seemed to him--by the pain of a sharp
pressure upon his throat, followed by a sense of suffocation. Keen,
poignant agonies seemed to shoot from his neck downward through every
fiber of his body and limbs. These pains appeared to flash along well
defined lines of ramification and to beat with an inconceivably rapid
periodicity. They seemed like streams of pulsating fire heating him
to an intolerable temperature. As to his head, he was conscious of
nothing but a feeling of fullness--of congestion. These sensations
were unaccompanied by thought. The intellectual part of his nature
was already effaced; he had power only to feel, and feeling was
torment. He was conscious of motion. Encompassed in a luminous cloud,
of which he was now merely the fiery heart, without material
substance, he swung through unthinkable arcs of oscillation, like a
vast pendulum. Then all at once, with terrible suddenness, the light
about him shot upward with the noise of a loud splash; a frightful
roaring was in his ears, and all was cold and dark. The power of
thought was restored; he knew that the rope had broken and he had
fallen into the stream. There was no additional strangulation; the
noose about his neck was already suffocating him and kept the water
from his lungs. To die of hanging at the bottom of a river!--the idea
seemed to him ludicrous. He opened his eyes in the darkness and saw
above him a gleam of light, but how distant, how inaccessible! He was
still sinking, for the light became fainter and fainter until it was a
mere glimmer. Then it began to grow and brighten, and he knew that he
was rising toward the surface--knew it with reluctance, for he was now
very comfortable. "To be hanged and drowned," he thought, "that is
not so bad; but I do not wish to be shot. No; I will not be shot;
that is not fair."

He was not conscious of an effort, but a sharp pain in his wrist
apprised him that he was trying to free his hands. He gave the
struggle his attention, as an idler might observe the feat of a
juggler, without interest in the outcome. What splendid effort!--what
magnificent, what superhuman strength! Ah, that was a fine endeavor!
Bravo! The cord fell away; his arms parted and floated upward, the
hands dimly seen on each side in the growing light. He watched them
with a new interest as first one and then the other pounced upon the
noose at his neck. They tore it away and thrust it fiercely aside,
its undulations resembling those of a water snake. "Put it back, put
it back!" He thought he shouted these words to his hands, for the
undoing of the noose had been succeeded by the direst pang that he had
yet experienced. His neck ached horribly; his brain was on fire, his
heart, which had been fluttering faintly, gave a great leap, trying to
force itself out at his mouth. His whole body was racked and wrenched
with an insupportable anguish! But his disobedient hands gave no heed
to the command. They beat the water vigorously with quick, downward
strokes, forcing him to the surface. He felt his head emerge; his
eyes were blinded by the sunlight; his chest expanded convulsively,
and with a supreme and crowning agony his lungs engulfed a great
draught of air, which instantly he expelled in a shriek!

He was now in full possession of his physical senses. They were,
indeed, preternaturally keen and alert. Something in the awful
disturbance of his organic system had so exalted and refined them that
they made record of things never before perceived. He felt the
ripples upon his face and heard their separate sounds as they struck.
He looked at the forest on the bank of the stream, saw the individual
trees, the leaves and the veining of each leaf--he saw the very
insects upon them: the locusts, the brilliant bodied flies, the gray
spiders stretching their webs from twig to twig. He noted the
prismatic colors in all the dewdrops upon a million blades of grass.
The humming of the gnats that danced above the eddies of the stream,
the beating of the dragon flies' wings, the strokes of the water
spiders' legs, like oars which had lifted their boat--all these made
audible music. A fish slid along beneath his eyes and he heard the
rush of its body parting the water.

He had come to the surface facing down the stream; in a moment the
visible world seemed to wheel slowly round, himself the pivotal point,
and he saw the bridge, the fort, the soldiers upon the bridge, the
captain, the sergeant, the two privates, his executioners. They were
in silhouette against the blue sky. They shouted and gesticulated,
pointing at him. The captain had drawn his pistol, but did not fire;
the others were unarmed. Their movements were grotesque and horrible,
their forms gigantic.

Suddenly he heard a sharp report and something struck the water
smartly within a few inches of his head, spattering his face with
spray. He heard a second report, and saw one of the sentinels with
his rifle at his shoulder, a light cloud of blue smoke rising from the
muzzle. The man in the water saw the eye of the man on the bridge
gazing into his own through the sights of the rifle. He observed that
it was a gray eye and remembered having read that gray eyes were
keenest, and that all famous marksmen had them. Nevertheless, this one
had missed.

A counter-swirl had caught Farquhar and turned him half round; he was
again looking at the forest on the bank opposite the fort. The sound
of a clear, high voice in a monotonous singsong now rang out behind
him and came across the water with a distinctness that pierced and
subdued all other sounds, even the beating of the ripples in his ears.
Although no soldier, he had frequented camps enough to know the dread
significance of that deliberate, drawling, aspirated chant; the
lieutenant on shore was taking a part in the morning's work. How
coldly and pitilessly--with what an even, calm intonation, presaging,
and enforcing tranquility in the men--with what accurately measured
interval fell those cruel words:

"Company! . . . Attention! . . . Shoulder arms! . . . Ready!. . .
Aim! . . . Fire!"

Farquhar dived--dived as deeply as he could. The water roared in his
ears like the voice of Niagara, yet he heard the dull thunder of the
volley and, rising again toward the surface, met shining bits of
metal, singularly flattened, oscillating slowly downward. Some of
them touched him on the face and hands, then fell away, continuing
their descent. One lodged between his collar and neck; it was
uncomfortably warm and he snatched it out.

As he rose to the surface, gasping for breath, he saw that he had been
a long time under water; he was perceptibly farther downstream--nearer
to safety. The soldiers had almost finished reloading; the metal
ramrods flashed all at once in the sunshine as they were drawn from
the barrels, turned in the air, and thrust into their sockets. The
two sentinels fired again, independently and ineffectually.

The hunted man saw all this over his shoulder; he was now swimming
vigorously with the current. His brain was as energetic as his arms
and legs; he thought with the rapidity of lightning:

"The officer," he reasoned, "will not make that martinet's error a
second time. It is as easy to dodge a volley as a single shot. He
has probably already given the command to fire at will. God help me,
I cannot dodge them all!"

An appalling splash within two yards of him was followed by a loud,
rushing sound, DIMINUENDO, which seemed to travel back through the air
to the fort and died in an explosion which stirred the very river to
its deeps! A rising sheet of water curved over him, fell down upon
him, blinded him, strangled him! The cannon had taken an hand in the
game. As he shook his head free from the commotion of the smitten
water he heard the deflected shot humming through the air ahead, and
in an instant it was cracking and smashing the branches in the forest
beyond.

"They will not do that again," he thought; "the next time they will
use a charge of grape. I must keep my eye upon the gun; the smoke
will apprise me--the report arrives too late; it lags behind the
missile. That is a good gun."

Suddenly he felt himself whirled round and round--spinning like a top.
The water, the banks, the forests, the now distant bridge, fort and
men, all were commingled and blurred. Objects were represented by
their colors only; circular horizontal streaks of color--that was all
he saw. He had been caught in a vortex and was being whirled on with a
velocity of advance and gyration that made him giddy and sick. In few
moments he was flung upon the gravel at the foot of the left bank of
the stream--the southern bank--and behind a projecting point which
concealed him from his enemies. The sudden arrest of his motion, the
abrasion of one of his hands on the gravel, restored him, and he wept
with delight. He dug his fingers into the sand, threw it over himself
in handfuls and audibly blessed it. It looked like diamonds, rubies,
emeralds; he could think of nothing beautiful which it did not
resemble. The trees upon the bank were giant garden plants; he noted
a definite order in their arrangement, inhaled the fragrance of their
blooms. A strange roseate light shone through the spaces among their
trunks and the wind made in their branches the music of AEolian harps.
He had not wish to perfect his escape--he was content to remain in
that enchanting spot until retaken.

A whiz and a rattle of grapeshot among the branches high above his
head roused him from his dream. The baffled cannoneer had fired him a
random farewell. He sprang to his feet, rushed up the sloping bank,
and plunged into the forest.

All that day he traveled, laying his course by the rounding sun. The
forest seemed interminable; nowhere did he discover a break in it, not
even a woodman's road. He had not known that he lived in so wild a
region. There was something uncanny in the revelation.

By nightfall he was fatigued, footsore, famished. The thought of his
wife and children urged him on. At last he found a road which led him
in what he knew to be the right direction. It was as wide and
straight as a city street, yet it seemed untraveled. No fields
bordered it, no dwelling anywhere. Not so much as the barking of a
dog suggested human habitation. The black bodies of the trees formed
a straight wall on both sides, terminating on the horizon in a point,
like a diagram in a lesson in perspective. Overhead, as he looked up
through this rift in the wood, shone great golden stars looking
unfamiliar and grouped in strange constellations. He was sure they
were arranged in some order which had a secret and malign
significance. The wood on either side was full of singular noises,
among which--once, twice, and again--he distinctly heard whispers in
an unknown tongue.

His neck was in pain and lifting his hand to it found it horribly
swollen. He knew that it had a circle of black where the rope had
bruised it. His eyes felt congested; he could no longer close them.
His tongue was swollen with thirst; he relieved its fever by thrusting
it forward from between his teeth into the cold air. How softly the
turf had carpeted the untraveled avenue--he could no longer feel the
roadway beneath his feet!

Doubtless, despite his suffering, he had fallen asleep while walking,
for now he sees another scene--perhaps he has merely recovered from a
delirium. He stands at the gate of his own home. All is as he left
it, and all bright and beautiful in the morning sunshine. He must
have traveled the entire night. As he pushes open the gate and passes
up the wide white walk, he sees a flutter of female garments; his
wife, looking fresh and cool and sweet, steps down from the veranda to
meet him. At the bottom of the steps she stands waiting, with a smile
of ineffable joy, an attitude of matchless grace and dignity. Ah, how
beautiful she is! He springs forwards with extended arms. As he is
about to clasp her he feels a stunning blow upon the back of the neck;
a blinding white light blazes all about him with a sound like the
shock of a cannon--then all is darkness and silence!

Peyton Farquhar was dead; his body, with a broken neck, swung gently
from side to side beneath the timbers of the Owl Creek bridge.

136  Praktikum Python Code/TextFiles/peterrabbit.txt  Normal file
@ -0,0 +1,136 @@
The Tale of Peter Rabbit, by Beatrix Potter (1902).

Once upon a time there were four little Rabbits, and their names
were--

Flopsy,
Mopsy,
Cotton-tail,
and Peter.

They lived with their Mother in a sand-bank, underneath the root of a
very big fir-tree.

'Now my dears,' said old Mrs. Rabbit one morning, 'you may go into
the fields or down the lane, but don't go into Mr. McGregor's garden:
your Father had an accident there; he was put in a pie by Mrs.
McGregor.'

'Now run along, and don't get into mischief. I am going out.'

Then old Mrs. Rabbit took a basket and her umbrella, and went through
the wood to the baker's. She bought a loaf of brown bread and five
currant buns.

Flopsy, Mopsy, and Cottontail, who were good little bunnies, went
down the lane to gather blackberries:

But Peter, who was very naughty, ran straight away to Mr. McGregor's
garden, and squeezed under the gate!

First he ate some lettuces and some French beans; and then he ate
some radishes;

And then, feeling rather sick, he went to look for some parsley.

But round the end of a cucumber frame, whom should he meet but Mr.
McGregor!

Mr. McGregor was on his hands and knees planting out young cabbages,
but he jumped up and ran after Peter, waving a rake and calling out,
'Stop thief!'

Peter was most dreadfully frightened; he rushed all over the garden,
for he had forgotten the way back to the gate.

He lost one of his shoes among the cabbages, and the other shoe
amongst the potatoes.

After losing them, he ran on four legs and went faster, so that I
think he might have got away altogether if he had not unfortunately
run into a gooseberry net, and got caught by the large buttons on his
jacket. It was a blue jacket with brass buttons, quite new.

Peter gave himself up for lost, and shed big tears; but his sobs were
overheard by some friendly sparrows, who flew to him in great
excitement, and implored him to exert himself.

Mr. McGregor came up with a sieve, which he intended to pop upon the
top of Peter; but Peter wriggled out just in time, leaving his jacket
behind him.

And rushed into the tool-shed, and jumped into a can. It would have
been a beautiful thing to hide in, if it had not had so much water in it.

Mr. McGregor was quite sure that Peter was somewhere in the
tool-shed, perhaps hidden underneath a flower-pot. He began to turn
them over carefully, looking under each.

Presently Peter sneezed--'Kertyschoo!' Mr. McGregor was after him in
no time.

And tried to put his foot upon Peter, who jumped out of a window,
upsetting three plants. The window was too small for Mr. McGregor, and
he was tired of running after Peter. He went back to his work.

Peter sat down to rest; he was out of breath and trembling with
fright, and he had not the least idea which way to go. Also he was
very damp with sitting in that can.

After a time he began to wander about, going lippity--lippity--not
very fast, and looking all round.

He found a door in a wall; but it was locked, and there was no room
for a fat little rabbit to squeeze underneath.

An old mouse was running in and out over the stone doorstep, carrying
peas and beans to her family in the wood. Peter asked her the way to
the gate, but she had such a large pea in her mouth that she could not
answer. She only shook her head at him. Peter began to cry.

Then he tried to find his way straight across the garden, but he
became more and more puzzled. Presently, he came to a pond where Mr.
McGregor filled his water-cans. A white cat was staring at some
gold-fish, she sat very, very still, but now and then the tip of her
tail twitched as if it were alive. Peter thought it best to go away
without speaking to her; he had heard about cats from his cousin,
little Benjamin Bunny.

He went back towards the tool-shed, but suddenly, quite close to him,
he heard the noise of a hoe--scr-r-ritch, scratch, scratch, scritch.
Peter scuttered underneath the bushes. But presently, as nothing
happened, he came out, and climbed upon a wheelbarrow and peeped over.
The first thing he saw was Mr. McGregor hoeing onions. His back was
turned towards Peter, and beyond him was the gate!

Peter got down very quietly off the wheelbarrow; and started running
as fast as he could go, along a straight walk behind some
black-currant bushes.

Mr. McGregor caught sight of him at the corner, but Peter did not
care. He slipped underneath the gate, and was safe at last in the wood
outside the garden.

Mr. McGregor hung up the little jacket and the shoes for a scare-crow
to frighten the blackbirds.

Peter never stopped running or looked behind him till he got home to
the big fir-tree.

He was so tired that he flopped down upon the nice soft sand on the
floor of the rabbit-hole and shut his eyes. His mother was busy
cooking; she wondered what he had done with his clothes. It was the
second little jacket and pair of shoes that Peter had lost in a
fortnight!

I am sorry to say that Peter was not very well during the evening.

His mother put him to bed, and made some camomile tea; and she gave a
dose of it to Peter!

'One table-spoonful to be taken at bed-time.'

But Flopsy, Mopsy, and Cotton-tail had bread and milk and
blackberries for supper.

THE END
137  Praktikum Python Code/TextFiles/reaganomics.txt  Normal file
@ -0,0 +1,137 @@
REAGANOMICS
https://en.wikipedia.org/wiki/Reaganomics
Reaganomics (a portmanteau of [Ronald] Reagan and economics attributed to Paul Harvey)[1] refers to the economic policies promoted by U.S. President Ronald Reagan during the 1980s. These policies are commonly associated with supply-side economics, referred to as trickle-down economics or voodoo economics by political opponents, and free-market economics by political advocates.
The four pillars of Reagan's economic policy were to reduce the growth of government spending, reduce the federal income tax and capital gains tax, reduce government regulation, and tighten the money supply in order to reduce inflation.[2]
The results of Reaganomics are still debated. Supporters point to the end of stagflation, stronger GDP growth, and an entrepreneur revolution in the decades that followed.[3][4] Critics point to the widening income gap, an atmosphere of greed, and the national debt tripling in eight years which ultimately reversed the post-World War II trend of a shrinking national debt as percentage of GDP.[5][6]
HISTORICAL CONTEXT
Prior to the Reagan administration, the United States economy experienced a decade of high unemployment and persistently high inflation (known as stagflation). Attacks on Keynesian economic orthodoxy as well as empirical economic models such as the Phillips Curve grew. Political pressure favored stimulus resulting in an expansion of the money supply. President Richard Nixon's wage and price controls were phased out.[7] The federal oil reserves were created to ease any future short term shocks. President Jimmy Carter had begun phasing out price controls on petroleum while he created the Department of Energy. Much of the credit for the resolution of the stagflation is given to two causes: a three-year contraction of the money supply by the Federal Reserve Board under Paul Volcker, initiated in the last year of Carter's presidency, and long-term easing of supply and pricing in oil during the 1980s oil glut.[citation needed]
In stating that his intention was to lower taxes, Reagan's approach was a departure from his immediate predecessors. Reagan enacted lower marginal tax rates as well as simplified income tax codes and continued deregulation. During Reagan's eight year presidency, the annual deficits averaged 4.0% of GDP, compared to a 2.2% average during the preceding eight years.[8] The real (inflation adjusted) average rate of growth in federal spending fell from 4% under Jimmy Carter to 2.5% under Ronald Reagan.[9][10] GDP per employed person increased at an average 1.5% rate during the Reagan administration, compared to an average 0.6% during the preceding eight years.[11] Private sector productivity growth, measured as real output per hour of all persons, increased at an average rate of 1.9% during Reagan's eight years, compared to an average 1.3% during the preceding eight years.[12] Federal net outlays as a percent of GDP averaged 21.4% under Reagan, compared to 19.1% during the preceding eight years.[13]
During the Nixon and Ford Administrations, before Reagan's election, a combined supply and demand side policy was considered unconventional by the moderate wing of the Republican Party. While running against Reagan for the Presidential nomination in 1980, George H. W. Bush had derided Reaganomics as "voodoo economics".[14] Similarly, in 1976, Gerald Ford had severely criticized Reagan's proposal to turn back a large part of the Federal budget to the states.
JUSTIFICATIONS
In his 1980 campaign speeches, Reagan presented his economic proposals as a return to the free enterprise principles, free market economy that had been in favor before the Great Depression and FDR's New Deal policies. At the same time he attracted a following from the supply-side economics movement, which formed in opposition to Keynesian demand-stimulus economics. This movement produced some of the strongest supporters for Reagan's policies during his term in office.
The contention of the proponents, that the tax rate cuts would more than cover any increases in federal debt, was influenced by a theoretical taxation model based on the elasticity of tax rates, known as the Laffer curve. Arthur Laffer's model predicts that excessive tax rates actually reduce potential tax revenues, by lowering the incentive to produce; the model also predicts that insufficient tax rates (rates below the optimum level for a given economy) lead directly to a reduction in tax revenues.
POLICIES
Reagan lifted remaining domestic petroleum price and allocation controls on January 28, 1981,[15] and lowered the oil windfall profits tax in August 1981. He ended the oil windfall profits tax in 1988.[16] During the first year of Reagan's presidency, federal income tax rates were lowered significantly with the signing of the Economic Recovery Tax Act of 1981,[17] which lowered the top marginal tax bracket from 70% to 50% and the lowest bracket from 14% to 11%. This act slashed estate taxes and trimmed taxes paid by business corporations by $150 billion over a five-year period. In 1982 Reagan agreed to a rollback of corporate tax cuts and a smaller rollback of individual income tax cuts. The 1982 tax increase undid a third of the initial tax cut. In 1983 Reagan instituted a payroll tax increase on Social Security and Medicare hospital insurance.[18] In 1984 another bill was introduced that closed tax loopholes. According to tax historian Joseph Thorndike, the bills of 1982 and 1984 "constituted the biggest tax increase ever enacted during peacetime".[19]
|
||||||
|
|
||||||
|
With the Tax Reform Act of 1986, Reagan and Congress sought to simplify the tax system by eliminating many deductions, reducing the highest marginal rates, and reducing the number of tax brackets.[20][21][22][23] In 1983, Democrats Bill Bradley and Dick Gephardt had offered a proposal; in 1984 Reagan had the Treasury Department produce its own plan. The 1986 act aimed to be revenue-neutral: while it reduced the top marginal rate, it also cleaned up the tax base by removing certain tax write-offs, preferences, and exceptions, thus raising the effective tax on activities previously specially favored by the code. Ultimately, the combination of the decrease in deductions and decrease in rates raised revenue equal to about 4% of existing tax revenue.[24]
|
||||||
|
|
||||||
|
Federal revenue share of GDP fell from 19.6% in fiscal 1981 to 17.3% in 1984, before rising back to 18.4% by fiscal year 1989. Personal income tax revenues fell during this period relative to GDP, while payroll tax revenues rose relative to GDP.[25] Reagan's 1981 cut in the top regular tax rate on unearned income reduced the maximum capital gains rate to only 20% – its lowest level since the Hoover administration.[26] The 1986 act set tax rates on capital gains at the same level as the rates on ordinary income like salaries and wages, with both topping out at 28%.[27]
|
||||||
|
|
||||||
|
Reagan significantly increased public expenditures, primarily the Department of Defense, which rose (in constant 2000 dollars) from $267.1 billion in 1980 (4.9% of GDP and 22.7% of public expenditure) to $393.1 billion in 1988 (5.8% of GDP and 27.3% of public expenditure); in most of those years military spending was about 6% of GDP, exceeding that level in four different years. Such levels had not been seen since the end of U.S. involvement in the Vietnam War in 1973.[28] In 1981, Reagan significantly reduced the maximum tax rate, which affected the highest income earners, and lowered the top marginal tax rate from 70% to 50%; in 1986 he further reduced the rate to 28%.[29] The federal deficit under Reagan peaked at 6% of GDP in 1983, falling to 3.2% of GDP in 1987[30] and to 3.1% of GDP in his final budget.[31] The inflation-adjusted rate of growth in federal spending fell from 4% under Jimmy Carter to 2.5% under Ronald Reagan, the slowest rate of growth in inflation-adjusted spending since Eisenhower. However, the federal deficit as a percent of GDP was up throughout the Reagan presidency from 2.7% at the end of (and throughout) the Carter administration.[9][31][32] As a short-run strategy to reduce inflation and lower nominal interest rates, the U.S. borrowed both domestically and abroad to cover the Federal budget deficits, raising the national debt from $997 billion to $2.85 trillion.[33] This led to the U.S. moving from the world's largest international creditor to the world's largest debtor nation.[5] Reagan described the new debt as the "greatest disappointment" of his presidency.[34]
|
||||||
|
|
||||||
|
According to William A. Niskanen, one of the architects of Reaganomics, "Reagan delivered on each of his four major policy objectives, although not to the extent that he and his supporters had hoped", and notes that the most substantial change was in the tax code, where the top marginal individual income tax rate fell from 70.1% to 28.4%, and there was a "major reversal in the tax treatment of business income", with effect of "reducing the tax bias among types of investment but increasing the average effective tax rate on new investment". Roger Porter, another architect of the program, acknowledges that the program was weakened by the many hands that changed the President's calculus, such as Congress.[2][35] President Reagan raised taxes eleven times over the course of his presidency, but the overall tax burden went down during his presidency.[36][37] According to Paul Krugman, "Over all, the 1982 tax increase undid about a third of the 1981 cut; as a share of GDP, the increase was substantially larger than Mr. Clinton's 1993 tax increase."[18] According to historian and domestic policy adviser Bruce Bartlett, Reagan's tax increases over the course of his presidency took back half of the 1981 tax cut. Though since the Reagan tax reductions, top marginal tax rates have remained lower than at any point in US history since 1931, when the top marginal rate was raised from 25% to 63%.[38]
|
||||||
|
|
||||||
|
RESULTS
|
||||||
|
|
||||||
|
Overview
|
||||||
|
|
||||||
|
Spending during the years Reagan budgeted (FY 1982–89) averaged 21.6% GDP, roughly tied with President Obama for the highest among any recent President. Each faced a severe recession early in their administration. In addition, the public debt rose from 26% GDP in 1980 to 41% GDP by 1988. In dollar terms, the public debt rose from $712 billion in 1980 to $2.052 trillion in 1988, a roughly three-fold increase.[25]:143 The unemployment rate rose from 7% in 1980 to 11% in 1982, then declined to 5% in 1988. The inflation rate declined from 10% in 1980 to 4% in 1988.[2]
|
||||||
|
|
||||||
|
Some economists have stated that Reagan's policies were an important part of bringing about the third longest peacetime economic expansion in U.S. history.[39][40] During the Reagan administration, real GDP growth averaged 3.5%, compared to 2.9% during the preceding eight years.[41] The annual average unemployment rate declined by 1.7 percentage points, from 7.2% in 1980 to 5.5% in 1988, after it had increased by 1.6 percentage points over the preceding eight years.[42][43] Nonfarm employment increased by 16.1 million during Reagan's presidency, compared to 15.4 million during the preceding eight years,[44] while manufacturing employment declined by 582,000 after rising 363,000 during the preceding eight years.[45] Reagan's administration is the only one not to have raised the minimum wage.[46] The inflation rate, 13.5% in 1980, fell to 4.1% in 1988, due to the Federal Reserve increasing interest rates (prime rate peaking at 20.5% in August 1981[47]).[48] The latter contributed to a recession from July 1981 to November 1982 during which unemployment rose to 9.7% and GDP fell by 1.9%. Additionally, income growth slowed for middle- and lower-class (2.4% to 1.8%) and rose for the upper-class (2.2% to 4.83%).[49]
|
||||||
|
|
||||||
|
The misery index, defined as the inflation rate added to the unemployment rate, shrank from 19.33 when he began his administration to 9.72 when he left, the greatest improvement record for a President since Harry S. Truman left office.[50] In terms of American households, the percentage of total households making less than $10,000 a year (in real 2007 dollars) shrank from 8.8% in 1980 to 8.3% in 1988 while the percentage of households making over $75,000 went from 20.2% to 25.7% during that period, both signs of progress.[51]
|
||||||
|
|
||||||
|
Employment and wages
|
||||||
|
|
||||||
|
The job growth (measured for non-farm payrolls) under the Reagan administration averaged 168,000 per month, versus 216,000 for Carter, 55,000 for H.W. Bush, and 239,000 for Clinton. Measuring the number of jobs created per month is limited for longer time periods as the population grows. To address this, we can measure annual job growth percentages, comparing the beginning and ending number of jobs during their time in office to determine an annual growth rate. Jobs grew by 2.0% annually under Reagan, versus 3.1% under Carter, 0.6% under H.W. Bush, and 2.4% under Clinton.[52]
|
||||||
|
|
||||||
|
The unemployment rate averaged 7.5% under Reagan, compared to an average 6.6% during the preceding eight years. Declining steadily after December 1982, the rate was 5.4% the month Reagan left office.[53]
|
||||||
|
|
||||||
|
The average real hourly wage for production and nonsupervisory workers continued the decline that had begun in 1973, albeit at a slower rate, and remained below the pre-Reagan level in every Reagan year.[54]
|
||||||
|
|
||||||
|
The labor force participation rate increased by 2.6 percentage points during Reagan's eight years, compared to 3.9 percentage points during the preceding eight years.[55]
|
||||||
|
|
||||||
|
Growth rates
|
||||||
|
|
||||||
|
Following the 1981 recession, the unemployment rate had averaged slightly higher (6.75% vs. 6.35%), productivity growth lower (1.38% vs. 1.92%), and private investment as a percentage of GDP slightly less (16.08% vs. 16.86%).[citation needed] In the 1980's, industrial productivity growth in the United States matched that of its trading partners after trailing them in the 1970's. By 1990, manufacturing's share of GNP exceeded the post-World War II low hit in 1982 and matched "the level of output achieved in the 1960's when American factories hummed at a feverish clip".[56]
|
||||||
|
|
||||||
|
GDP growth
|
||||||
|
|
||||||
|
Real GDP grew over one-third during Reagan’s presidency, an over $2 trillion increase. The compound annual growth rate of GDP was 3.6% during Reagan's eight years, compared to 2.7% during the preceding eight years.[57] Real GDP per capita grew 2.6% under Reagan, compared to 1.9% average growth during the preceding eight years.[58]
|
||||||
|
|
||||||
|
Income and wealth
|
||||||
|
In nominal terms, median household income grew at a compound annual growth rate (CAGR) of 5.5% during the Reagan presidency, compared to 8.5% during the preceding five years (pre-1975 data are unavailable).[59] Real median family income grew by $4,492 during the Reagan period, compared to a $1,270 increase during the preceding eight years.[60] After declining from 1974 through 1980, real mean personal income rose $4,708 by 1988.[61] Nominal household net worth increased by a CAGR of 8.4%, compared to 9.3% during the preceding eight years.[62]
|
||||||
|
|
||||||
|
Poverty level
|
||||||
|
|
||||||
|
The percentage of the total population below the poverty level increased from 13.0% in 1980 to 15.2% in 1983, then declined back to 13.0% in 1988.[64] During Reagan's first term, critics noted homelessness as a visible problem in U.S. urban centers.[65] In the closing weeks of his presidency, Reagan told David Brinkley that the homeless "make it their own choice for staying out there," noting his belief that there "are shelters in virtually every city, and shelters here, and those people still prefer out there on the grates or the lawn to going into one of those shelters". He also stated that "a large proportion" of them are "mentally impaired," which he believed was a result of lawsuits by the ACLU (and similar organizations) against institutions.[66] His policies became widely known as "trickle-down economics", due to the significant cuts in the upper tax brackets, on the theory that the extra money for the wealthy would trickle down to low-income groups.[67]
|
||||||
|
|
||||||
|
Federal income tax and payroll tax levels
|
||||||
|
|
||||||
|
During the Reagan administration, fiscal year federal receipts grew from $599 billion to $991 billion (an increase of 65%) while fiscal year federal outlays grew from $678 billion to $1144 billion (an increase of 69%).[68][69] According to a 1996 report of the Joint Economic Committee of the United States Congress, during Reagan's two terms, and through 1993, the top 10% of taxpayers paid an increased share of income taxes (not including payroll taxes) to the Federal government, while the lowest 50% of taxpayers paid a reduced share of income tax revenue.[70] Personal income tax revenues declined from 9.4% GDP in 1981 to 8.3% GDP in 1989, while payroll tax revenues increased from 6.0% GDP to 6.7% GDP during the same period.[25]
|
||||||
|
|
||||||
|
Tax receipts
|
||||||
|
|
||||||
|
According to a 2003 Treasury study, the tax cuts in the Economic Recovery Tax Act of 1981 resulted in a significant decline in revenue relative to a baseline without the cuts, approximately $111 billion (in 1992 dollars) on average during the first four years after implementation or nearly 3% GDP annually.[71][72] Other tax bills had neutral or, in the case of the Tax Equity and Fiscal Responsibility Act of 1982, a (~+1% of GDP) increase in revenue as a share of GDP. It should be noted, however, that the study did not examine the longer-term impact of Reagan tax policy, including sunset clauses and "the long-run, fully-phased-in effect of the tax bills".[72] The fact that tax receipts as a percentage of GDP fell following the Economic Recovery Tax Act of 1981 shows a decrease in tax burden as share of GDP and a commensurate increase in the deficit, as spending did not fall relative to GDP. Total federal tax receipts increased in every Reagan year except 1982, at an annual average rate of 6.2% compared to 10.8% during the preceding eight years.[73]
|
||||||
|
|
||||||
|
The effect of Reagan's 1981 tax cuts (reduced revenue relative to a baseline without the cuts) was at least partially offset by phased-in Social Security payroll tax increases that had been enacted by President Jimmy Carter and the 95th Congress in 1977, and by further increases under Reagan in 1983[74] and the following years, also intended to counter the use of tax shelters.[75] An accounting indicated that nominal tax receipts increased from $599 billion in 1981 to $1.032 trillion in 1990, an increase of 72% in current dollars. In 2005 dollars, the tax receipts in 1990 were $1.5 trillion, an increase of 20% above inflation.[76]
|
||||||
|
|
||||||
|
Debt and government expenditures
|
||||||
|
Reagan was inaugurated in January 1981, so the first fiscal year (FY) he budgeted was 1982 and the final year was 1989.
|
||||||
|
|
||||||
|
During Reagan's presidency, the federal debt held by the public nearly tripled in nominal terms, from $738 billion to $2.1 trillion.[77] This led to the U.S. moving from the world's largest international creditor to the world's largest debtor nation.[5] Reagan described the new debt as the "greatest disappointment" of his presidency.[34]
|
||||||
|
The federal deficit as percentage of GDP rose from 2.5% of GDP in fiscal year 1981 to a peak of 5.7% of GDP in 1983, then fell to 2.7% GDP in 1989.[78]
|
||||||
|
Total federal outlays averaged 21.8% of GDP from 1981–88, versus the 1974–1980 average of 20.1% of GDP. This was the highest of any President from Carter through Obama.[79]
|
||||||
|
Total federal revenues averaged 17.7% of GDP from 1981–88, versus the 1974–80 average of 17.6% of GDP.[80]
|
||||||
|
Federal individual income tax revenues fell from 8.7% of GDP in 1980 to a trough of 7.5% of GDP in 1984, then rose to 7.8% of GDP in 1988.[81]
|
||||||
|
Business and market performance
|
||||||
|
Nominal after-tax corporate profits grew at a compound annual growth rate of 3.0% during Reagan's eight years, compared to 13.0% during the preceding eight years.[82] The S&P 500 Index increased 113.3% during the 2,024 trading days under Reagan, compared to 10.4% during the preceding 2,024 trading days.[83] The business sector share of GDP, measured as gross private domestic investment, declined by 0.7 percentage points under Reagan, after increasing 0.7 percentage points during the preceding eight years.[84]
|
||||||
|
|
||||||
|
Size of federal government
|
||||||
|
The federal government's share of GDP increased 0.2 percentage points under Reagan, while it decreased 1.5 percentage points during the preceding eight years.[85] The number of federal civilian employees increased 4.2% during Reagan's eight years, compared to 6.5% during the preceding eight years.[86]
|
||||||
|
|
||||||
|
As a candidate, Reagan asserted he would shrink government by abolishing the Cabinet-level departments of energy and education. He abolished neither, but elevated veterans affairs from independent agency status to Cabinet-level department status.[87][88]
|
||||||
|
|
||||||
|
Income distribution
|
||||||
|
Further information: Income inequality in the United States
|
||||||
|
Continuing a trend that began in the 1970s, income inequality grew and accelerated in the 1980s. The Economist wrote in 2006: "After the 1973 oil shocks, productivity growth suddenly slowed. A few years later, at the start of the 1980s, the gap between rich and poor began to widen."[89] According to the CBO:
|
||||||
|
|
||||||
|
The top 1% of income earners' share of income before transfers and taxes rose from 9.0% in 1979 to a peak of 13.8% in 1986, before falling to 12.3% in 1989.
|
||||||
|
The top 1% of income earners' share of income after transfers and taxes rose from 7.4% in 1979 to a peak of 12.8% in 1986, before falling to 11.0% in 1989.
|
||||||
|
The bottom 90% had a lower share of the income in 1989 vs. 1979.[90]
|
||||||
|
|
||||||
|
ANALYSIS
|
||||||
|
|
||||||
|
According to a 1996 study[93] by the Cato Institute, a libertarian think tank, on 8 of the 10 key economic variables examined, the American economy performed better during the Reagan years than during the pre- and post-Reagan years. The study asserted that real median family income grew by $4,000 during the eight Reagan years and experienced a loss of almost $1,500 in the post-Reagan years. Interest rates, inflation, and unemployment fell faster under Reagan than they did immediately before or after his presidency. The only economic variable that was lower during this period than in both the pre- and post-Reagan years was the savings rate, which fell rapidly in the 1980s. The productivity rate was higher in the pre-Reagan years but lower in the post-Reagan years.[93] The Cato study was dismissive of any positive effects of tightening, and subsequent loosening, of Federal Reserve monetary policy under "inflation hawk" Paul Volcker, whom President Carter had appointed in 1979 to halt the persistent inflation of the 1970s.
|
||||||
|
|
||||||
|
Economic analyst Stephen Moore stated in the Cato analysis, "No act in the last quarter century had a more profound impact on the U.S. economy of the eighties and nineties than the Reagan tax cut of 1981." He argued that Reagan's tax cuts, combined with an emphasis on federal monetary policy, deregulation, and expansion of free trade created a sustained economic expansion, the greatest American sustained wave of prosperity ever. He also claims that the American economy grew by more than a third in size, producing a $15 trillion increase in American wealth. Consumer and investor confidence soared. Cutting federal income taxes, cutting the U.S. government spending budget, cutting useless programs, scaling down the government work force, maintaining low interest rates, and keeping a watchful inflation hedge on the monetary supply was Ronald Reagan's formula for a successful economic turnaround.[93]
|
||||||
|
|
||||||
|
Milton Friedman stated, "Reaganomics had four simple principles: Lower marginal tax rates, less regulation, restrained government spending, noninflationary monetary policy. Though Reagan did not achieve all of his goals, he made good progress."[94]
|
||||||
|
|
||||||
|
The Tax Reform Act of 1986 and its impact on the alternative minimum tax (AMT) reduced nominal rates on the wealthy and eliminated tax deductions, while raising tax rates on lower-income individuals.[94][95][96][97] The across the board tax system reduced marginal rates and further reduced bracket creep from inflation. The highest income earners (with incomes exceeding $1,000,000) received a tax break, restoring a flatter tax system.[98] In 2006, the IRS's National Taxpayer Advocate's report characterized the effective rise in the AMT for individuals as a problem with the tax code.[99] Through 2007, the revised AMT had brought in more tax revenue than the former tax code, which has made it difficult for Congress to reform.[98][100]
|
||||||
|
|
||||||
|
Economist Paul Krugman argued that the economic expansion during the Reagan administration was primarily the result of the business cycle and the monetary policy of Paul Volcker.[101] Krugman argues that there was nothing unusual about the economy under Reagan because unemployment was falling from a high peak and that it is consistent with Keynesian economics for the economy to grow as employment increases if inflation remains low.[102]
|
||||||
|
|
||||||
|
The CBO Historical Tables indicate that federal spending during Reagan's two terms (FY 1981–88) averaged 22.4% GDP, well above the 20.6% GDP average from 1971 to 2009. In addition, the public debt rose from 26.1% GDP in 1980 to 41.0% GDP by 1988. In dollar terms, the public debt rose from $712 billion in 1980 to $2,052 billion in 1988, a three-fold increase.[25] Krugman argued in June 2012 that Reagan's policies were consistent with Keynesian stimulus theories, pointing to the significant increase in per-capita spending under Reagan.[103]
|
||||||
|
|
||||||
|
William Niskanen noted several drawbacks of the Reagan years. First, privately held federal debt increased from 22% to 38% of GDP, despite a long peacetime expansion. Second, the savings and loan problem led to an additional debt of about $125 billion. Third, greater enforcement of U.S. trade laws increased the share of U.S. imports subjected to trade restrictions from 12% in 1980 to 23% in 1988.[2]
|
||||||
|
|
||||||
|
Economists Raghuram Rajan and Luigi Zingales pointed out that many deregulation efforts had either taken place or had begun before Reagan (note the deregulation of airlines and trucking under Carter, and the beginning of deregulatory reform in railroads, telephones, natural gas, and banking). They stated, "The move toward markets preceded the leader [Reagan] who is seen as one of their saviors."[104] Economists Paul Joskow and Roger Noll made a similar contention.[105]
|
||||||
|
|
||||||
|
Economist William A. Niskanen, a member of Reagan's Council of Economic Advisers wrote that deregulation had the "lowest priority" of the items on the Reagan agenda[2] given that Reagan "failed to sustain the momentum for deregulation initiated in the 1970s" and that he "added more trade barriers than any administration since Hoover." By contrast, economist Milton Friedman has pointed to the number of pages added to the Federal Register each year as evidence of Reagan's anti-regulation presidency (the Register records the rules and regulations that federal agencies issue per year). The number of pages added to the Register each year declined sharply at the start of the Ronald Reagan presidency breaking a steady and sharp increase since 1960. The increase in the number of pages added per year resumed an upward, though less steep, trend after Reagan left office. In contrast, the number of pages being added each year increased under Ford, Carter, George H. W. Bush, Clinton, George W. Bush, and Obama.[106] The number of pages in Federal Register is however criticized as an extremely crude measure of regulatory activity, because it can be easily manipulated (e.g. font sizes have been changed to keep page count low).[107] The apparent contradiction between Niskanen's statements and Friedman's data may be resolved by seeing Niskanen as referring to statutory deregulation (laws passed by Congress) and Friedman to administrative deregulation (rules and regulations implemented by federal agencies). A 2016 study by the Congressional Research Service found that Reagan's average annual number of final federal regulatory rules published in the Federal Register was higher than during the Clinton, George W. Bush or Obama's administrations, even though the Reagan economy was considerably smaller than during those later presidents.[108] Another study by the QuantGov project of the libertarian Mercatus Center found that the Reagan administration added restrictive regulations — containing such terms as "shall," "prohibited" or "may not" — at a faster average annual rate than did Clinton, Bush or Obama.[109]
|
||||||
|
|
||||||
|
Greg Mankiw, a conservative Republican economist who served as chairman of the Council of Economic Advisors under President George W. Bush, wrote in 2007:
|
||||||
|
|
||||||
|
I used the phrase "charlatans and cranks" in the first edition of my principles textbook to describe some of the economic advisers to Ronald Reagan, who told him that broad-based income tax cuts would have such large supply-side effects that the tax cuts would raise tax revenue. I did not find such a claim credible, based on the available evidence. I never have, and I still don't...My other work has remained consistent with this view. In a paper on dynamic scoring, written while I was working at the White House, Matthew Weinzierl and I estimated that a broad-based income tax cut (applying to both capital and labor income) would recoup only about a quarter of the lost revenue through supply-side growth effects. For a cut in capital income taxes, the feedback is larger — about 50 percent — but still well under 100 percent. A chapter on dynamic scoring in the 2004 Economic Report of the President says about the the [sic] same thing.[110]
|
||||||
|
|
||||||
|
Glenn Hubbard, who preceded Mankiw as Bush's CEA chair, also disputed the assertion that tax cuts increase tax revenues, writing in his 2003 Economic Report of the President: "Although the economy grows in response to tax reductions (because of higher consumption in the short run and improved incentives in the long run), it is unlikely to grow so much that lost tax revenue is completely recovered by the higher level of economic activity."[111]
|
||||||
|
|
||||||
|
In 1986, Martin Feldstein — a self-described "traditional supply sider" who served as Reagan's chairman of the Council of Economic Advisors from 1982 to 1984 — characterized the "new supply siders" who emerged circa 1980:
|
||||||
|
|
||||||
|
What distinguished the new supply siders from the traditional supply siders as the 1980s began was not the policies they advocated but the claims that they made for those policies...The "new" supply siders were much more extravagant in their claims. They projected rapid growth, dramatic increases in tax revenue, a sharp rise in saving, and a relatively painless reduction in inflation. The height of supply side hyperbole was the "Laffer curve" proposition that the tax cut would actually increase tax revenue because it would unleash an enormously depressed supply of effort. Another remarkable proposition was the claim that even if the tax cuts did lead to an increased budget deficit, that would not reduce the funds available for investment in plant and equipment because tax changes would raise the saving rate by enough to finance the increased deficit...Nevertheless, I have no doubt that the loose talk of the supply side extremists gave fundamentally good policies a bad name and led to quantitative mistakes that not only contributed to subsequent budget deficits but that also made it more difficult to modify policy when those deficits became apparent.[112]
|
||||||
|
|
||||||
|
FOOTNOTES
|
||||||
|
|
||||||
|
https://en.wikipedia.org/wiki/Reaganomics#Footnotes
|
||||||
1000
Praktikum Python Code/TextFiles/reuters.csv
Normal file
File diff suppressed because it is too large
75
Praktikum Python Code/TextFiles/sms_readme.txt
Normal file
@ -0,0 +1,75 @@
|
|||||||
|
SMS Spam Collection v.1
|
||||||
|
-----------------------
|
||||||
|
|
||||||
|
1. DESCRIPTION
|
||||||
|
--------------
|
||||||
|
|
||||||
|
The SMS Spam Collection v.1 (hereafter the corpus) is a set of SMS-tagged messages that have been collected for SMS spam research. It contains one set of 5,574 SMS messages in English, tagged according to whether they are ham (legitimate) or spam.
|
||||||
|
|
||||||
|
1.1. Compilation
|
||||||
|
----------------
|
||||||
|
|
||||||
|
This corpus has been collected from free or free-for-research sources on the Web:
|
||||||
|
|
||||||
|
- A collection of 425 SMS spam messages extracted manually from the Grumbletext Web site. This is a UK forum in which cell phone users make public claims about SMS spam messages, most of them without reporting the very spam message received. Identifying the text of spam messages in the claims is a very hard and time-consuming task, and it involved carefully scanning hundreds of web pages. The Grumbletext Web site is: http://www.grumbletext.co.uk/
|
||||||
|
- A list of 450 SMS ham messages collected from Caroline Tagg's PhD thesis, available at http://etheses.bham.ac.uk/253/1/Tagg09PhD.pdf
|
||||||
|
- A subset of 3,375 SMS ham messages of the NUS SMS Corpus (NSC), which is a corpus of about 10,000 legitimate messages collected for research at the Department of Computer Science at the National University of Singapore. The messages largely originate from Singaporeans and mostly from students attending the University. These messages were collected from volunteers who were made aware that their contributions were going to be made publicly available. The NUS SMS Corpus is available at: http://www.comp.nus.edu.sg/~rpnlpir/downloads/corpora/smsCorpus/
|
||||||
|
- A total of 1,002 SMS ham messages and 322 spam messages extracted from the SMS Spam Corpus v.0.1 Big, created by José María Gómez Hidalgo and publicly available at: http://www.esp.uem.es/jmgomez/smsspamcorpus/
|
||||||
|
|
||||||
|
|
||||||
|
1.2. Statistics
|
||||||
|
---------------
|
||||||
|
|
||||||
|
There is one collection:
|
||||||
|
|
||||||
|
- The SMS Spam Collection v.1 (text file: smsspamcollection) has a total of 4,827 SMS legitimate messages (86.6%) and a total of 747 (13.4%) spam messages.
|
||||||
|
|
||||||
|
|
||||||
|
1.3. Format
|
||||||
|
-----------
|
||||||
|
|
||||||
|
The files contain one message per line. Each line is composed of two columns: one with the label (ham or spam) and the other with the raw text. Here are some examples:
|
||||||
|
|
||||||
|
ham What you doing?how are you?
|
||||||
|
ham Ok lar... Joking wif u oni...
|
||||||
|
ham dun say so early hor... U c already then say...
|
||||||
|
ham MY NO. IN LUTON 0125698789 RING ME IF UR AROUND! H*
|
||||||
|
ham Siva is in hostel aha:-.
|
||||||
|
ham Cos i was out shopping wif darren jus now n i called him 2 ask wat present he wan lor. Then he started guessing who i was wif n he finally guessed darren lor.
|
||||||
|
spam FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! ubscribe6GBP/ mnth inc 3hrs 16 stop?txtStop
|
||||||
|
spam Sunshine Quiz! Win a super Sony DVD recorder if you canname the capital of Australia? Text MQUIZ to 82277. B
|
||||||
|
spam URGENT! Your Mobile No 07808726822 was awarded a L2,000 Bonus Caller Prize on 02/09/03! This is our 2nd attempt to contact YOU! Call 0871-872-9758 BOX95QU
|
||||||
|
|
||||||
|
Note: messages are not chronologically sorted.
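As a quick sanity check of the one-message-per-line layout described above, here is a minimal Python sketch that counts labels directly from the raw file. The filename matches the smsspamcollection.tsv added in this commit, but the exact path and the assumption that label and text are separated by a single tab are mine, not part of the official corpus description; adjust both for your local copy.

# Parse label<TAB>text lines and tally the labels (path and tab separation assumed).
from collections import Counter

label_counts = Counter()
with open('smsspamcollection.tsv', encoding='utf-8') as f:
    for line in f:
        label, _, text = line.rstrip('\n').partition('\t')
        if label in ('ham', 'spam'):      # skips a header row or blank lines, if any
            label_counts[label] += 1

print(label_counts)                        # roughly 4,827 ham and 747 spam expected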
|
||||||
|
|
||||||
|
|
||||||
|
2. USAGE
|
||||||
|
--------
|
||||||
|
|
||||||
|
We offer a comprehensive study of this corpus in the following paper that is under review. This work presents a number of statistics, studies and baseline results for several machine learning methods.
|
||||||
|
|
||||||
|
[1] Almeida, T.A., Gómez Hidalgo, J.M., Yamakami, A. Contributions to the study of SMS Spam Filtering: New Collection and Results. Proceedings of the 2011 ACM Symposium on Document Engineering (ACM DOCENG'11), Mountain View, CA, USA, 2011. (Under review)
|
||||||
|
|
||||||
|
|
||||||
|
3. ABOUT
|
||||||
|
--------
|
||||||
|
|
||||||
|
The corpus has been collected by Tiago Agostinho de Almeida (http://www.dt.fee.unicamp.br/~tiago) and José María Gómez Hidalgo (http://www.esp.uem.es/jmgomez).
|
||||||
|
|
||||||
|
We would like to thank Dr. Min-Yen Kan (http://www.comp.nus.edu.sg/~kanmy/) and his team for making the NUS SMS Corpus available. See: http://www.comp.nus.edu.sg/~rpnlpir/downloads/corpora/smsCorpus/. He is currently collecting a bigger SMS corpus at: http://wing.comp.nus.edu.sg:8080/SMSCorpus/
|
||||||
|
|
||||||
|
4. LICENSE/DISCLAIMER
|
||||||
|
---------------------
|
||||||
|
|
||||||
|
We would appreciate it if:
|
||||||
|
|
||||||
|
- In case you find this corpus useful, please make a reference to the paper above and to the web page http://www.dt.fee.unicamp.br/~tiago/smsspamcollection/ in your papers, research, etc.
|
||||||
|
- Send us a message to tiago@dt.fee.unicamp.br in case you make use of the corpus.
|
||||||
|
|
||||||
|
The SMS Spam Collection v.1 is provided for free and with no limitations except the following:
|
||||||
|
|
||||||
|
1. Tiago Agostinho de Almeida and José María Gómez Hidalgo hold the copyright (c) for the SMS Spam Collection v.1.
|
||||||
|
|
||||||
|
2. No Warranty/Use At Your Risk. THE CORPUS IS MADE AT NO CHARGE. ACCORDINGLY, THE CORPUS IS PROVIDED `AS IS,' WITHOUT WARRANTY OF ANY KIND, INCLUDING WITHOUT LIMITATION THE WARRANTIES THAT THEY ARE MERCHANTABLE, FIT FOR A PARTICULAR PURPOSE OR NON-INFRINGING. YOU ARE SOLELY RESPONSIBLE FOR YOUR USE, DISTRIBUTION, MODIFICATION, REPRODUCTION AND PUBLICATION OF THE CORPUS AND ANY DERIVATIVE WORKS THEREOF BY YOU AND ANY OF YOUR SUBLICENSEES (COLLECTIVELY, `YOUR CORPUS USE'). THE ENTIRE RISK AS TO YOUR CORPUS USE IS BORNE BY YOU. YOU AGREE TO INDEMNIFY AND HOLD THE COPYRIGHT HOLDERS, AND THEIR AFFILIATES HARMLESS FROM ANY CLAIMS ARISING FROM OR RELATING TO YOUR CORPUS USE.
|
||||||
|
|
||||||
|
3. Limitation of Liability. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR THEIR AFFILIATES, OR THE CORPUS CONTRIBUTING EDITORS, BE LIABLE FOR ANY INDIRECT, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES, INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF GOODWILL OR ANY AND ALL OTHER COMMERCIAL DAMAGES OR LOSSES, EVEN IF ADVISED OF THE POSSIBILITY THEREOF, AND REGARDLESS OF WHETHER ANY CLAIM IS BASED UPON ANY CONTRACT, TORT OR OTHER LEGAL OR EQUITABLE THEORY, RELATING OR ARISING FROM THE CORPUS, YOUR CORPUS USE OR THIS LICENSE AGREEMENT.
|
||||||
5575
Praktikum Python Code/TextFiles/smsspamcollection.tsv
Normal file
File diff suppressed because it is too large
118
Praktikum Python Code/moviereviews/moviereviewsREADME.txt
Normal file
@ -0,0 +1,118 @@
|
|||||||
|
|
||||||
|
=======
|
||||||
|
|
||||||
|
Introduction
|
||||||
|
|
||||||
|
This README v2.0 (June, 2004) for the v2.0 polarity dataset comes from
|
||||||
|
the URL http://www.cs.cornell.edu/people/pabo/movie-review-data .
|
||||||
|
|
||||||
|
=======
|
||||||
|
|
||||||
|
What's New -- June, 2004
|
||||||
|
|
||||||
|
This dataset represents an enhancement of the review corpus v1.0
|
||||||
|
described in README v1.1: it contains more reviews, and labels were
|
||||||
|
created with an improved rating-extraction system.
|
||||||
|
|
||||||
|
=======
|
||||||
|
|
||||||
|
Citation Info
|
||||||
|
|
||||||
|
This data was first used in Bo Pang and Lillian Lee,
|
||||||
|
``A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization
|
||||||
|
Based on Minimum Cuts'', Proceedings of the ACL, 2004.
|
||||||
|
|
||||||
|
@InProceedings{Pang+Lee:04a,
|
||||||
|
author = {Bo Pang and Lillian Lee},
|
||||||
|
title = {A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization Based on Minimum Cuts},
|
||||||
|
booktitle = "Proceedings of the ACL",
|
||||||
|
year = 2004
|
||||||
|
}
|
||||||
|
|
||||||
|
=======
|
||||||
|
|
||||||
|
Data Format Summary
|
||||||
|
|
||||||
|
- review_polarity.tar.gz: contains this readme and data used in
|
||||||
|
the experiments described in Pang/Lee ACL 2004.
|
||||||
|
|
||||||
|
Specifically:
|
||||||
|
|
||||||
|
Within the folder "txt_sentoken" are the 2000 processed down-cased
|
||||||
|
text files used in Pang/Lee ACL 2004; the names of the two
|
||||||
|
subdirectories in that folder, "pos" and "neg", indicate the true
|
||||||
|
classification (sentiment) of the component files according to our
|
||||||
|
automatic rating classifier (see section "Rating Decision" below).
|
||||||
|
|
||||||
|
File names consist of a cross-validation tag plus the name of the
|
||||||
|
original html file. The ten folds used in the Pang/Lee ACL 2004 paper's
|
||||||
|
experiments were:
|
||||||
|
|
||||||
|
fold 1: files tagged cv000 through cv099, in numerical order
|
||||||
|
fold 2: files tagged cv100 through cv199, in numerical order
|
||||||
|
...
|
||||||
|
fold 10: files tagged cv900 through cv999, in numerical order
|
||||||
|
|
||||||
|
Hence, the file neg/cv114_19501.txt, for example, was labeled as
|
||||||
|
negative, served as a member of fold 2, and was extracted from the
|
||||||
|
file 19501.html in polarity_html.zip (see below).
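The fold assignment described above is encoded directly in each filename, so it can be recovered mechanically. The helper below is a hypothetical illustration (the function name and the regular expression are mine, not part of the dataset), mapping a name like cv114_19501.txt to fold 2 and source file 19501.html.

import re

def fold_and_source(filename):
    # Filenames look like cv114_19501.txt: cv-tag, underscore, original html id.
    m = re.match(r'cv(\d{3})_(\d+)\.txt$', filename)
    if m is None:
        raise ValueError(f"unexpected filename: {filename}")
    cv_tag, html_id = int(m.group(1)), m.group(2)
    fold = cv_tag // 100 + 1          # cv000-cv099 -> fold 1, cv100-cv199 -> fold 2, ...
    return fold, f"{html_id}.html"

print(fold_and_source("cv114_19501.txt"))   # (2, '19501.html')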
|
||||||
|
|
||||||
|
Each line in each text file corresponds to a single sentence, as
|
||||||
|
determined by Adwait Ratnaparkhi's sentence boundary detector
|
||||||
|
MXTERMINATOR.
|
||||||
|
|
||||||
|
Preliminary steps were taken to remove rating information from the
|
||||||
|
text files, but only the rating information upon which the rating
|
||||||
|
decision was based is guaranteed to have been removed. Thus, if the
|
||||||
|
original review contains several instances of rating information,
|
||||||
|
potentially given in different forms, those not recognized as valid
|
||||||
|
ratings remain part of the review text.
|
||||||
|
|
||||||
|
- polarity_html.zip: The original source files from which the
|
||||||
|
processed, labeled, and (randomly) selected data in
|
||||||
|
review_polarity.tar.gz was derived.
|
||||||
|
|
||||||
|
Specifically:
|
||||||
|
|
||||||
|
This data consists of unprocessed, unlabeled html files from the
|
||||||
|
IMDb archive of the rec.arts.movies.reviews newsgroup,
|
||||||
|
http://reviews.imdb.com/Reviews. The files in review_polarity.tar.gz
|
||||||
|
represent a processed subset of these files.
|
||||||
|
|
||||||
|
=======
|
||||||
|
|
||||||
|
Rating Decision (Appendix A)
|
||||||
|
|
||||||
|
This section describes how we determined whether a review was positive
|
||||||
|
or negative.
|
||||||
|
|
||||||
|
The original html files do not have consistent formats -- a review may
|
||||||
|
not have the author's rating with it, and when it does, the rating can
|
||||||
|
appear at different places in the file in different forms. We only
|
||||||
|
recognize some of the more explicit ratings, which are extracted via a
|
||||||
|
set of ad-hoc rules. In essence, a file's classification is determined
|
||||||
|
based on the first rating we were able to identify.
|
||||||
|
|
||||||
|
|
||||||
|
- In order to obtain more accurate rating decisions, the maximum
|
||||||
|
rating must be specified explicitly, both for numerical ratings
|
||||||
|
and star ratings. ("8/10", "four out of five", and "OUT OF
|
||||||
|
****: ***" are examples of rating indications we recognize.)
|
||||||
|
|
||||||
|
- With a five-star system (or compatible number systems):
|
||||||
|
three-and-a-half stars and up are considered positive,
|
||||||
|
two stars and below are considered negative.
|
||||||
|
- With a four-star system (or compatible number system):
|
||||||
|
three stars and up are considered positive,
|
||||||
|
one-and-a-half stars and below are considered negative.
|
||||||
|
- With a letter grade system:
|
||||||
|
B or above is considered positive,
|
||||||
|
C- or below is considered negative.
|
||||||
|
|
||||||
|
We attempted to recognize half stars, but they are specified in an
|
||||||
|
especially free way, which makes them difficult to recognize. Hence,
|
||||||
|
we may lose a half star very occasionally; but this only results in 2.5
|
||||||
|
stars in a five-star system being categorized as negative, which is
|
||||||
|
still reasonable.
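A minimal sketch of the rating thresholds described above, written as a plain Python function; the name classify_rating and its input conventions (a numeric value plus the maximum of its scale, or a letter grade) are assumptions for illustration, not the authors' original ad-hoc extraction rules. Ratings that fall between the positive and negative cut-offs return None, i.e. the review would be left unlabeled.

def classify_rating(value=None, max_stars=None, letter=None):
    """Map an explicit rating to 'pos', 'neg', or None (left unlabeled)."""
    if letter is not None:
        # Letter grades: B or above is positive, C- or below is negative.
        if letter in {'A+', 'A', 'A-', 'B+', 'B'}:
            return 'pos'
        if letter in {'C-', 'D+', 'D', 'D-', 'F'}:
            return 'neg'
        return None
    if max_stars == 5:                     # five-star (or rescaled compatible) systems
        return 'pos' if value >= 3.5 else 'neg' if value <= 2.0 else None
    if max_stars == 4:                     # four-star (or rescaled compatible) systems
        return 'pos' if value >= 3.0 else 'neg' if value <= 1.5 else None
    return None

print(classify_rating(value=3.5, max_stars=5))    # 'pos'
print(classify_rating(letter='C-'))               # 'neg'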
|
||||||
|
|
||||||
|
|
||||||
13
Praktikum Python Code/moviereviews/neg/cv001_19502.txt
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
the happy bastard's quick movie review
|
||||||
|
damn that y2k bug .
|
||||||
|
it's got a head start in this movie starring jamie lee curtis and another baldwin brother ( william this time ) in a story regarding a crew of a tugboat that comes across a deserted russian tech ship that has a strangeness to it when they kick the power back on .
|
||||||
|
little do they know the power within . . .
|
||||||
|
going for the gore and bringing on a few action sequences here and there , virus still feels very empty , like a movie going for all flash and no substance .
|
||||||
|
we don't know why the crew was really out in the middle of nowhere , we don't know the origin of what took over the ship ( just that a big pink flashy thing hit the mir ) , and , of course , we don't know why donald sutherland is stumbling around drunkenly throughout .
|
||||||
|
here , it's just " hey , let's chase these people around with some robots " .
|
||||||
|
the acting is below average , even from the likes of curtis .
|
||||||
|
you're more likely to get a kick out of her work in halloween h20 .
|
||||||
|
sutherland is wasted and baldwin , well , he's acting like a baldwin , of course .
|
||||||
|
the real star here are stan winston's robot design , some schnazzy cgi , and the occasional good gore shot , like picking into someone's brain .
|
||||||
|
so , if robots and body parts really turn you on , here's your movie .
|
||||||
|
otherwise , it's pretty much a sunken ship of a movie .
|
||||||
23
Praktikum Python Code/moviereviews/neg/cv002_17424.txt
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
it is movies like these that make a jaded movie viewer thankful for the invention of the timex indiglo watch .
|
||||||
|
based on the late 1960's television show by the same name , the mod squad tells the tale of three reformed criminals under the employ of the police to go undercover .
|
||||||
|
however , things go wrong as evidence gets stolen and they are immediately under suspicion .
|
||||||
|
of course , the ads make it seem like so much more .
|
||||||
|
quick cuts , cool music , claire dane's nice hair and cute outfits , car chases , stuff blowing up , and the like .
|
||||||
|
sounds like a cool movie , does it not ?
|
||||||
|
after the first fifteen minutes , it quickly becomes apparent that it is not .
|
||||||
|
the mod squad is certainly a slick looking production , complete with nice hair and costumes , but that simply isn't enough .
|
||||||
|
the film is best described as a cross between an hour-long cop show and a music video , both stretched out into the span of an hour and a half .
|
||||||
|
and with it comes every single clich ? .
|
||||||
|
it doesn't really matter that the film is based on a television show , as most of the plot elements have been recycled from everything we've already seen .
|
||||||
|
the characters and acting is nothing spectacular , sometimes even bordering on wooden .
|
||||||
|
claire danes and omar epps deliver their lines as if they are bored , which really transfers onto the audience .
|
||||||
|
the only one to escape relatively unscathed is giovanni ribisi , who plays the resident crazy man , ultimately being the only thing worth watching .
|
||||||
|
unfortunately , even he's not enough to save this convoluted mess , as all the characters don't do much apart from occupying screen time .
|
||||||
|
with the young cast , cool clothes , nice hair , and hip soundtrack , it appears that the film is geared towards the teenage mindset .
|
||||||
|
despite an american 'r' rating ( which the content does not justify ) , the film is way too juvenile for the older mindset .
|
||||||
|
information on the characters is literally spoon-fed to the audience ( would it be that hard to show us instead of telling us ? ) , dialogue is poorly written , and the plot is extremely predictable .
|
||||||
|
the way the film progresses , you likely won't even care if the heroes are in any jeopardy , because you'll know they aren't .
|
||||||
|
basing the show on a 1960's television show that nobody remembers is of questionable wisdom , especially when one considers the target audience and the fact that the number of memorable films based on television shows can be counted on one hand ( even one that's missing a finger or two ) .
|
||||||
|
the number of times that i checked my watch ( six ) is a clear indication that this film is not one of them .
|
||||||
|
it is clear that the film is nothing more than an attempt to cash in on the teenage spending dollar , judging from the rash of really awful teen-flicks that we've been seeing as of late .
|
||||||
|
avoid this film at all costs .
|
||||||
Some files were not shown because too many files have changed in this diff.