565 lines
20 KiB
Plaintext
565 lines
20 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Possible Approach\n",
|
|
"\n",
|
|
"## Hypothesis: does adding the Pos, Neg, and Neu values from sentiment analysis improve the original model?"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 24,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>label</th>\n",
|
|
" <th>review</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>neg</td>\n",
|
|
" <td>how do films like mouse hunt get into theatres...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>neg</td>\n",
|
|
" <td>some talented actresses are blessed with a dem...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>pos</td>\n",
|
|
" <td>this has been an extraordinary year for austra...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>pos</td>\n",
|
|
" <td>according to hollywood movies made in last few...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>neg</td>\n",
|
|
" <td>my first press screening of 1998 and already i...</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" label review\n",
|
|
"0 neg how do films like mouse hunt get into theatres...\n",
|
|
"1 neg some talented actresses are blessed with a dem...\n",
|
|
"2 pos this has been an extraordinary year for austra...\n",
|
|
"3 pos according to hollywood movies made in last few...\n",
|
|
"4 neg my first press screening of 1998 and already i..."
|
|
]
|
|
},
|
|
"execution_count": 24,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"import numpy as np\n",
|
|
"import pandas as pd\n",
|
|
"\n",
|
|
"df = pd.read_csv('../TextFiles/moviereviews.tsv', sep='\\t')\n",
|
|
"df.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 25,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# REMOVE NaN VALUES AND EMPTY STRINGS:\n",
|
|
"df.dropna(inplace=True)\n",
|
|
"\n",
|
|
"# Collect the index labels of reviews that are empty or whitespace-only.\n",
"# Iterating over the 'review' column by name (instead of unpacking\n",
"# df.itertuples() into exactly three values) keeps this cell re-runnable after\n",
"# later cells add columns to df; `not rv.strip()` also catches '', for which\n",
"# str.isspace() returns False. Non-string values are skipped, as before.\n",
"blanks = [i for i, rv in df['review'].items()\n",
"          if isinstance(rv, str) and not rv.strip()]\n",
|
|
"\n",
|
|
"df.drop(blanks, inplace=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 26,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
|
|
"\n",
|
|
"sid = SentimentIntensityAnalyzer()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 27,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>label</th>\n",
|
|
" <th>review</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>neg</td>\n",
|
|
" <td>how do films like mouse hunt get into theatres...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>neg</td>\n",
|
|
" <td>some talented actresses are blessed with a dem...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>pos</td>\n",
|
|
" <td>this has been an extraordinary year for austra...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>pos</td>\n",
|
|
" <td>according to hollywood movies made in last few...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>neg</td>\n",
|
|
" <td>my first press screening of 1998 and already i...</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" label review\n",
|
|
"0 neg how do films like mouse hunt get into theatres...\n",
|
|
"1 neg some talented actresses are blessed with a dem...\n",
|
|
"2 pos this has been an extraordinary year for austra...\n",
|
|
"3 pos according to hollywood movies made in last few...\n",
|
|
"4 neg my first press screening of 1998 and already i..."
|
|
]
|
|
},
|
|
"execution_count": 27,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"df.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 28,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Run the VADER analyzer over every review; each entry of 'scores' is the\n",
"# value returned by sid.polarity_scores() for that review.\n",
"df['scores'] = df['review'].apply(sid.polarity_scores)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 29,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>label</th>\n",
|
|
" <th>review</th>\n",
|
|
" <th>scores</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>neg</td>\n",
|
|
" <td>how do films like mouse hunt get into theatres...</td>\n",
|
|
" <td>{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>neg</td>\n",
|
|
" <td>some talented actresses are blessed with a dem...</td>\n",
|
|
" <td>{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>pos</td>\n",
|
|
" <td>this has been an extraordinary year for austra...</td>\n",
|
|
" <td>{'neg': 0.067, 'neu': 0.783, 'pos': 0.15, 'com...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>pos</td>\n",
|
|
" <td>according to hollywood movies made in last few...</td>\n",
|
|
" <td>{'neg': 0.069, 'neu': 0.786, 'pos': 0.145, 'co...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>neg</td>\n",
|
|
" <td>my first press screening of 1998 and already i...</td>\n",
|
|
" <td>{'neg': 0.09, 'neu': 0.822, 'pos': 0.088, 'com...</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" label review \\\n",
|
|
"0 neg how do films like mouse hunt get into theatres... \n",
|
|
"1 neg some talented actresses are blessed with a dem... \n",
|
|
"2 pos this has been an extraordinary year for austra... \n",
|
|
"3 pos according to hollywood movies made in last few... \n",
|
|
"4 neg my first press screening of 1998 and already i... \n",
|
|
"\n",
|
|
" scores \n",
|
|
"0 {'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co... \n",
|
|
"1 {'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com... \n",
|
|
"2 {'neg': 0.067, 'neu': 0.783, 'pos': 0.15, 'com... \n",
|
|
"3 {'neg': 0.069, 'neu': 0.786, 'pos': 0.145, 'co... \n",
|
|
"4 {'neg': 0.09, 'neu': 0.822, 'pos': 0.088, 'com... "
|
|
]
|
|
},
|
|
"execution_count": 29,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"df.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 30,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Expand the per-review score dicts into one column per score key.\n",
"# Pairs are (new column name, key in the score dict), in creation order.\n",
"for column, key in (('positive', 'pos'), ('negative', 'neg'),\n",
"                    ('neutral', 'neu'), ('compound', 'compound')):\n",
"    # key=key binds the current key to the lambda at definition time\n",
"    df[column] = df['scores'].apply(lambda score_dict, key=key: score_dict[key])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 31,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>label</th>\n",
|
|
" <th>review</th>\n",
|
|
" <th>scores</th>\n",
|
|
" <th>positive</th>\n",
|
|
" <th>negative</th>\n",
|
|
" <th>neutral</th>\n",
|
|
" <th>compound</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>neg</td>\n",
|
|
" <td>how do films like mouse hunt get into theatres...</td>\n",
|
|
" <td>{'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co...</td>\n",
|
|
" <td>0.101</td>\n",
|
|
" <td>0.121</td>\n",
|
|
" <td>0.778</td>\n",
|
|
" <td>-0.9125</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>neg</td>\n",
|
|
" <td>some talented actresses are blessed with a dem...</td>\n",
|
|
" <td>{'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com...</td>\n",
|
|
" <td>0.105</td>\n",
|
|
" <td>0.120</td>\n",
|
|
" <td>0.775</td>\n",
|
|
" <td>-0.8618</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>pos</td>\n",
|
|
" <td>this has been an extraordinary year for austra...</td>\n",
|
|
" <td>{'neg': 0.067, 'neu': 0.783, 'pos': 0.15, 'com...</td>\n",
|
|
" <td>0.150</td>\n",
|
|
" <td>0.067</td>\n",
|
|
" <td>0.783</td>\n",
|
|
" <td>0.9953</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>pos</td>\n",
|
|
" <td>according to hollywood movies made in last few...</td>\n",
|
|
" <td>{'neg': 0.069, 'neu': 0.786, 'pos': 0.145, 'co...</td>\n",
|
|
" <td>0.145</td>\n",
|
|
" <td>0.069</td>\n",
|
|
" <td>0.786</td>\n",
|
|
" <td>0.9972</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>neg</td>\n",
|
|
" <td>my first press screening of 1998 and already i...</td>\n",
|
|
" <td>{'neg': 0.09, 'neu': 0.822, 'pos': 0.088, 'com...</td>\n",
|
|
" <td>0.088</td>\n",
|
|
" <td>0.090</td>\n",
|
|
" <td>0.822</td>\n",
|
|
" <td>-0.7264</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" label review \\\n",
|
|
"0 neg how do films like mouse hunt get into theatres... \n",
|
|
"1 neg some talented actresses are blessed with a dem... \n",
|
|
"2 pos this has been an extraordinary year for austra... \n",
|
|
"3 pos according to hollywood movies made in last few... \n",
|
|
"4 neg my first press screening of 1998 and already i... \n",
|
|
"\n",
|
|
" scores positive negative \\\n",
|
|
"0 {'neg': 0.121, 'neu': 0.778, 'pos': 0.101, 'co... 0.101 0.121 \n",
|
|
"1 {'neg': 0.12, 'neu': 0.775, 'pos': 0.105, 'com... 0.105 0.120 \n",
|
|
"2 {'neg': 0.067, 'neu': 0.783, 'pos': 0.15, 'com... 0.150 0.067 \n",
|
|
"3 {'neg': 0.069, 'neu': 0.786, 'pos': 0.145, 'co... 0.145 0.069 \n",
|
|
"4 {'neg': 0.09, 'neu': 0.822, 'pos': 0.088, 'com... 0.088 0.090 \n",
|
|
"\n",
|
|
" neutral compound \n",
|
|
"0 0.778 -0.9125 \n",
|
|
"1 0.775 -0.8618 \n",
|
|
"2 0.783 0.9953 \n",
|
|
"3 0.786 0.9972 \n",
|
|
"4 0.822 -0.7264 "
|
|
]
|
|
},
|
|
"execution_count": 31,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"df.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 33,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"here's a rarity : a children's film that attempts to tackle a weighty subject , is there a god ? \r\n",
|
|
"done well , it could have been a gem among the wasteland of modern children's cinema . \r\n",
|
|
"unfortunately , it isn't . \r\n",
|
|
"with jumbled messages , and an unclear audience , wide awake was better left asleep . \r\n",
|
|
"fifth grader joshua beal ( joseph cross ) is in the middle of a moral crisis . \r\n",
|
|
"his beloved grandfather ( robert loggia ) has died , and joshua has begun a quest . \r\n",
|
|
"he wants to find god , to discover why bad things happen . \r\n",
|
|
"this religious quest is slightly disturbing for his parents ( dana delany and denis leary ) , but they do their best to cope with their son as he explores different religious faiths . \r\n",
|
|
"at his catholic school , his favorite teacher , sister terry ( rosie o'donnell ) , tries to give him guidance , but this is a journey he must make on his own . \r\n",
|
|
"meanwhile , he is having the most momentous year of his life . \r\n",
|
|
"he has several adventures with his daredevil best friend dave ( timothy reifsnyder ) , he gets his first crush , and begins to wake up to the world around him while he is on his spiritual journey . \r\n",
|
|
"it is somewhat confusing as to what the real audience for wide awake is expected to be . \r\n",
|
|
"on its surface , it appears to be a kid's film . \r\n",
|
|
"however , it deals with serious issues , and is likely to be boring for today's instant-gratification kids . \r\n",
|
|
"and while it might seem heartening to see that someone is trying to produce something thoughtful for the kidvid audience , wide awake asks serious questions , but only delivers a cheap gimmick for an answer . \r\n",
|
|
"if there were a bit more meat in the story , adults on a nostalgic bent might get a kick out of the movie . \r\n",
|
|
"the actors who might have created a great cast ( o'donnell , leary and delany ) are wasted in roles that amount to little more than cameos . \r\n",
|
|
"the nostalgic elements ( best friend , favorite teacher , first crush , etc . ) have been done much better in other movies , and actually seem more like filler here . \r\n",
|
|
"the film's strongest scenes are some touching flashbacks depicting joshua's relationship with his grandfather . \r\n",
|
|
"they show more depth than is present anywhere else in the movie . \r\n",
|
|
"maybe the film would have been better if , instead of playing the relationship through flashbacks , it were set entirely during joshua's last year with his grandpa . \r\n",
|
|
"it certainly would have been more entertaining . \r\n",
|
|
"wide awake can best be described as a failed experiment . \r\n",
|
|
"it starts out with noble aspirations , but never delivers on its promise . \r\n",
|
|
"parents who do take their children to see this one ought to be prepared to answer some tough questions . . . that is if their kids aren't bored to death first . \r\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(df.iloc[15]['review'])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"metadata": {
|
|
"collapsed": true
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"from sklearn.metrics import accuracy_score,classification_report,confusion_matrix"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"0.6367389060887513"
|
|
]
|
|
},
|
|
"execution_count": 17,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# No visible earlier cell creates 'comp_score', so derive it here from the\n",
"# VADER compound score before comparing against the true labels.\n",
"# NOTE(review): the >= 0 cut-off for 'pos' is an assumption - confirm it\n",
"# reproduces the accuracy recorded in this cell's saved output.\n",
"df['comp_score'] = df['compound'].apply(lambda c: 'pos' if c >= 0 else 'neg')\n",
"\n",
"accuracy_score(df['label'],df['comp_score'])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 18,
|
|
"metadata": {
|
|
"scrolled": true
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
" precision recall f1-score support\n",
|
|
"\n",
|
|
" neg 0.72 0.44 0.55 969\n",
|
|
" pos 0.60 0.83 0.70 969\n",
|
|
"\n",
|
|
" micro avg 0.64 0.64 0.64 1938\n",
|
|
" macro avg 0.66 0.64 0.62 1938\n",
|
|
"weighted avg 0.66 0.64 0.62 1938\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(classification_report(df['label'],df['comp_score']))"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.6.7"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|