materi-praktikum/Praktikum Python Code/06-Deep-Learning/01-Text-Generation-with-Neural-Networks.ipynb

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"___\n",
"\n",
"<a href='http://www.pieriandata.com'> <img src='../Pierian_Data_Logo.png' /></a>\n",
"___\n",
"# Text Generation with Neural Networks"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Functions for Processing Text\n",
"\n",
"### Reading in files as a string text"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def read_file(filepath):\n",
" \n",
" with open(filepath) as f:\n",
" str_text = f.read()\n",
" \n",
" return str_text"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"read_file('moby_dick_four_chapters.txt')"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"### Tokenize and Clean Text"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import spacy\n",
"nlp = spacy.load('en',disable=['parser', 'tagger','ner'])\n",
"\n",
"nlp.max_length = 1198623"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def separate_punc(doc_text):\n",
" return [token.text.lower() for token in nlp(doc_text) if token.text not in '\\n\\n \\n\\n\\n!\"-#$%&()--.*+,-/:;<=>?@[\\\\]^_`{|}~\\t\\n ']"
]
},
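{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check of the cleaning step (the sample string below is just an illustrative stand-in, not from the lesson): punctuation tokens are dropped and every word is lowercased."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Expected output (may vary slightly by spaCy version): ['call', 'me', 'ishmael']\n",
"separate_punc('Call me Ishmael -- !')"
]
},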
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"d = read_file('melville-moby_dick.txt')\n",
"tokens = separate_punc(d)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"tokens"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"len(tokens)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"4431/25"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create Sequences of Tokens"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# organize into sequences of tokens\n",
"train_len = 25+1 # 50 training words , then one target word\n",
"\n",
"# Empty list of sequences\n",
"text_sequences = []\n",
"\n",
"for i in range(train_len, len(tokens)):\n",
" \n",
" # Grab train_len# amount of characters\n",
" seq = tokens[i-train_len:i]\n",
" \n",
" # Add to list of sequences\n",
" text_sequences.append(seq)"
]
},
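{
"cell_type": "markdown",
"metadata": {},
"source": [
"Each window slides forward by exactly one token, so consecutive sequences overlap in all but one position. A small sanity cell (not part of the original lesson) makes that concrete:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# The last 25 tokens of window 0 should equal the first 25 tokens of window 1\n",
"text_sequences[0][1:] == text_sequences[1][:-1]"
]
},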
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"' '.join(text_sequences[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"' '.join(text_sequences[1])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"' '.join(text_sequences[2])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"len(text_sequences)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Keras"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Keras Tokenization"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from keras.preprocessing.text import Tokenizer"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# integer encode sequences of words\n",
"tokenizer = Tokenizer()\n",
"tokenizer.fit_on_texts(text_sequences)\n",
"sequences = tokenizer.texts_to_sequences(text_sequences)"
]
},
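{
"cell_type": "markdown",
"metadata": {},
"source": [
"The encoding is reversible: `sequences_to_texts` maps the integer IDs back to words, which is a handy round-trip check (this cell is an addition, not from the original lesson):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Decode the first encoded sequence back into words\n",
"tokenizer.sequences_to_texts(sequences[:1])"
]
},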
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"sequences[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"tokenizer.index_word"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"for i in sequences[0]:\n",
" print(f'{i} : {tokenizer.index_word[i]}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"tokenizer.word_counts"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"vocabulary_size = len(tokenizer.word_counts)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Convert to Numpy Matrix"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"sequences = np.array(sequences)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"sequences"
]
},
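{
"cell_type": "markdown",
"metadata": {},
"source": [
"Each row is one training window, so the matrix should have shape `(number of sequences, 26)`: 25 input words plus 1 target word."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# (num_sequences, train_len)\n",
"sequences.shape"
]
},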
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Creating an LSTM based model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import keras\n",
"from keras.models import Sequential\n",
"from keras.layers import Dense,LSTM,Embedding"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def create_model(vocabulary_size, seq_len):\n",
" model = Sequential()\n",
" model.add(Embedding(vocabulary_size, 25, input_length=seq_len))\n",
" model.add(LSTM(150, return_sequences=True))\n",
" model.add(LSTM(150))\n",
" model.add(Dense(150, activation='relu'))\n",
"\n",
" model.add(Dense(vocabulary_size, activation='softmax'))\n",
" \n",
" model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
" \n",
" model.summary()\n",
" \n",
" return model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Train / Test Split"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from keras.utils import to_categorical"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"sequences"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# First 49 words\n",
"sequences[:,:-1]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# last Word\n",
"sequences[:,-1]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"X = sequences[:,:-1]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"y = sequences[:,-1]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"y = to_categorical(y, num_classes=vocabulary_size+1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"seq_len = X.shape[1]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"seq_len"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Training the Model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# define model\n",
"model = create_model(vocabulary_size+1, seq_len)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"---\n",
"\n",
"----"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from pickle import dump,load"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# fit model\n",
"model.fit(X, y, batch_size=128, epochs=300,verbose=1)"
]
},
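{
"cell_type": "markdown",
"metadata": {},
"source": [
"Three hundred epochs is a long run. If you want to guard against interruptions, Keras's `ModelCheckpoint` callback can save weights as training progresses. A minimal optional sketch (the filename `epoch_checkpoint.h5` is just an example; this cell is not part of the original lesson):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Optional: keep the best weights (by training loss) on disk during the long run\n",
"from keras.callbacks import ModelCheckpoint\n",
"\n",
"checkpoint = ModelCheckpoint('epoch_checkpoint.h5', monitor='loss', save_best_only=True)\n",
"# model.fit(X, y, batch_size=128, epochs=300, verbose=1, callbacks=[checkpoint])"
]
},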
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"scrolled": true
},
"outputs": [],
"source": [
"# save the model to file\n",
"model.save('epochBIG.h5')\n",
"# save the tokenizer\n",
"dump(tokenizer, open('epochBIG', 'wb'))"
]
},
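{
"cell_type": "markdown",
"metadata": {},
"source": [
"To pick the experiment back up later, reload both artifacts from the same files saved above:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Reload the trained model and the fitted tokenizer from disk\n",
"from keras.models import load_model\n",
"from pickle import load\n",
"\n",
"model = load_model('epochBIG.h5')\n",
"with open('epochBIG', 'rb') as f:\n",
"    tokenizer = load(f)"
]
},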
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Generating New Text"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from random import randint\n",
"from pickle import load\n",
"from keras.models import load_model\n",
"from keras.preprocessing.sequence import pad_sequences"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):\n",
" '''\n",
" INPUTS:\n",
" model : model that was trained on text data\n",
" tokenizer : tokenizer that was fit on text data\n",
" seq_len : length of training sequence\n",
" seed_text : raw string text to serve as the seed\n",
" num_gen_words : number of words to be generated by model\n",
" '''\n",
" \n",
" # Final Output\n",
" output_text = []\n",
" \n",
" # Intial Seed Sequence\n",
" input_text = seed_text\n",
" \n",
" # Create num_gen_words\n",
" for i in range(num_gen_words):\n",
" \n",
" # Take the input text string and encode it to a sequence\n",
" encoded_text = tokenizer.texts_to_sequences([input_text])[0]\n",
" \n",
" # Pad sequences to our trained rate (50 words in the video)\n",
" pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')\n",
" \n",
" # Predict Class Probabilities for each word\n",
" pred_word_ind = model.predict_classes(pad_encoded, verbose=0)[0]\n",
" \n",
" # Grab word\n",
" pred_word = tokenizer.index_word[pred_word_ind] \n",
" \n",
" # Update the sequence of input text (shifting one over with the new word)\n",
" input_text += ' ' + pred_word\n",
" \n",
" output_text.append(pred_word)\n",
" \n",
" # Make it look like a sentence.\n",
" return ' '.join(output_text)"
]
},
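{
"cell_type": "markdown",
"metadata": {},
"source": [
"`generate_text` always takes the single most probable word, which tends to loop on common phrases. A common variation (not part of the original lesson) is temperature sampling: rescale the softmax output and draw the next word at random from it. A minimal sketch, assuming the model outputs one probability per vocabulary index:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"def sample_with_temperature(preds, temperature=1.0):\n",
"    # temperature < 1 sharpens the distribution, > 1 flattens it\n",
"    preds = np.asarray(preds).astype('float64')\n",
"    preds = np.log(preds + 1e-10) / temperature\n",
"    exp_preds = np.exp(preds)\n",
"    preds = exp_preds / np.sum(exp_preds)\n",
"    # note: index 0 is the padding slot; if it is ever drawn, resample or skip it\n",
"    return np.random.choice(len(preds), p=preds)\n",
"\n",
"# Example swap for the argmax line inside generate_text:\n",
"# pred_word_ind = sample_with_temperature(model.predict(pad_encoded, verbose=0)[0], temperature=0.8)"
]
},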
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Grab a random seed sequence"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"text_sequences[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import random\n",
"random.seed(101)\n",
"random_pick = random.randint(0,len(text_sequences))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"random_seed_text = text_sequences[random_pick]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"random_seed_text"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"seed_text = ' '.join(random_seed_text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"seed_text"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=50)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Exploring Generated Sequence"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"full_text = read_file('moby_dick_four_chapters.txt')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"for i,word in enumerate(full_text.split()):\n",
" if word == 'inkling':\n",
" print(' '.join(full_text.split()[i-20:i+20]))\n",
" print('\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Great Job!"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}