{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "___\n", "\n", " \n", "___\n", "# Text Generation with Neural Networks" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Functions for Processing Text\n", "\n", "### Reading in files as a string text" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def read_file(filepath):\n", " \n", " with open(filepath) as f:\n", " str_text = f.read()\n", " \n", " return str_text" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "read_file('moby_dick_four_chapters.txt')" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "### Tokenize and Clean Text" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import spacy\n", "nlp = spacy.load('en_core_web_sm',disable=['parser', 'tagger','ner'])\n", "\n", "nlp.max_length = 1198623" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def separate_punc(doc_text):\n", " return [token.text.lower() for token in nlp(doc_text) if token.text not in '\\n\\n \\n\\n\\n!\"-#$%&()--.*+,-/:;<=>?@[\\\\]^_`{|}~\\t\\n ']" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "d = read_file('moby_dick_four_chapters.txt')\n", "tokens = separate_punc(d)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "tokens" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "len(tokens)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "4431/25" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create Sequences of Tokens" ] }, { "cell_type": "code", 
"execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# organize into sequences of tokens\n", "train_len = 25+1 # 25 training words , then one target word\n", "\n", "# Empty list of sequences\n", "text_sequences = []\n", "\n", "for i in range(train_len, len(tokens)):\n", " \n", " # Grab train_len amount of tokens\n", " seq = tokens[i-train_len:i]\n", " \n", " # Add to list of sequences\n", " text_sequences.append(seq)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "' '.join(text_sequences[0])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "' '.join(text_sequences[1])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "' '.join(text_sequences[2])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "len(text_sequences)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Keras" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Keras Tokenization" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from keras.preprocessing.text import Tokenizer" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# integer encode sequences of words\n", "tokenizer = Tokenizer()\n", "tokenizer.fit_on_texts(text_sequences)\n", "sequences = tokenizer.texts_to_sequences(text_sequences)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "sequences[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "tokenizer.index_word" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, 
"outputs": [], "source": [ "for i in sequences[0]:\n", " print(f'{i} : {tokenizer.index_word[i]}')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "tokenizer.word_counts" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "vocabulary_size = len(tokenizer.word_counts)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Convert to Numpy Matrix" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import numpy as np" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "sequences = np.array(sequences)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "sequences" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Creating an LSTM based model" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import keras\n", "from keras.models import Sequential\n", "from keras.layers import Dense,LSTM,Embedding" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def create_model(vocabulary_size, seq_len):\n", " model = Sequential()\n", " model.add(Embedding(vocabulary_size, 25, input_length=seq_len))\n", " model.add(LSTM(150, return_sequences=True))\n", " model.add(LSTM(150))\n", " model.add(Dense(150, activation='relu'))\n", "\n", " model.add(Dense(vocabulary_size, activation='softmax'))\n", " \n", " model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n", " \n", " model.summary()\n", " \n", " return model" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Train / Test Split" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, 
"outputs": [], "source": [ "from keras.utils import to_categorical" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "sequences" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# First 25 words\n", "sequences[:,:-1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# last Word\n", "sequences[:,-1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "X = sequences[:,:-1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "y = sequences[:,-1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "y = to_categorical(y, num_classes=vocabulary_size+1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "seq_len = X.shape[1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "seq_len" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Training the Model" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# define model\n", "model = create_model(vocabulary_size+1, seq_len)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "---\n", "\n", "----" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from pickle import dump,load" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# fit model\n", "model.fit(X, y, batch_size=128, epochs=300,verbose=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true, "scrolled": true 
}, "outputs": [], "source": [ "# save the model to file\n", "model.save('epochBIG.h5')\n", "# save the tokenizer\n", "dump(tokenizer, open('epochBIG', 'wb'))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Generating New Text" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from random import randint\n", "from pickle import load\n", "from keras.models import load_model\n", "from keras.preprocessing.sequence import pad_sequences" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):\n", " '''\n", " INPUTS:\n", " model : model that was trained on text data\n", " tokenizer : tokenizer that was fit on text data\n", " seq_len : length of training sequence\n", " seed_text : raw string text to serve as the seed\n", " num_gen_words : number of words to be generated by model\n", " '''\n", " \n", " # Final Output\n", " output_text = []\n", " \n", " # Initial Seed Sequence\n", " input_text = seed_text\n", " \n", " # Create num_gen_words\n", " for i in range(num_gen_words):\n", " \n", " # Take the input text string and encode it to a sequence\n", " encoded_text = tokenizer.texts_to_sequences([input_text])[0]\n", " \n", " # Pad sequences to our trained rate (25 words in this notebook)\n", " pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')\n", " \n", " # Predict Class Probabilities for each word\n", " pred_word_ind = np.argmax(model.predict(pad_encoded, verbose=0), axis=-1)[0]\n", " \n", " # Grab word\n", " pred_word = tokenizer.index_word[pred_word_ind] \n", " \n", " # Update the sequence of input text (shifting one over with the new word)\n", " input_text += ' ' + pred_word\n", " \n", " output_text.append(pred_word)\n", " \n", " # Make it look like a sentence.\n", " return ' '.join(output_text)" ] }, { "cell_type": "markdown", "metadata": {}, 
"source": [ "### Grab a random seed sequence" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "text_sequences[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import random\n", "random.seed(101)\n", "random_pick = random.randint(0,len(text_sequences)-1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "random_seed_text = text_sequences[random_pick]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "random_seed_text" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "seed_text = ' '.join(random_seed_text)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "seed_text" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=50)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Exploring Generated Sequence" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "full_text = read_file('moby_dick_four_chapters.txt')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ "for i,word in enumerate(full_text.split()):\n", " if word == 'inkling':\n", " print(' '.join(full_text.split()[i-20:i+20]))\n", " print('\\n')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Great Job!" 
] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" } }, "nbformat": 4, "nbformat_minor": 2 }