Natural Language Processing (NLP)_Task_Jupyter_Notebook

profileCjvanaarde
L3T20.ipynb

{ "cells": [ { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "eHGm8RQlK46m" }, "source": [ "# Get and explore dataset" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "-erlYw1jXtgh" }, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "from matplotlib import pyplot\n", "import numpy as np\n", "import re" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "goZIhX8nII4T" }, "outputs": [], "source": [ "# Make results reproducible - set random seed\n", "from numpy.random import seed\n", "seed(42)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "v_fEysQHII4W", "tags": [] }, "outputs": [], "source": [ "negative_file = \"negative.txt\"\n", "positive_file = \"positive.txt\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "0rWm4yycII4Y", "tags": [] }, "outputs": [], "source": [ "# Do not modify - helper function to load and preprocess data\n", "def filter_words(line):\n", "    \"\"\"Remove punctuation, lowercase the words and join them with single spaces.\"\"\"\n", "    line = re.sub(r'[^\\w\\s]','',line.rstrip())\n", "    words = line.split(\" \")\n", "    words = [i.lower() for i in words if i]\n", "    return \" \".join(words)\n", "\n", "def load_data(filename):\n", "    \"\"\"Return the cleaned lines that directly follow a <title> marker line in filename.\n", "\n", "    Only cleaned lines shorter than 50 characters are kept.\n", "    \"\"\"\n", "    # The with-block closes the file even if reading raises\n", "    with open(filename, 'r') as thefile:\n", "        lines = thefile.readlines()\n", "\n", "    data = []\n", "    # Start at 1 so lines[l-1] is always the previous line; at l=0 it would wrap around to the last line\n", "    for l in range(1, len(lines)):\n", "        if(lines[l-1].strip() == \"<title>\"):\n", "            theline = filter_words(lines[l])\n", "            if(len(theline) < 50):\n", "                data.append(theline)\n", "\n", "    return data\n", "\n", "# Helper function to convert categorical data to class label\n", "def to_word_label(y):\n", "    \"\"\"Map categorical (one-hot or score) rows to 'positive' (class 0) or 'negative'.\"\"\"\n", "    y = to_class(y)\n", "    return [\"positive\" if i==0 else \"negative\" for i in y]\n", "\n", "# Helper function to convert class label to numeric label\n", "def to_numeric_label(y):\n", "    \"\"\"Map word labels to numbers: 0 for 'positive', 1 for anything else.\"\"\"\n", "    # Iterate over the argument y, not the notebook-level word_labels list\n", "    return [0 if i==\"positive\" else 1 for i in y]\n", 
"\n", "# Helper function: this function needs to be called before sending arrays to sklearn metrics,\n", "# it converts back to class form from categorical form. ie: [1,0] --> 0, [0,1] --> 1\n", "def to_class(y):\n", "    \"\"\"Return the index of the largest entry in each row of y.\"\"\"\n", "    return np.argmax(y,axis=1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 70 }, "colab_type": "code", "id": "jjGUiFm9II4b", "outputId": "aa5eb89b-85f2-4d61-f33a-b00361b19a41", "tags": [] }, "outputs": [], "source": [ "positive = load_data(positive_file)\n", "negative = load_data(negative_file)\n", "\n", "print(positive[0:10])\n", "print(negative[0:10])" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "yrgEYOOCII4d", "tags": [] }, "outputs": [], "source": [ "# Do not modify - Combines the positive and negative reviews into a single list and create labels\n", "data = positive + negative\n", "word_labels = [\"positive\"] * len(positive) + [\"negative\"] * len(negative) \n", "\n", "# Converts labels to numbers in one-hot encoding - [1, 0] (positive) or [0, 1] (negative)\n", "from keras.utils import to_categorical\n", "labels = to_categorical(to_numeric_label(word_labels))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "97Uh2uBpII4l", "tags": [] }, "outputs": [], "source": [ "# Write some code to investigate the dataset. 
\n", "# - Calculate and report the mean review size, its standard deviation and create a boxplot.\n", "# - Calculate the number of unique words in the dataset\n", "# - Perform any other dataset investigation that you feel would be valuable\n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 50 }, "colab_type": "code", "id": "_XXnArgcII4i", "outputId": "020b2ee2-4c2a-4bc0-bb55-e2ae3841fe57", "tags": [] }, "outputs": [], "source": [ "# Do not modify - Tokenize the vocabulary \n", "from keras.preprocessing.text import Tokenizer\n", "\n", "tokenizer = Tokenizer(num_words=25)\n", "\n", "tokenizer.fit_on_texts(data) #create the vocabulary\n", "\n", "tokenized_data = tokenizer.texts_to_sequences(data) #tokenize the data using the vocabulary\n", "\n", "vocab_size = len(tokenizer.word_index) + 1 \n", "\n", "# Compare a sample of the data before and after tokenization\n", "print(data[0:5])\n", "print(tokenized_data[0:5])" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "_vsMmLjdK1Gf" }, "source": [ "# Pre-processing" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "24DaXd1zII4q", "tags": [] }, "outputs": [], "source": [ "# Pre-processing\n", "# Write some code to pre-process the data so that each review is the same length\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "AFTJcFy6II4s", "tags": [] }, "outputs": [], "source": [ "# Write some code to split the data into a training and test set. Make sure you shuffle the data. 
Use 20% for the test set.\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 128 }, "colab_type": "code", "id": "lDth1BSzII4u", "outputId": "4614049a-7151-4407-9bc8-54604bbe7fdd" }, "outputs": [], "source": [ "\n", "from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score\n", "\n", "# Fill in the following function so it\n", "# - makes a prediction for the test set given the model\n", "# - reports the precision, recall and f1 score. Also print the confusion matrix. \n", "# You will need to use the helper to_class function to convert y_pred and y_test before supplying them to the sklearn functions.\n", "\n", "def assess_model(model, X_test, y_test):\n", "    \"\"\"Predict on the test set and print precision, recall, F1 score and the confusion matrix.\n", "\n", "    model must expose predict(X) returning one row of class scores per sample;\n", "    y_test holds the true labels in categorical (one-hot) form.\n", "    \"\"\"\n", "    # sklearn metrics expect class indices, so convert both arrays with to_class\n", "    y_pred = to_class(model.predict(X_test))\n", "    y_true = to_class(y_test)\n", "\n", "    # With sklearn's binary defaults the scored class is label 1, i.e. 'negative' in this notebook's encoding\n", "    print(\"Precision:\", precision_score(y_true, y_pred))\n", "    print(\"Recall:\", recall_score(y_true, y_pred))\n", "    print(\"F1 score:\", f1_score(y_true, y_pred))\n", "    print(\"Confusion matrix:\")\n", "    print(confusion_matrix(y_true, y_pred))" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "nTqLbbE6MpGt" }, "source": [ "# Build and tune model" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "3X70rA4uMXNv" }, "source": [ "Define network architecture" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "wKoXWKG4II5F" }, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "llS0-VKBMbz-" }, "source": [ "Train model" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "F5IJs0QuMe_I" }, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "rniBBEiyMRKD" }, "source": [ "Examine performance of model" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "eBVogHg2II5T", "tags": [] }, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "HB8kzt-IME4U" }, "source": [ "Plot graphs for accuracy and loss" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, 
"colab_type": "code", "id": "hPyJ78unMJUI" }, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "eAZ1LyFBMLoK" }, "source": [ "# Make a prediction" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "vfPM5LokII5V", "tags": [] }, "outputs": [], "source": [ "# This is a very small set of completely new data to use to make predictions.\n", "# Note: pad_sequences and maxlen are not defined in this template; define them in the Pre-processing cell before running this cell.\n", "prediction_data = [\"this book is fabulous\",\"i hated this book\", \"the best\", \"no good\", \"okay\"]\n", "tokenized = tokenizer.texts_to_sequences(prediction_data)\n", "padded = pad_sequences(tokenized, padding='post', maxlen=maxlen)\n", "\n", "# Supply this data to each of your models and see how it does. \n", "# You can call the helper function \"to_word_label\" to map the output of the model to the name of the\n", "# class it was predicted to belong to.\n", "\n", "\n", "\n", "\n" ] } ], "metadata": { "colab": { "collapsed_sections": [], "name": "L3T20.ipynb", "provenance": [] }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.2" } }, "nbformat": 4, "nbformat_minor": 1 }