diff --git a/Fitur_Ekstraksi_BOW.ipynb b/Fitur_Ekstraksi_BOW.ipynb new file mode 100644 index 0000000..559ed26 --- /dev/null +++ b/Fitur_Ekstraksi_BOW.ipynb @@ -0,0 +1,640 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "id": "qBYcPYAb059g" + }, + "outputs": [], + "source": [ + "# =========================\n", + "# 1. IMPORT LIBRARY\n", + "# =========================\n", + "import re\n", + "import pandas as pd\n", + "\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "id": "mo-yt5Ob1N8j" + }, + "outputs": [], + "source": [ + "# =========================\n", + "# 2. DATA TEKS MANUAL\n", + "# =========================\n", + "documents = [\n", + " \"saya suka belajar data science\",\n", + " \"machine learning sangat menarik\",\n", + " \"saya tidak suka matematika\",\n", + " \"belajar python itu menyenangkan\",\n", + " \"data science membutuhkan matematika\",\n", + " \"python sangat membantu data science\"\n", + "]\n", + "\n", + "labels = [\n", + " \"positif\",\n", + " \"positif\",\n", + " \"negatif\",\n", + " \"positif\",\n", + " \"netral\",\n", + " \"positif\"\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "id": "FkmxRAFq1oDK" + }, + "outputs": [], + "source": [ + "# =========================\n", + "# 3. PREPROCESSING\n", + "# =========================\n", + "def clean_text(text):\n", + " text = text.lower()\n", + " text = re.sub(r\"[^a-z\\s]\", \"\", text)\n", + " text = re.sub(r\"\\s+\", \" \", text).strip()\n", + " return text\n", + "\n", + "documents = [clean_text(doc) for doc in documents]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ybC1Vo2C_c3q", + "outputId": "1695b30d-059d-4bce-e224-7c100b1958ee" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "=== Bag of Words ===\n", + " belajar data itu learning machine matematika membantu membutuhkan \\\n", + "0 1 1 0 0 0 0 0 0 \n", + "1 0 0 0 1 1 0 0 0 \n", + "2 0 0 0 0 0 1 0 0 \n", + "3 1 0 1 0 0 0 0 0 \n", + "4 0 1 0 0 0 1 0 1 \n", + "5 0 1 0 0 0 0 1 0 \n", + "\n", + " menarik menyenangkan python sangat saya science suka tidak \n", + "0 0 0 0 0 1 1 1 0 \n", + "1 1 0 0 1 0 0 0 0 \n", + "2 0 0 0 0 1 0 1 1 \n", + "3 0 1 1 0 0 0 0 0 \n", + "4 0 0 0 0 0 1 0 0 \n", + "5 0 0 1 1 0 1 0 0 \n" + ] + } + ], + "source": [ + "# =========================\n", + "# 4. FEATURE EXTRACTION (BOW)\n", + "# =========================\n", + "vectorizer = CountVectorizer()\n", + "X = vectorizer.fit_transform(documents)\n", + "\n", + "df_bow = pd.DataFrame(\n", + " X.toarray(),\n", + " columns=vectorizer.get_feature_names_out()\n", + ")\n", + "\n", + "print(\"=== Bag of Words ===\")\n", + "print(df_bow)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 80 + }, + "id": "s6S-Ma4R1xuq", + "outputId": "f65359e3-bb87-42b4-fb37-9c035f88e7ad" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "MultinomialNB()" + ], + "text/html": [ + "
MultinomialNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
MultinomialNB()