{ "cells": [ { "cell_type": "code", "execution_count": 11, "metadata": { "id": "qBYcPYAb059g" }, "outputs": [], "source": [ "# =========================\n", "# 1. IMPORT LIBRARY\n", "# =========================\n", "import re\n", "import pandas as pd\n", "\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.metrics import accuracy_score, confusion_matrix, classification_report" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "id": "mo-yt5Ob1N8j" }, "outputs": [], "source": [ "# =========================\n", "# 2. MANUAL TEXT DATA\n", "# =========================\n", "# Every sentence sits next to its sentiment label, so the two lists\n", "# cannot drift out of alignment when samples are added or removed.\n", "documents, labels = map(list, zip(\n", "    (\"saya suka belajar data science\", \"positif\"),\n", "    (\"machine learning sangat menarik\", \"positif\"),\n", "    (\"saya tidak suka matematika\", \"negatif\"),\n", "    (\"belajar python itu menyenangkan\", \"positif\"),\n", "    (\"data science membutuhkan matematika\", \"netral\"),\n", "    (\"python sangat membantu data science\", \"positif\"),\n", "))" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "id": "FkmxRAFq1oDK" }, "outputs": [], "source": [ "# =========================\n", "# 3. 
PREPROCESSING\n", "# =========================\n", "def clean_text(text: str) -> str:\n", "    \"\"\"Normalize one raw sentence before bag-of-words tokenization.\n", "\n", "    Lower-cases the text, turns every character that is not an ASCII\n", "    letter (a-z) or whitespace into a space, then collapses whitespace\n", "    runs and trims both ends.\n", "\n", "    Non-letters become a space instead of being deleted so that words\n", "    joined only by punctuation ('data-science', 'suka,belajar') remain\n", "    separate tokens rather than fusing into 'datascience'.\n", "\n", "    Args:\n", "        text: Raw input sentence.\n", "\n", "    Returns:\n", "        The cleaned, single-spaced, lower-case sentence.\n", "    \"\"\"\n", "    lowered = text.lower()\n", "    letters_only = re.sub(r\"[^a-z\\s]\", \" \", lowered)\n", "    return re.sub(r\"\\s+\", \" \", letters_only).strip()\n", "\n", "\n", "# clean_text is idempotent, so re-running this cell on already-cleaned\n", "# documents leaves them unchanged.\n", "documents = [clean_text(doc) for doc in documents]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ybC1Vo2C_c3q", "outputId": "1695b30d-059d-4bce-e224-7c100b1958ee" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "=== Bag of Words ===\n", " belajar data itu learning machine matematika membantu membutuhkan \\\n", "0 1 1 0 0 0 0 0 0 \n", "1 0 0 0 1 1 0 0 0 \n", "2 0 0 0 0 0 1 0 0 \n", "3 1 0 1 0 0 0 0 0 \n", "4 0 1 0 0 0 1 0 1 \n", "5 0 1 0 0 0 0 1 0 \n", "\n", " menarik menyenangkan python sangat saya science suka tidak \n", "0 0 0 0 0 1 1 1 0 \n", "1 1 0 0 1 0 0 0 0 \n", "2 0 0 0 0 1 0 1 1 \n", "3 0 1 1 0 0 0 0 0 \n", "4 0 0 0 0 0 1 0 0 \n", "5 0 0 1 1 0 1 0 0 \n" ] } ], "source": [ "# =========================\n", "# 4. FEATURE EXTRACTION (BOW)\n", "# =========================\n", "# Learn the vocabulary from the cleaned documents and count each term per document.\n", "vectorizer = CountVectorizer()\n", "X = vectorizer.fit_transform(documents)\n", "\n", "# Dense copy with one column per vocabulary term, built for display only.\n", "df_bow = pd.DataFrame(\n", "    X.toarray(),\n", "    columns=vectorizer.get_feature_names_out()\n", ")\n", "\n", "print(\"=== Bag of Words ===\")\n", "print(df_bow)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 80 }, "id": "s6S-Ma4R1xuq", "outputId": "f65359e3-bb87-42b4-fb37-9c035f88e7ad" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "MultinomialNB()" ], "text/html": [ "
MultinomialNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
MultinomialNB()