Upload files to "Fitur_Ekstraksi_BOW_SVM_NB"
This commit is contained in:
parent
57e0cc9e4e
commit
8149bce06e
351
Fitur_Ekstraksi_BOW_SVM_NB/Fitur_Ekstraksi_BOW_SVM_NB.ipynb
Normal file
351
Fitur_Ekstraksi_BOW_SVM_NB/Fitur_Ekstraksi_BOW_SVM_NB.ipynb
Normal file
@ -0,0 +1,351 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "qBYcPYAb059g",
|
||||
"outputId": "9f57b704-da1b-4495-d366-24c30586dc76"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdin",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Masukkan jumlah dokumen yang ingin dimasukkan: 3\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
# Read how many documents the user wants to enter.
import pandas as pd

prompt = "Masukkan jumlah dokumen yang ingin dimasukkan: "
n = int(input(prompt))
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "mo-yt5Ob1N8j",
|
||||
"outputId": "362ac3e0-d84b-4014-db96-cc3b10ecdb32"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Masukkan teks untuk dokumen ke-1: saya belajar nlp di kampus\n",
|
||||
"Masukkan teks untuk dokumen ke-2: saya suka belajar ai\n",
|
||||
"Masukkan teks untuk dokumen ke-3: mahasiswa belajar data science dan nlp\n",
|
||||
"\n",
|
||||
"=== Dokumen yang Dimasukkan ===\n",
|
||||
"Doc 1: saya belajar nlp di kampus\n",
|
||||
"Doc 2: saya suka belajar ai\n",
|
||||
"Doc 3: mahasiswa belajar data science dan nlp\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
# Collect the document texts one at a time from the user.
documents = [input(f"Masukkan teks untuk dokumen ke-{i+1}: ") for i in range(n)]

# Echo everything back so the user can verify the input.
print("\n=== Dokumen yang Dimasukkan ===")
for num, doc in enumerate(documents, start=1):
    print(f"Doc {num}: {doc}")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "FkmxRAFq1oDK",
|
||||
"outputId": "62c4508e-1725-4f30-fbdb-4de8072498b2"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"=== Hasil Tokenisasi ===\n",
|
||||
"Doc 1: ['saya', 'belajar', 'nlp', 'di', 'kampus']\n",
|
||||
"Doc 2: ['saya', 'suka', 'belajar', 'ai']\n",
|
||||
"Doc 3: ['mahasiswa', 'belajar', 'data', 'science', 'dan', 'nlp']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
# Tokenisation step: lower-case each document and split on whitespace.
tokenized_docs = [doc.lower().split() for doc in documents]

print("\n=== Hasil Tokenisasi ===")
for num, tokens in enumerate(tokenized_docs, start=1):
    print(f"Doc {num}: {tokens}")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "ybC1Vo2C_c3q",
|
||||
"outputId": "fa31c57e-5364-4ded-fcd0-54d0db46c34b"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\n",
|
||||
"['saya', 'belajar', 'nlp', 'di', 'kampus', 'saya', 'suka', 'belajar', 'ai', 'mahasiswa', 'belajar', 'data', 'science', 'dan', 'nlp']\n",
|
||||
"Jumlah total kata dalam seluruh dokumen: 15\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
# Build the corpus: every token from every document, duplicates kept.
corpus_all = []
for tokens in tokenized_docs:
    corpus_all.extend(tokens)

print("\n=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===")
print(corpus_all)
print(f"Jumlah total kata dalam seluruh dokumen: {len(corpus_all)}")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "s6S-Ma4R1xuq",
|
||||
"outputId": "98c3685b-1798-4038-d17e-6e45ca419b51"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"=== Vocabulary (Kata Unik) ===\n",
|
||||
"['ai', 'belajar', 'dan', 'data', 'di', 'kampus', 'mahasiswa', 'nlp', 'saya', 'science', 'suka']\n",
|
||||
"Jumlah kata unik (vocabulary size): 11\n",
|
||||
"\n",
|
||||
"=== Vocabulary (Kata Unik) ===\n",
|
||||
" 1. ai\n",
|
||||
" 2. belajar\n",
|
||||
" 3. dan\n",
|
||||
" 4. data\n",
|
||||
" 5. di\n",
|
||||
" 6. kampus\n",
|
||||
" 7. mahasiswa\n",
|
||||
" 8. nlp\n",
|
||||
" 9. saya\n",
|
||||
"10. science\n",
|
||||
"11. suka\n",
|
||||
"\n",
|
||||
"Jumlah kata unik (vocabulary size): 11\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
# Build the vocabulary: the unique words of the corpus, sorted alphabetically.
# FIX: the original cell computed `vocabulary = sorted(set(corpus_all))` twice
# and printed the "=== Vocabulary ===" section twice (copy-paste duplication);
# one computation and one listing produce the same information.
vocabulary = sorted(set(corpus_all))

print("\n=== Vocabulary (Kata Unik) ===")
print(vocabulary)
# Numbered listing, one word per line, right-aligned index for readability.
for idx, word in enumerate(vocabulary, start=1):
    print(f"{idx:>2}. {word}")
print(f"\nJumlah kata unik (vocabulary size): {len(vocabulary)}")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {
|
||||
"id": "ShevCTva2Fg9"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
# Numeric representation (BoW matrix): one count vector per document,
# where entry j is how often vocabulary[j] occurs in that document.
bow_matrix = [
    [tokens.count(word) for word in vocabulary]
    for tokens in tokenized_docs
]
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "-yB6D2pY2M0E",
|
||||
"outputId": "b6b2f4d3-da8b-4aee-e9ce-034def4d5cf7"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"=== Matriks Bag of Words ===\n",
|
||||
" ai belajar dan data di kampus mahasiswa nlp saya science suka\n",
|
||||
"D1 0 1 0 0 1 1 0 1 1 0 0\n",
|
||||
"D2 1 1 0 0 0 0 0 0 1 0 1\n",
|
||||
"D3 0 1 1 1 0 0 1 1 0 1 0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
# Wrap the BoW counts in a labelled DataFrame with rows D1..Dn.
row_labels = [f"D{i}" for i in range(1, len(documents) + 1)]
df_bow = pd.DataFrame(bow_matrix, columns=vocabulary, index=row_labels)

print("\n=== Matriks Bag of Words ===")
print(df_bow)
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "8ruf5vKL2rGD",
|
||||
"outputId": "65a4674e-1c01-4833-ec55-f66f77b8b6c2"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===\n",
|
||||
" Kata Frekuensi\n",
|
||||
"0 belajar 3\n",
|
||||
"1 nlp 2\n",
|
||||
"2 saya 2\n",
|
||||
"3 dan 1\n",
|
||||
"4 ai 1\n",
|
||||
"5 data 1\n",
|
||||
"6 di 1\n",
|
||||
"7 mahasiswa 1\n",
|
||||
"8 kampus 1\n",
|
||||
"9 science 1\n",
|
||||
"10 suka 1\n",
|
||||
"Frekuensi kata: 11\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
# Word-frequency table: total count of each word across all documents,
# sorted most-frequent first.
totals = df_bow.sum().sort_values(ascending=False)
word_frequencies = totals.reset_index()
word_frequencies.columns = ["Kata", "Frekuensi"]

print("\n=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===")
print(word_frequencies)
print(f"Frekuensi kata: {len(word_frequencies)}")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "NQjExannHuj0"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3ffe1c09",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
# === SVM Classification ===
# NOTE(review): X_train_tfidf, X_test_tfidf, y_train and y_test are not
# defined anywhere in this notebook — there is no TF-IDF vectorisation,
# no labels, and no train/test split cell — so this cell crashed with a
# bare NameError on Restart & Run All. Fail fast with an actionable
# message until that prerequisite cell is added.
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

_required = ("X_train_tfidf", "X_test_tfidf", "y_train", "y_test")
_missing = [name for name in _required if name not in globals()]
if _missing:
    raise NameError(
        f"Prerequisite variables {_missing} are undefined. "
        "Add a TF-IDF / train-test-split cell (with labels) before this one."
    )

# Train a linear SVM on the TF-IDF training features.
svm_model = LinearSVC()
svm_model.fit(X_train_tfidf, y_train)

# Evaluate on the held-out test set.
svm_pred = svm_model.predict(X_test_tfidf)

print("=== SVM Accuracy ===", accuracy_score(y_test, svm_pred))
print(classification_report(y_test, svm_pred))
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1e2f604d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
# === Naive Bayes Classification ===
# NOTE(review): X_train_tfidf, X_test_tfidf, y_train and y_test are not
# defined anywhere in this notebook (no TF-IDF / split / labels cell),
# so this cell crashed with a bare NameError on Restart & Run All.
# FIX: also import the metrics here — the original silently relied on
# the SVM cell having imported accuracy_score/classification_report.
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

_required = ("X_train_tfidf", "X_test_tfidf", "y_train", "y_test")
_missing = [name for name in _required if name not in globals()]
if _missing:
    raise NameError(
        f"Prerequisite variables {_missing} are undefined. "
        "Add a TF-IDF / train-test-split cell (with labels) before this one."
    )

# Multinomial NB suits non-negative count/TF-IDF features.
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

nb_pred = nb_model.predict(X_test_tfidf)

print("=== Naive Bayes Accuracy ===", accuracy_score(y_test, nb_pred))
print(classification_report(y_test, nb_pred))
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user