commit 6230c0122a84e75ccd94887988718615d442bd8b Author: 202210715229 <202210715229@mhs.ubharajaya.ac.id> Date: Sat Nov 22 09:20:07 2025 +0700 first commit diff --git a/.ipynb_checkpoints/Fitur_Ekstraksi_BOW-checkpoint.ipynb b/.ipynb_checkpoints/Fitur_Ekstraksi_BOW-checkpoint.ipynb new file mode 100644 index 0000000..7ad6ccf --- /dev/null +++ b/.ipynb_checkpoints/Fitur_Ekstraksi_BOW-checkpoint.ipynb @@ -0,0 +1,335 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "qBYcPYAb059g", + "outputId": "9f57b704-da1b-4495-d366-24c30586dc76", + "scrolled": true + }, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Masukkan jumlah dokumen yang ingin dimasukkan: 4\n" + ] + } + ], + "source": [ + "# Input jumlah dokumen\n", + "import pandas as pd\n", + "n = int(input(\"Masukkan jumlah dokumen yang ingin dimasukkan: \"))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "mo-yt5Ob1N8j", + "outputId": "362ac3e0-d84b-4014-db96-cc3b10ecdb32" + }, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Masukkan teks untuk dokumen ke-1: saya adalah seorang pria\n", + "Masukkan teks untuk dokumen ke-2: saya adalah pria yang memiliki hati\n", + "Masukkan teks untuk dokumen ke-3: hati saya telah terisi satu nama\n", + "Masukkan teks untuk dokumen ke-4: di dalam hati saya terukir nama pasangan saya\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Dokumen yang Dimasukkan ===\n", + "Doc 1: saya adalah seorang pria\n", + "Doc 2: saya adalah pria yang memiliki hati\n", + "Doc 3: hati saya telah terisi satu nama\n", + "Doc 4: di dalam hati saya terukir nama pasangan saya\n" + ] + } + ], + "source": [ + "# Input teks dokumen satu per satu\n", + "documents = []\n", + "for i in range(n):\n", + " teks = input(f\"Masukkan teks untuk dokumen ke-{i+1}: \")\n", + " documents.append(teks)\n", + "\n", + "print(\"\\n=== Dokumen yang Dimasukkan ===\")\n", + "for i, doc in enumerate(documents):\n", + " print(f\"Doc {i+1}: {doc}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FkmxRAFq1oDK", + "outputId": "62c4508e-1725-4f30-fbdb-4de8072498b2" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Hasil Tokenisasi ===\n", + "Doc 1: ['saya', 'adalah', 'seorang', 'pria']\n", + "Doc 2: ['saya', 'adalah', 'pria', 'yang', 'memiliki', 'hati']\n", + "Doc 3: ['hati', 'saya', 'telah', 'terisi', 'satu', 'nama']\n", + "Doc 4: ['di', 'dalam', 'hati', 'saya', 'terukir', 'nama', 'pasangan', 'saya']\n" + ] + } + ], + "source": [ + "# Tahap Tokenisasi\n", + "tokenized_docs = []\n", + "for doc in documents:\n", + " tokens = doc.lower().split()\n", + " tokenized_docs.append(tokens)\n", + "\n", + "print(\"\\n=== Hasil Tokenisasi ===\")\n", + "for i, tokens in enumerate(tokenized_docs):\n", + " print(f\"Doc {i+1}: {tokens}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ybC1Vo2C_c3q", + "outputId": "fa31c57e-5364-4ded-fcd0-54d0db46c34b" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\n", + "['saya', 'adalah', 'seorang', 'pria', 'saya', 'adalah', 'pria', 'yang', 'memiliki', 'hati', 'hati', 'saya', 'telah', 'terisi', 'satu', 'nama', 'di', 'dalam', 'hati', 'saya', 'terukir', 'nama', 'pasangan', 'saya']\n", + "Jumlah total kata dalam seluruh dokumen: 24\n" + ] + } + ], + "source": [ + "# Pembuatan Corpus\n", + "corpus_all = [word for doc in tokenized_docs for word in doc]\n", + "\n", + "print(\"\\n=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\")\n", + "print(corpus_all)\n", + "print(f\"Jumlah total kata dalam seluruh dokumen: {len(corpus_all)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "s6S-Ma4R1xuq", + "outputId": "98c3685b-1798-4038-d17e-6e45ca419b51" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Vocabulary (Kata Unik) ===\n", + "['adalah', 'dalam', 'di', 'hati', 'memiliki', 'nama', 'pasangan', 'pria', 'satu', 'saya', 'seorang', 'telah', 'terisi', 'terukir', 'yang']\n", + "Jumlah kata unik (vocabulary size): 15\n", + "\n", + "=== Vocabulary (Kata Unik) ===\n", + " 1. adalah\n", + " 2. dalam\n", + " 3. di\n", + " 4. hati\n", + " 5. memiliki\n", + " 6. nama\n", + " 7. pasangan\n", + " 8. pria\n", + " 9. satu\n", + "10. saya\n", + "11. seorang\n", + "12. telah\n", + "13. terisi\n", + "14. terukir\n", + "15. yang\n", + "\n", + "Jumlah kata unik (vocabulary size): 15\n" + ] + } + ], + "source": [ + "# Pembuatan Vocabulary\n", + "vocabulary = sorted(set(corpus_all))\n", + "\n", + "print(\"\\n=== Vocabulary (Kata Unik) ===\")\n", + "print(vocabulary)\n", + "print(f\"Jumlah kata unik (vocabulary size): {len(vocabulary)}\")\n", + "\n", + "\n", + "vocabulary = sorted(set(corpus_all))\n", + "\n", + "print(\"\\n=== Vocabulary (Kata Unik) ===\")\n", + "for idx, word in enumerate(vocabulary, start=1):\n", + " print(f\"{idx:>2}. {word}\")\n", + "print(f\"\\nJumlah kata unik (vocabulary size): {len(vocabulary)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "ShevCTva2Fg9" + }, + "outputs": [], + "source": [ + "# Representasi Numerik (Matriks BoW)\n", + "bow_matrix = []\n", + "for doc in tokenized_docs:\n", + " vector = [doc.count(word) for word in vocabulary]\n", + " bow_matrix.append(vector)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-yB6D2pY2M0E", + "outputId": "b6b2f4d3-da8b-4aee-e9ce-034def4d5cf7" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Matriks Bag of Words ===\n", + " adalah dalam di hati memiliki nama pasangan pria satu saya \\\n", + "D1 1 0 0 0 0 0 0 1 0 1 \n", + "D2 1 0 0 1 1 0 0 1 0 1 \n", + "D3 0 0 0 1 0 1 0 0 1 1 \n", + "D4 0 1 1 1 0 1 1 0 0 2 \n", + "\n", + " seorang telah terisi terukir yang \n", + "D1 1 0 0 0 0 \n", + "D2 0 0 0 0 1 \n", + "D3 0 1 1 0 0 \n", + "D4 0 0 0 1 0 \n" + ] + } + ], + "source": [ + "df_bow = pd.DataFrame(bow_matrix, columns=vocabulary)\n", + "df_bow.index = [f\"D{i}\" for i in range(1, len(documents)+1)] # ubah label indeks jadi D1, D2, D3\n", + "\n", + "print(\"\\n=== Matriks Bag of Words ===\")\n", + "print(df_bow)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "8ruf5vKL2rGD", + "outputId": "65a4674e-1c01-4833-ec55-f66f77b8b6c2" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===\n", + " Kata Frekuensi\n", + "0 saya 5\n", + "1 hati 3\n", + "2 nama 2\n", + "3 pria 2\n", + "4 adalah 2\n", + "5 di 1\n", + "6 dalam 1\n", + "7 pasangan 1\n", + "8 memiliki 1\n", + "9 satu 1\n", + "10 seorang 1\n", + "11 telah 1\n", + "12 terisi 1\n", + "13 terukir 1\n", + "14 yang 1\n", + "Frekuensi kata: 15\n" + ] + } + ], + "source": [ + "# Membuat Tabel Frekuensi Kata (Total dari seluruh dokumen)\n", + "word_frequencies = df_bow.sum().sort_values(ascending=False).reset_index()\n", + "word_frequencies.columns = [\"Kata\", \"Frekuensi\"]\n", + "\n", + "print(\"\\n=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===\")\n", + "print(word_frequencies)\n", + "print(f\"Frekuensi kata: {len(word_frequencies)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NQjExannHuj0" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/Fitur_Ekstraksi_BOW.ipynb b/Fitur_Ekstraksi_BOW.ipynb new file mode 100644 index 0000000..7ad6ccf --- /dev/null +++ b/Fitur_Ekstraksi_BOW.ipynb @@ -0,0 +1,335 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "qBYcPYAb059g", + "outputId": "9f57b704-da1b-4495-d366-24c30586dc76", + "scrolled": true + }, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Masukkan jumlah dokumen yang ingin dimasukkan: 4\n" + ] + } + ], + "source": [ + "# Input jumlah dokumen\n", + "import pandas as pd\n", + "n = int(input(\"Masukkan jumlah dokumen yang ingin dimasukkan: \"))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "mo-yt5Ob1N8j", + "outputId": "362ac3e0-d84b-4014-db96-cc3b10ecdb32" + }, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Masukkan teks untuk dokumen ke-1: saya adalah seorang pria\n", + "Masukkan teks untuk dokumen ke-2: saya adalah pria yang memiliki hati\n", + "Masukkan teks untuk dokumen ke-3: hati saya telah terisi satu nama\n", + "Masukkan teks untuk dokumen ke-4: di dalam hati saya terukir nama pasangan saya\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Dokumen yang Dimasukkan ===\n", + "Doc 1: saya adalah seorang pria\n", + "Doc 2: saya adalah pria yang memiliki hati\n", + "Doc 3: hati saya telah terisi satu nama\n", + "Doc 4: di dalam hati saya terukir nama pasangan saya\n" + ] + } + ], + "source": [ + "# Input teks dokumen satu per satu\n", + "documents = []\n", + "for i in range(n):\n", + " teks = input(f\"Masukkan teks untuk dokumen ke-{i+1}: \")\n", + " documents.append(teks)\n", + "\n", + "print(\"\\n=== Dokumen yang Dimasukkan ===\")\n", + "for i, doc in enumerate(documents):\n", + " print(f\"Doc {i+1}: {doc}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FkmxRAFq1oDK", + "outputId": "62c4508e-1725-4f30-fbdb-4de8072498b2" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Hasil Tokenisasi ===\n", + "Doc 1: ['saya', 'adalah', 'seorang', 'pria']\n", + "Doc 2: ['saya', 'adalah', 'pria', 'yang', 'memiliki', 'hati']\n", + "Doc 3: ['hati', 'saya', 'telah', 'terisi', 'satu', 'nama']\n", + "Doc 4: ['di', 'dalam', 'hati', 'saya', 'terukir', 'nama', 'pasangan', 'saya']\n" + ] + } + ], + "source": [ + "# Tahap Tokenisasi\n", + "tokenized_docs = []\n", + "for doc in documents:\n", + " tokens = doc.lower().split()\n", + " tokenized_docs.append(tokens)\n", + "\n", + "print(\"\\n=== Hasil Tokenisasi ===\")\n", + "for i, tokens in enumerate(tokenized_docs):\n", + " print(f\"Doc {i+1}: {tokens}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ybC1Vo2C_c3q", + "outputId": "fa31c57e-5364-4ded-fcd0-54d0db46c34b" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\n", + "['saya', 'adalah', 'seorang', 'pria', 'saya', 'adalah', 'pria', 'yang', 'memiliki', 'hati', 'hati', 'saya', 'telah', 'terisi', 'satu', 'nama', 'di', 'dalam', 'hati', 'saya', 'terukir', 'nama', 'pasangan', 'saya']\n", + "Jumlah total kata dalam seluruh dokumen: 24\n" + ] + } + ], + "source": [ + "# Pembuatan Corpus\n", + "corpus_all = [word for doc in tokenized_docs for word in doc]\n", + "\n", + "print(\"\\n=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\")\n", + "print(corpus_all)\n", + "print(f\"Jumlah total kata dalam seluruh dokumen: {len(corpus_all)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "s6S-Ma4R1xuq", + "outputId": "98c3685b-1798-4038-d17e-6e45ca419b51" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Vocabulary (Kata Unik) ===\n", + "['adalah', 'dalam', 'di', 'hati', 'memiliki', 'nama', 'pasangan', 'pria', 'satu', 'saya', 'seorang', 'telah', 'terisi', 'terukir', 'yang']\n", + "Jumlah kata unik (vocabulary size): 15\n", + "\n", + "=== Vocabulary (Kata Unik) ===\n", + " 1. adalah\n", + " 2. dalam\n", + " 3. di\n", + " 4. hati\n", + " 5. memiliki\n", + " 6. nama\n", + " 7. pasangan\n", + " 8. pria\n", + " 9. satu\n", + "10. saya\n", + "11. seorang\n", + "12. telah\n", + "13. terisi\n", + "14. terukir\n", + "15. yang\n", + "\n", + "Jumlah kata unik (vocabulary size): 15\n" + ] + } + ], + "source": [ + "# Pembuatan Vocabulary\n", + "vocabulary = sorted(set(corpus_all))\n", + "\n", + "print(\"\\n=== Vocabulary (Kata Unik) ===\")\n", + "print(vocabulary)\n", + "print(f\"Jumlah kata unik (vocabulary size): {len(vocabulary)}\")\n", + "\n", + "\n", + "vocabulary = sorted(set(corpus_all))\n", + "\n", + "print(\"\\n=== Vocabulary (Kata Unik) ===\")\n", + "for idx, word in enumerate(vocabulary, start=1):\n", + " print(f\"{idx:>2}. {word}\")\n", + "print(f\"\\nJumlah kata unik (vocabulary size): {len(vocabulary)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "ShevCTva2Fg9" + }, + "outputs": [], + "source": [ + "# Representasi Numerik (Matriks BoW)\n", + "bow_matrix = []\n", + "for doc in tokenized_docs:\n", + " vector = [doc.count(word) for word in vocabulary]\n", + " bow_matrix.append(vector)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-yB6D2pY2M0E", + "outputId": "b6b2f4d3-da8b-4aee-e9ce-034def4d5cf7" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Matriks Bag of Words ===\n", + " adalah dalam di hati memiliki nama pasangan pria satu saya \\\n", + "D1 1 0 0 0 0 0 0 1 0 1 \n", + "D2 1 0 0 1 1 0 0 1 0 1 \n", + "D3 0 0 0 1 0 1 0 0 1 1 \n", + "D4 0 1 1 1 0 1 1 0 0 2 \n", + "\n", + " seorang telah terisi terukir yang \n", + "D1 1 0 0 0 0 \n", + "D2 0 0 0 0 1 \n", + "D3 0 1 1 0 0 \n", + "D4 0 0 0 1 0 \n" + ] + } + ], + "source": [ + "df_bow = pd.DataFrame(bow_matrix, columns=vocabulary)\n", + "df_bow.index = [f\"D{i}\" for i in range(1, len(documents)+1)] # ubah label indeks jadi D1, D2, D3\n", + "\n", + "print(\"\\n=== Matriks Bag of Words ===\")\n", + "print(df_bow)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "8ruf5vKL2rGD", + "outputId": "65a4674e-1c01-4833-ec55-f66f77b8b6c2" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===\n", + " Kata Frekuensi\n", + "0 saya 5\n", + "1 hati 3\n", + "2 nama 2\n", + "3 pria 2\n", + "4 adalah 2\n", + "5 di 1\n", + "6 dalam 1\n", + "7 pasangan 1\n", + "8 memiliki 1\n", + "9 satu 1\n", + "10 seorang 1\n", + "11 telah 1\n", + "12 terisi 1\n", + "13 terukir 1\n", + "14 yang 1\n", + "Frekuensi kata: 15\n" + ] + } + ], + "source": [ + "# Membuat Tabel Frekuensi Kata (Total dari seluruh dokumen)\n", + "word_frequencies = df_bow.sum().sort_values(ascending=False).reset_index()\n", + "word_frequencies.columns = [\"Kata\", \"Frekuensi\"]\n", + "\n", + "print(\"\\n=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===\")\n", + "print(word_frequencies)\n", + "print(f\"Frekuensi kata: {len(word_frequencies)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NQjExannHuj0" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}