Repositori-NLP/Fitur_Ekstraksi_BOW-checkpoint.ipynb

299 lines
7.5 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "qBYcPYAb059g",
"outputId": "9f57b704-da1b-4495-d366-24c30586dc76"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Jumlah dokumen: 3\n"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"# How many documents the next cell will ask the user to type in.\n",
"n = 3\n",
"print(f\"Jumlah dokumen: {n}\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mo-yt5Ob1N8j",
"outputId": "362ac3e0-d84b-4014-db96-cc3b10ecdb32"
},
"outputs": [],
"source": [
"# Read the n documents interactively, one prompt per document.\n",
"documents = []\n",
"\n",
"for i in range(n):\n",
"    prompt = f\"Masukkan teks dokumen ke-{i+1}: \"\n",
"    teks = input(prompt).strip()\n",
"    # Ask again until real text is entered: a blank document would produce\n",
"    # no tokens and therefore an all-zero row in the BoW matrix.\n",
"    while not teks:\n",
"        print(\"Teks dokumen tidak boleh kosong.\")\n",
"        teks = input(prompt).strip()\n",
"    documents.append(teks)\n",
"\n",
"print(\"\\n=== Daftar Dokumen ===\")\n",
"for i, doc in enumerate(documents):\n",
"    print(f\"Dokumen {i+1}: {doc}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "FkmxRAFq1oDK",
"outputId": "62c4508e-1725-4f30-fbdb-4de8072498b2"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Hasil Tokenisasi ===\n",
"Doc 1: ['saya', 'belajar', 'nlp', 'di', 'kampus']\n",
"Doc 2: ['saya', 'suka', 'belajar', 'ai']\n",
"Doc 3: ['mahasiswa', 'belajar', 'data', 'science', 'dan', 'nlp']\n"
]
}
],
"source": [
"import string\n",
"\n",
"# Tokenization: lowercase each document, split it on whitespace, then trim\n",
"# punctuation from both ends of every piece so that 'nlp', 'nlp,' and 'nlp.'\n",
"# are counted as the same word. Characters inside a piece are left alone, so\n",
"# hyphenated words such as 'anak-anak' stay in one token.\n",
"tokenized_docs = []\n",
"for doc in documents:\n",
"    trimmed = (piece.strip(string.punctuation) for piece in doc.lower().split())\n",
"    tokens = [token for token in trimmed if token]  # drop punctuation-only pieces\n",
"    tokenized_docs.append(tokens)\n",
"\n",
"print(\"\\n=== Hasil Tokenisasi ===\")\n",
"for i, tokens in enumerate(tokenized_docs):\n",
"    print(f\"Doc {i+1}: {tokens}\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ybC1Vo2C_c3q",
"outputId": "fa31c57e-5364-4ded-fcd0-54d0db46c34b"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\n",
"['saya', 'belajar', 'nlp', 'di', 'kampus', 'saya', 'suka', 'belajar', 'ai', 'mahasiswa', 'belajar', 'data', 'science', 'dan', 'nlp']\n",
"Jumlah total kata dalam seluruh dokumen: 15\n"
]
}
],
"source": [
"# Build the corpus: the tokens of all documents in one flat list, in document order.\n",
"corpus_all = []\n",
"for tokens in tokenized_docs:\n",
"    corpus_all.extend(tokens)\n",
"\n",
"print(\"\\n=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\")\n",
"print(corpus_all)\n",
"print(f\"Jumlah total kata dalam seluruh dokumen: {len(corpus_all)}\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "s6S-Ma4R1xuq",
"outputId": "98c3685b-1798-4038-d17e-6e45ca419b51"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Vocabulary (Kata Unik) ===\n",
" 1. ai\n",
" 2. belajar\n",
" 3. dan\n",
" 4. data\n",
" 5. di\n",
" 6. kampus\n",
" 7. mahasiswa\n",
" 8. nlp\n",
" 9. saya\n",
"10. science\n",
"11. suka\n",
"\n",
"Jumlah kata unik (vocabulary size): 11\n"
]
}
],
"source": [
"# Vocabulary: the unique words of the corpus, sorted alphabetically so the\n",
"# word order is the same on every run. Computed and reported once.\n",
"vocabulary = sorted(set(corpus_all))\n",
"\n",
"print(\"\\n=== Vocabulary (Kata Unik) ===\")\n",
"for idx, word in enumerate(vocabulary, start=1):\n",
"    print(f\"{idx:>2}. {word}\")\n",
"print(f\"\\nJumlah kata unik (vocabulary size): {len(vocabulary)}\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"id": "ShevCTva2Fg9"
},
"outputs": [],
"source": [
"# Numeric representation (BoW matrix): one row per document, one entry per\n",
"# vocabulary word holding how often that word occurs in the document.\n",
"bow_matrix = [\n",
"    [doc.count(word) for word in vocabulary]\n",
"    for doc in tokenized_docs\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "-yB6D2pY2M0E",
"outputId": "b6b2f4d3-da8b-4aee-e9ce-034def4d5cf7"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Matriks Bag of Words ===\n",
" ai belajar dan data di kampus mahasiswa nlp saya science suka\n",
"D1 0 1 0 0 1 1 0 1 1 0 0\n",
"D2 1 1 0 0 0 0 0 0 1 0 1\n",
"D3 0 1 1 1 0 0 1 1 0 1 0\n"
]
}
],
"source": [
"# Tabulate the BoW matrix: one column per vocabulary word and one row per\n",
"# document, with the rows labelled D1, D2, D3, ...\n",
"df_bow = pd.DataFrame(\n",
"    bow_matrix,\n",
"    columns=vocabulary,\n",
"    index=[f\"D{i}\" for i in range(1, len(documents) + 1)],\n",
")\n",
"\n",
"print(\"\\n=== Matriks Bag of Words ===\")\n",
"print(df_bow)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "8ruf5vKL2rGD",
"outputId": "65a4674e-1c01-4833-ec55-f66f77b8b6c2"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===\n",
" Kata Frekuensi\n",
"0 belajar 3\n",
"1 nlp 2\n",
"2 saya 2\n",
"3 ai 1\n",
"4 dan 1\n",
"5 data 1\n",
"6 di 1\n",
"7 kampus 1\n",
"8 mahasiswa 1\n",
"9 science 1\n",
"10 suka 1\n",
"Jumlah kata unik: 11\n"
]
}
],
"source": [
"# Word-frequency table: total occurrences of every vocabulary word, summed\n",
"# over all documents (the column sums of the BoW matrix).\n",
"word_frequencies = (\n",
"    df_bow.sum()\n",
"    .rename_axis(\"Kata\")\n",
"    .reset_index(name=\"Frekuensi\")\n",
"    # Most frequent first; ties are broken alphabetically so the row order is\n",
"    # the same on every run (a plain descending sort leaves ties in an\n",
"    # arbitrary order because the default sort algorithm is not stable).\n",
"    .sort_values([\"Frekuensi\", \"Kata\"], ascending=[False, True])\n",
"    .reset_index(drop=True)\n",
")\n",
"\n",
"print(\"\\n=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===\")\n",
"print(word_frequencies)\n",
"# The table has one row per vocabulary word, so its length is the number of\n",
"# unique words, not a frequency.\n",
"print(f\"Jumlah kata unik: {len(word_frequencies)}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "NQjExannHuj0"
},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}