Initial Commit
commit 4489eb5e9b
.Trash-0/files/Untitled.ipynb (Normal file, 33 lines added)
@@ -0,0 +1,33 @@
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "28217e47-db17-4572-853d-151630b47bc8",
      "metadata": {},
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3 (ipykernel)",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.12.2"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}
.Trash-0/info/Untitled.ipynb.trashinfo (Normal file, 3 lines added)
@@ -0,0 +1,3 @@
[Trash Info]
Path=Untitled.ipynb
DeletionDate=2025-11-14T15:53:34
.ipynb_checkpoints/Fitur_Ekstraksi_BOW-checkpoint.ipynb (Normal file, 300 lines added)
@@ -0,0 +1,300 @@
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "qBYcPYAb059g",
        "outputId": "9f57b704-da1b-4495-d366-24c30586dc76"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Masukkan jumlah dokumen yang ingin dimasukkan: 3\n"
          ]
        }
      ],
      "source": [
        "# Input jumlah dokumen\n",
        "import pandas as pd\n",
        "n = int(input(\"Masukkan jumlah dokumen yang ingin dimasukkan: \"))"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Input teks dokumen satu per satu\n",
        "documents = []\n",
        "for i in range(n):\n",
        " teks = input(f\"Masukkan teks untuk dokumen ke-{i+1}: \")\n",
        " documents.append(teks)\n",
        "\n",
        "print(\"\\n=== Dokumen yang Dimasukkan ===\")\n",
        "for i, doc in enumerate(documents):\n",
        " print(f\"Doc {i+1}: {doc}\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "mo-yt5Ob1N8j",
        "outputId": "362ac3e0-d84b-4014-db96-cc3b10ecdb32"
      },
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Masukkan teks untuk dokumen ke-1: saya belajar nlp di kampus\n",
            "Masukkan teks untuk dokumen ke-2: saya suka belajar ai\n",
            "Masukkan teks untuk dokumen ke-3: mahasiswa belajar data science dan nlp\n",
            "\n",
            "=== Dokumen yang Dimasukkan ===\n",
            "Doc 1: saya belajar nlp di kampus\n",
            "Doc 2: saya suka belajar ai\n",
            "Doc 3: mahasiswa belajar data science dan nlp\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Tahap Tokenisasi\n",
        "tokenized_docs = []\n",
        "for doc in documents:\n",
        " tokens = doc.lower().split()\n",
        " tokenized_docs.append(tokens)\n",
        "\n",
        "print(\"\\n=== Hasil Tokenisasi ===\")\n",
        "for i, tokens in enumerate(tokenized_docs):\n",
        " print(f\"Doc {i+1}: {tokens}\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "FkmxRAFq1oDK",
        "outputId": "62c4508e-1725-4f30-fbdb-4de8072498b2"
      },
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\n",
            "=== Hasil Tokenisasi ===\n",
            "Doc 1: ['saya', 'belajar', 'nlp', 'di', 'kampus']\n",
            "Doc 2: ['saya', 'suka', 'belajar', 'ai']\n",
            "Doc 3: ['mahasiswa', 'belajar', 'data', 'science', 'dan', 'nlp']\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Pembuatan Corpus\n",
        "corpus_all = [word for doc in tokenized_docs for word in doc]\n",
        "\n",
        "print(\"\\n=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\")\n",
        "print(corpus_all)\n",
        "print(f\"Jumlah total kata dalam seluruh dokumen: {len(corpus_all)}\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ybC1Vo2C_c3q",
        "outputId": "fa31c57e-5364-4ded-fcd0-54d0db46c34b"
      },
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\n",
            "=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\n",
            "['saya', 'belajar', 'nlp', 'di', 'kampus', 'saya', 'suka', 'belajar', 'ai', 'mahasiswa', 'belajar', 'data', 'science', 'dan', 'nlp']\n",
            "Jumlah total kata dalam seluruh dokumen: 15\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Pembuatan Vocabulary\n",
        "vocabulary = sorted(set(corpus_all))\n",
        "\n",
        "print(\"\\n=== Vocabulary (Kata Unik) ===\")\n",
        "print(vocabulary)\n",
        "print(f\"Jumlah kata unik (vocabulary size): {len(vocabulary)}\")\n",
        "\n",
        "\n",
        "vocabulary = sorted(set(corpus_all))\n",
        "\n",
        "print(\"\\n=== Vocabulary (Kata Unik) ===\")\n",
        "for idx, word in enumerate(vocabulary, start=1):\n",
        " print(f\"{idx:>2}. {word}\")\n",
        "print(f\"\\nJumlah kata unik (vocabulary size): {len(vocabulary)}\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "s6S-Ma4R1xuq",
        "outputId": "98c3685b-1798-4038-d17e-6e45ca419b51"
      },
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\n",
            "=== Vocabulary (Kata Unik) ===\n",
            "['ai', 'belajar', 'dan', 'data', 'di', 'kampus', 'mahasiswa', 'nlp', 'saya', 'science', 'suka']\n",
            "Jumlah kata unik (vocabulary size): 11\n",
            "\n",
            "=== Vocabulary (Kata Unik) ===\n",
            " 1. ai\n",
            " 2. belajar\n",
            " 3. dan\n",
            " 4. data\n",
            " 5. di\n",
            " 6. kampus\n",
            " 7. mahasiswa\n",
            " 8. nlp\n",
            " 9. saya\n",
            "10. science\n",
            "11. suka\n",
            "\n",
            "Jumlah kata unik (vocabulary size): 11\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Representasi Numerik (Matriks BoW)\n",
        "bow_matrix = []\n",
        "for doc in tokenized_docs:\n",
        " vector = [doc.count(word) for word in vocabulary]\n",
        " bow_matrix.append(vector)"
      ],
      "metadata": {
        "id": "ShevCTva2Fg9"
      },
      "execution_count": 6,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "df_bow = pd.DataFrame(bow_matrix, columns=vocabulary)\n",
        "df_bow.index = [f\"D{i}\" for i in range(1, len(documents)+1)] # ubah label indeks jadi D1, D2, D3\n",
        "\n",
        "print(\"\\n=== Matriks Bag of Words ===\")\n",
        "print(df_bow)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "-yB6D2pY2M0E",
        "outputId": "b6b2f4d3-da8b-4aee-e9ce-034def4d5cf7"
      },
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\n",
            "=== Matriks Bag of Words ===\n",
            " ai belajar dan data di kampus mahasiswa nlp saya science suka\n",
            "D1 0 1 0 0 1 1 0 1 1 0 0\n",
            "D2 1 1 0 0 0 0 0 0 1 0 1\n",
            "D3 0 1 1 1 0 0 1 1 0 1 0\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Membuat Tabel Frekuensi Kata (Total dari seluruh dokumen)\n",
        "word_frequencies = df_bow.sum().sort_values(ascending=False).reset_index()\n",
        "word_frequencies.columns = [\"Kata\", \"Frekuensi\"]\n",
        "\n",
        "print(\"\\n=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===\")\n",
        "print(word_frequencies)\n",
        "print(f\"Frekuensi kata: {len(word_frequencies)}\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "8ruf5vKL2rGD",
        "outputId": "65a4674e-1c01-4833-ec55-f66f77b8b6c2"
      },
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\n",
            "=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===\n",
            " Kata Frekuensi\n",
            "0 belajar 3\n",
            "1 nlp 2\n",
            "2 saya 2\n",
            "3 dan 1\n",
            "4 ai 1\n",
            "5 data 1\n",
            "6 di 1\n",
            "7 mahasiswa 1\n",
            "8 kampus 1\n",
            "9 science 1\n",
            "10 suka 1\n",
            "Frekuensi kata: 11\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "NQjExannHuj0"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}
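Note: the checkpoint above builds its bag-of-words matrix by hand with list.count over a sorted vocabulary. As a minimal cross-check (an assumption on my part: scikit-learn is not used anywhere in this commit, and the three documents are copied from the recorded outputs), the same matrix can be produced with sklearn's CountVectorizer:

# Hypothetical cross-check, not part of the commit: reproduce the
# "=== Matriks Bag of Words ===" table with scikit-learn.
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

documents = [
    "saya belajar nlp di kampus",
    "saya suka belajar ai",
    "mahasiswa belajar data science dan nlp",
]

# token_pattern=r"\S+" mimics the notebook's doc.lower().split() tokenizer,
# so short function words such as "di" are kept as tokens.
vectorizer = CountVectorizer(lowercase=True, token_pattern=r"\S+")
bow = vectorizer.fit_transform(documents)

# sklearn sorts its vocabulary alphabetically, matching sorted(set(corpus_all)).
df_bow = pd.DataFrame(
    bow.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=[f"D{i}" for i in range(1, len(documents) + 1)],
)
print(df_bow)  # same 3 x 11 count matrix as the notebook's df_bow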
.ipynb_checkpoints/N-Gram-checkpoint.ipynb (Normal file, 374 lines added)
@@ -0,0 +1,374 @@
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# **PENGOLAH BAHASA ALAMI F7A1 | Pertemuan ke-5 - Jum'at, 17 Oktober 2025**\n",
        "## **Tugas:** Membuat N-GRAM (Unigram, Bigram, & Trigram)\n",
        "### **Dosen Pengampu:** Muhammad Yasir, S.Si., M.Kom.\n",
        "#### **Disusun Oleh:** Mega Gloria (202210715173)\n",
        "\n"
      ],
      "metadata": {
        "id": "JVPdWpz3hhbj"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "# **UNIGRAM**"
      ],
      "metadata": {
        "id": "4Mvva3v65h1v"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from collections import Counter\n",
        "from IPython.display import clear_output\n",
        "import math\n",
        "\n",
        "# 1. Input Kalimat dan Tokenisasi\n",
        "kalimat = input(\"Masukkan kalimat: \").strip()\n",
        "\n",
        "# Bersihkan output (khusus lingkungan notebook)\n",
        "try:\n",
        " clear_output()\n",
        "except:\n",
        " pass\n",
        "\n",
        "print(f\"Corpus: {kalimat}\")\n",
        "\n",
        "# Tokenize\n",
        "tokens = kalimat.lower().split()\n",
        "print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
        "\n",
        "# 2. Hitung Frekuensi Unigram\n",
        "unigram_counts = Counter(tokens)\n",
        "total_tokens = sum(unigram_counts.values())\n",
        "\n",
        "print(\"\\nFrekuensi Unigram dalam kalimat\")\n",
        "for pair, count in unigram_counts.items():\n",
        " print(f\" ('{pair}'): {count}\")\n",
        "print(f\"\\nTotal unigram dalam 1 kalimat: {total_tokens}\")\n",
        "\n",
        "# 3. Hitung Probabilitas Unigram: P(wi) = Count(wi) / Total Kata\n",
        "unigram_probabilities = {}\n",
        "for word, count in unigram_counts.items():\n",
        " prob = count / total_tokens\n",
        " unigram_probabilities[word] = prob\n",
        "\n",
        "print(\"\\nProbabilitas masing-masing unigram:\")\n",
        "for word, prob in unigram_probabilities.items():\n",
        " print(f\" P({word}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
        "\n",
        "# 4. Hitung Probabilitas Kalimat Keseluruhan (P(kalimat) = P(w1) * P(w2) * ...)\n",
        "p_kalimat = 1\n",
        "prob_parts = []\n",
        "\n",
        "# Loop untuk menghitung probabilitas total dan membangun string rumus detail\n",
        "for word in tokens:\n",
        " prob_value = unigram_probabilities[word]\n",
        " p_kalimat *= prob_value\n",
        " # Format: P(word)=prob_value\n",
        " prob_parts.append(f\"P({word})={prob_value:.2f}\")\n",
        "\n",
        "# Gabungkan bagian-bagian rumus untuk mendapatkan prob_str detail\n",
        "prob_str = \" x \".join(prob_parts)\n",
        "\n",
        "print(\"\\nProbabilitas Keseluruhan Kalimat (Model Unigram):\")\n",
        "print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.4f} ({p_kalimat*100:.2f}%)\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "1cub_VJnUJMl",
        "outputId": "1889eb61-4f3f-4780-f42e-02368076cce3"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Corpus: saya suka makan nasi\n",
            "Tokens (4): ['saya', 'suka', 'makan', 'nasi']\n",
            "\n",
            "Frekuensi Unigram dalam kalimat\n",
            " ('saya'): 1\n",
            " ('suka'): 1\n",
            " ('makan'): 1\n",
            " ('nasi'): 1\n",
            "\n",
            "Total unigram dalam 1 kalimat: 4\n",
            "\n",
            "Probabilitas masing-masing unigram:\n",
            " P(saya) = 0.25 (25.00%)\n",
            " P(suka) = 0.25 (25.00%)\n",
            " P(makan) = 0.25 (25.00%)\n",
            " P(nasi) = 0.25 (25.00%)\n",
            "\n",
            "Probabilitas Keseluruhan Kalimat (Model Unigram):\n",
            " P(saya suka makan nasi) = P(saya)=0.25 x P(suka)=0.25 x P(makan)=0.25 x P(nasi)=0.25 = 0.0039 (0.39%)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# **BIGRAM**"
      ],
      "metadata": {
        "id": "Vstwt996-FrS"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from collections import Counter\n",
        "from IPython.display import clear_output\n",
        "import math\n",
        "\n",
        "# 1. Input Kalimat dan Tokenisasi\n",
        "kalimat = input(\"Masukkan kalimat: \").strip()\n",
        "\n",
        "# Bersihkan output (khusus lingkungan notebook)\n",
        "try:\n",
        " clear_output()\n",
        "except:\n",
        " pass\n",
        "\n",
        "print(f\"Corpus: {kalimat}\")\n",
        "\n",
        "# Tokenisasi\n",
        "tokens = kalimat.lower().split()\n",
        "print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
        "\n",
        "# 2. Hitung Frekuensi Unigram dan Bigram\n",
        "unigram_counts = Counter(tokens)\n",
        "bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]\n",
        "bigram_counts = Counter(bigrams)\n",
        "\n",
        "print(\"\\nFrekuensi Bigram dalam kalimat:\")\n",
        "for pair, count in bigram_counts.items():\n",
        " print(f\" {pair}: {count}\")\n",
        "print(f\"\\nTotal bigram dalam 1 kalimat: {sum(bigram_counts.values())}\")\n",
        "\n",
        "# 3. Hitung Probabilitas Bigram: P(w2 | w1) = Count(w1,w2) / Count(w1)\n",
        "bigram_probabilities = {}\n",
        "for (w1, w2), count in bigram_counts.items():\n",
        " prob = count / unigram_counts[w1]\n",
        " bigram_probabilities[(w1, w2)] = prob\n",
        "\n",
        "print(\"\\nProbabilitas masing-masing bigram:\")\n",
        "for (w1, w2), prob in bigram_probabilities.items():\n",
        " print(f\" P({w2}|{w1}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
        "\n",
        "# 4. Hitung Probabilitas Kalimat Keseluruhan (Model Bigram)\n",
        "# P(kalimat) = P(w1) * P(w2|w1) * P(w3|w2) * ...\n",
        "total_tokens = sum(unigram_counts.values())\n",
        "p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens # P(w1)\n",
        "p_kalimat = p_w1 # Inisialisasi dengan P(w1)\n",
        "\n",
        "prob_str_parts = [f\"P({tokens[0]})={p_w1:.2f}\"] # Tambahkan P(w1) ke rumus\n",
        "\n",
        "for i in range(1, len(tokens)):\n",
        " pair = (tokens[i-1], tokens[i])\n",
        " p = bigram_probabilities.get(pair, 0)\n",
        " p_kalimat *= p\n",
        " prob_str_parts.append(f\"P({pair[1]}|{pair[0]})={p:.2f}\")\n",
        "\n",
        "# Gabungkan rumus perkalian untuk ditampilkan\n",
        "prob_str = \" x \".join(prob_str_parts)\n",
        "\n",
        "print(\"\\nProbabilitas Keseluruhan Kalimat (Model Bigram):\")\n",
        "print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "XRIY4qgTVbjl",
        "outputId": "ea6e62ce-45a0-40c9-ca98-1fcc30558479"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Corpus: saya makan nasi dan saya makan roti\n",
            "Tokens (7): ['saya', 'makan', 'nasi', 'dan', 'saya', 'makan', 'roti']\n",
            "\n",
            "Frekuensi Bigram dalam kalimat:\n",
            " ('saya', 'makan'): 2\n",
            " ('makan', 'nasi'): 1\n",
            " ('nasi', 'dan'): 1\n",
            " ('dan', 'saya'): 1\n",
            " ('makan', 'roti'): 1\n",
            "\n",
            "Total bigram dalam 1 kalimat: 6\n",
            "\n",
            "Probabilitas masing-masing bigram:\n",
            " P(makan|saya) = 1.00 (100.00%)\n",
            " P(nasi|makan) = 0.50 (50.00%)\n",
            " P(dan|nasi) = 1.00 (100.00%)\n",
            " P(saya|dan) = 1.00 (100.00%)\n",
            " P(roti|makan) = 0.50 (50.00%)\n",
            "\n",
            "Probabilitas Keseluruhan Kalimat (Model Bigram):\n",
            " P(saya makan nasi dan saya makan roti) = P(saya)=0.29 x P(makan|saya)=1.00 x P(nasi|makan)=0.50 x P(dan|nasi)=1.00 x P(saya|dan)=1.00 x P(makan|saya)=1.00 x P(roti|makan)=0.50 = 0.071429 (7.14%)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# **TRIGRAM**"
      ],
      "metadata": {
        "id": "E6n1IU8X-G9S"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from collections import Counter\n",
        "from IPython.display import clear_output\n",
        "import math\n",
        "\n",
        "# 1. Input Kalimat dan Tokenisasi\n",
        "kalimat = input(\"Masukkan kalimat: \").strip()\n",
        "\n",
        "# Bersihkan output (khusus lingkungan notebook)\n",
        "try:\n",
        " clear_output()\n",
        "except:\n",
        " pass\n",
        "\n",
        "print(f\"Corpus: {kalimat}\")\n",
        "\n",
        "# Tokenisasi\n",
        "tokens = kalimat.lower().split()\n",
        "print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
        "\n",
        "# 2. Hitung Frekuensi Bigram dan Trigram\n",
        "bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]\n",
        "trigrams = [(tokens[i], tokens[i+1], tokens[i+2]) for i in range(len(tokens) - 2)]\n",
        "\n",
        "bigram_counts = Counter(bigrams)\n",
        "trigram_counts = Counter(trigrams)\n",
        "\n",
        "print(\"\\nFrekuensi Trigram dalam kalimat:\")\n",
        "for tg, count in trigram_counts.items():\n",
        " print(f\" {tg}: {count}\")\n",
        "print(f\"\\nTotal trigram dalam 1 kalimat: {sum(trigram_counts.values())}\")\n",
        "\n",
        "# 3. Hitung Probabilitas Trigram: P(w3 | w1, w2) = Count(w1,w2,w3) / Count(w1,w2)\n",
        "trigram_probabilities = {}\n",
        "for (w1, w2, w3), count in trigram_counts.items():\n",
        " # Hindari pembagian dengan nol (jika ada bigram yang tidak muncul)\n",
        " if bigram_counts[(w1, w2)] > 0:\n",
        " prob = count / bigram_counts[(w1, w2)]\n",
        " else:\n",
        " prob = 0\n",
        " trigram_probabilities[(w1, w2, w3)] = prob\n",
        "\n",
        "print(\"\\nProbabilitas masing-masing trigram:\")\n",
        "for (w1, w2, w3), prob in trigram_probabilities.items():\n",
        " print(f\" P({w3}|{w1},{w2}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
        "\n",
        "# Tambahkan perhitungan Unigram Count (dibutuhkan untuk P(w1) dan P(w2|w1))\n",
        "unigram_counts = Counter(tokens)\n",
        "total_tokens = sum(unigram_counts.values())\n",
        "\n",
        "# 4. Hitung Probabilitas Kalimat Keseluruhan (Model Trigram)\n",
        "# P(kalimat) = P(w1) * P(w2|w1) * P(w3|w1,w2) * ...\n",
        "\n",
        "# a. P(w1)\n",
        "p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens if total_tokens > 0 else 0\n",
        "\n",
        "# b. P(w2|w1) (Menggunakan Bigram tanpa smoothing)\n",
        "if len(tokens) > 1:\n",
        " count_w1 = unigram_counts.get(tokens[0], 1) # Hindari pembagian dengan 0\n",
        " p_w2_w1 = bigram_counts.get((tokens[0], tokens[1]), 0) / count_w1\n",
        "else:\n",
        " p_w2_w1 = 1.0 # Jika hanya 1 kata\n",
        "\n",
        "p_kalimat = p_w1 * p_w2_w1 # Inisialisasi dengan P(w1) * P(w2|w1)\n",
        "\n",
        "# Daftar bagian rumus untuk ditampilkan\n",
        "prob_str_parts = [f\"P({tokens[0]})={p_w1:.2f}\"]\n",
        "if len(tokens) > 1:\n",
        " prob_str_parts.append(f\"P({tokens[1]}|{tokens[0]})={p_w2_w1:.2f}\")\n",
        "\n",
        "# c. Perkalian Trigram P(wi | wi-2, wi-1) untuk i >= 3\n",
        "for i in range(len(tokens) - 2):\n",
        " triplet = (tokens[i], tokens[i+1], tokens[i+2])\n",
        " p = trigram_probabilities.get(triplet, 0)\n",
        " p_kalimat *= p\n",
        " prob_str_parts.append(f\"P({triplet[2]}|{triplet[0]},{triplet[1]})={p:.2f}\")\n",
        "\n",
        "prob_str = \" x \".join(prob_str_parts)\n",
        "\n",
        "print(\"\\nProbabilitas Keseluruhan Kalimat (Model Trigram):\")\n",
        "print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)\")\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "BIRARsj2FHJg",
        "outputId": "968d420e-9370-40e5-e7e1-148e1d351d62"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Corpus: mahasiswa mengerjakan tugas kemudian mahasiswa upload e-learning\n",
            "Tokens (7): ['mahasiswa', 'mengerjakan', 'tugas', 'kemudian', 'mahasiswa', 'upload', 'e-learning']\n",
            "\n",
            "Frekuensi Trigram dalam kalimat:\n",
            " ('mahasiswa', 'mengerjakan', 'tugas'): 1\n",
            " ('mengerjakan', 'tugas', 'kemudian'): 1\n",
            " ('tugas', 'kemudian', 'mahasiswa'): 1\n",
            " ('kemudian', 'mahasiswa', 'upload'): 1\n",
            " ('mahasiswa', 'upload', 'e-learning'): 1\n",
            "\n",
            "Total trigram dalam 1 kalimat: 5\n",
            "\n",
            "Probabilitas masing-masing trigram:\n",
            " P(tugas|mahasiswa,mengerjakan) = 1.00 (100.00%)\n",
            " P(kemudian|mengerjakan,tugas) = 1.00 (100.00%)\n",
            " P(mahasiswa|tugas,kemudian) = 1.00 (100.00%)\n",
            " P(upload|kemudian,mahasiswa) = 1.00 (100.00%)\n",
            " P(e-learning|mahasiswa,upload) = 1.00 (100.00%)\n",
            "\n",
            "Probabilitas Keseluruhan Kalimat (Model Trigram):\n",
            " P(mahasiswa mengerjakan tugas kemudian mahasiswa upload e-learning) = P(mahasiswa)=0.29 x P(mengerjakan|mahasiswa)=0.50 x P(tugas|mahasiswa,mengerjakan)=1.00 x P(kemudian|mengerjakan,tugas)=1.00 x P(mahasiswa|tugas,kemudian)=1.00 x P(upload|kemudian,mahasiswa)=1.00 x P(e-learning|mahasiswa,upload)=1.00 = 0.142857 (14.29%)\n"
          ]
        }
      ]
    }
  ]
}
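Note: the bigram and trigram models above use unsmoothed maximum-likelihood estimates, so any n-gram that never occurs in the corpus drives the whole sentence probability to zero (the code's .get(pair, 0) fallback). A sketch of add-one (Laplace) smoothing, which is NOT part of the committed notebooks and is shown here only as a possible extension:

# Hypothetical extension, not in the commit: Laplace-smoothed bigram
# probabilities, P(w2|w1) = (Count(w1,w2) + 1) / (Count(w1) + V).
from collections import Counter

tokens = "saya makan nasi dan saya makan roti".split()  # sample sentence from the outputs above
unigram_counts = Counter(tokens)
bigram_counts = Counter(zip(tokens, tokens[1:]))
V = len(unigram_counts)  # vocabulary size

def p_laplace(w1, w2):
    # Every bigram, seen or unseen, now gets a nonzero probability.
    return (bigram_counts[(w1, w2)] + 1) / (unigram_counts[w1] + V)

print(f"P(nasi|makan) = {p_laplace('makan', 'nasi'):.3f}")  # seen bigram
print(f"P(roti|nasi)  = {p_laplace('nasi', 'roti'):.3f}")   # unseen bigram, now > 0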
.ipynb_checkpoints/README-checkpoint.md (Normal file, 1 line added)
@@ -0,0 +1 @@
# Kompilasi Materi Praktikum
.virtual_documents/Fitur_Ekstraksi_BOW.ipynb (Normal file, 75 lines added)
@@ -0,0 +1,75 @@
# Input jumlah dokumen
import pandas as pd
n = int(input("Masukkan jumlah dokumen yang ingin dimasukkan: "))


# Input teks dokumen satu per satu
documents = []
for i in range(n):
    teks = input(f"Masukkan teks untuk dokumen ke-{i+1}: ")
    documents.append(teks)

print("\n=== Dokumen yang Dimasukkan ===")
for i, doc in enumerate(documents):
    print(f"Doc {i+1}: {doc}")


# Tahap Tokenisasi
tokenized_docs = []
for doc in documents:
    tokens = doc.lower().split()
    tokenized_docs.append(tokens)

print("\n=== Hasil Tokenisasi ===")
for i, tokens in enumerate(tokenized_docs):
    print(f"Doc {i+1}: {tokens}")


# Pembuatan Corpus
corpus_all = [word for doc in tokenized_docs for word in doc]

print("\n=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===")
print(corpus_all)
print(f"Jumlah total kata dalam seluruh dokumen: {len(corpus_all)}")


# Pembuatan Vocabulary
vocabulary = sorted(set(corpus_all))

print("\n=== Vocabulary (Kata Unik) ===")
print(vocabulary)
print(f"Jumlah kata unik (vocabulary size): {len(vocabulary)}")


vocabulary = sorted(set(corpus_all))

print("\n=== Vocabulary (Kata Unik) ===")
for idx, word in enumerate(vocabulary, start=1):
    print(f"{idx:>2}. {word}")
print(f"\nJumlah kata unik (vocabulary size): {len(vocabulary)}")


# Representasi Numerik (Matriks BoW)
bow_matrix = []
for doc in tokenized_docs:
    vector = [doc.count(word) for word in vocabulary]
    bow_matrix.append(vector)


df_bow = pd.DataFrame(bow_matrix, columns=vocabulary)
df_bow.index = [f"D{i}" for i in range(1, len(documents)+1)] # ubah label indeks jadi D1, D2, D3

print("\n=== Matriks Bag of Words ===")
print(df_bow)


# Membuat Tabel Frekuensi Kata (Total dari seluruh dokumen)
word_frequencies = df_bow.sum().sort_values(ascending=False).reset_index()
word_frequencies.columns = ["Kata", "Frekuensi"]

print("\n=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===")
print(word_frequencies)
print(f"Frekuensi kata: {len(word_frequencies)}")
.virtual_documents/IFD.ipynb (Normal file, 209 lines added)
@@ -0,0 +1,209 @@
from collections import Counter
from IPython.display import clear_output
import math

# 1. Input Kalimat dan Tokenisasi
kalimat = input("Masukkan kalimat: ").strip()

# Bersihkan output (khusus lingkungan notebook)
try:
    clear_output()
except:
    pass

print(f"Corpus: {kalimat}")

# Tokenize
tokens = kalimat.lower().split()
print(f"Tokens ({len(tokens)}): {tokens}")

# 2. Hitung Frekuensi Unigram
unigram_counts = Counter(tokens)
total_tokens = sum(unigram_counts.values())

print("\nFrekuensi Unigram dalam kalimat")
for pair, count in unigram_counts.items():
    print(f" ('{pair}'): {count}")
print(f"\nTotal unigram dalam 1 kalimat: {total_tokens}")

# 3. Hitung Probabilitas Unigram: P(wi) = Count(wi) / Total Kata
unigram_probabilities = {}
for word, count in unigram_counts.items():
    prob = count / total_tokens
    unigram_probabilities[word] = prob

print("\nProbabilitas masing-masing unigram:")
for word, prob in unigram_probabilities.items():
    print(f" P({word}) = {prob:.2f} ({prob*100:.2f}%)")

# 4. Hitung Probabilitas Kalimat Keseluruhan (P(kalimat) = P(w1) * P(w2) * ...)
p_kalimat = 1
prob_parts = []

# Loop untuk menghitung probabilitas total dan membangun string rumus detail
for word in tokens:
    prob_value = unigram_probabilities[word]
    p_kalimat *= prob_value
    # Format: P(word)=prob_value
    prob_parts.append(f"P({word})={prob_value:.2f}")

# Gabungkan bagian-bagian rumus untuk mendapatkan prob_str detail
prob_str = " x ".join(prob_parts)

print("\nProbabilitas Keseluruhan Kalimat (Model Unigram):")
print(f" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.4f} ({p_kalimat*100:.2f}%)")


from collections import Counter
from IPython.display import clear_output
import math

# 1. Input Kalimat dan Tokenisasi
kalimat = input("Masukkan kalimat: ").strip()

# Bersihkan output (khusus lingkungan notebook)
try:
    clear_output()
except:
    pass

print(f"Corpus: {kalimat}")

# Tokenisasi
tokens = kalimat.lower().split()
print(f"Tokens ({len(tokens)}): {tokens}")

# 2. Hitung Frekuensi Unigram dan Bigram
unigram_counts = Counter(tokens)
bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]
bigram_counts = Counter(bigrams)

print("\nFrekuensi Bigram dalam kalimat:")
for pair, count in bigram_counts.items():
    print(f" {pair}: {count}")
print(f"\nTotal bigram dalam 1 kalimat: {sum(bigram_counts.values())}")

# 3. Hitung Probabilitas Bigram: P(w2 | w1) = Count(w1,w2) / Count(w1)
bigram_probabilities = {}
for (w1, w2), count in bigram_counts.items():
    prob = count / unigram_counts[w1]
    bigram_probabilities[(w1, w2)] = prob

print("\nProbabilitas masing-masing bigram:")
for (w1, w2), prob in bigram_probabilities.items():
    print(f" P({w2}|{w1}) = {prob:.2f} ({prob*100:.2f}%)")

# 4. Hitung Probabilitas Kalimat Keseluruhan (Model Bigram)
# P(kalimat) = P(w1) * P(w2|w1) * P(w3|w2) * ...
total_tokens = sum(unigram_counts.values())
p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens # P(w1)
p_kalimat = p_w1 # Inisialisasi dengan P(w1)

prob_str_parts = [f"P({tokens[0]})={p_w1:.2f}"] # Tambahkan P(w1) ke rumus

for i in range(1, len(tokens)):
    pair = (tokens[i-1], tokens[i])
    p = bigram_probabilities.get(pair, 0)
    p_kalimat *= p
    prob_str_parts.append(f"P({pair[1]}|{pair[0]})={p:.2f}")

# Gabungkan rumus perkalian untuk ditampilkan
prob_str = " x ".join(prob_str_parts)

print("\nProbabilitas Keseluruhan Kalimat (Model Bigram):")
print(f" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)")


from collections import Counter
from IPython.display import clear_output
import math

# 1. Input Kalimat dan Tokenisasi
kalimat = input("Masukkan kalimat: ").strip()

# Bersihkan output (khusus lingkungan notebook)
try:
    clear_output()
except:
    pass

print(f"Corpus: {kalimat}")

# Tokenisasi
tokens = kalimat.lower().split()
print(f"Tokens ({len(tokens)}): {tokens}")

# 2. Hitung Frekuensi Bigram dan Trigram
bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]
trigrams = [(tokens[i], tokens[i+1], tokens[i+2]) for i in range(len(tokens) - 2)]

bigram_counts = Counter(bigrams)
trigram_counts = Counter(trigrams)

print("\nFrekuensi Trigram dalam kalimat:")
for tg, count in trigram_counts.items():
    print(f" {tg}: {count}")
print(f"\nTotal trigram dalam 1 kalimat: {sum(trigram_counts.values())}")

# 3. Hitung Probabilitas Trigram: P(w3 | w1, w2) = Count(w1,w2,w3) / Count(w1,w2)
trigram_probabilities = {}
for (w1, w2, w3), count in trigram_counts.items():
    # Hindari pembagian dengan nol (jika ada bigram yang tidak muncul)
    if bigram_counts[(w1, w2)] > 0:
        prob = count / bigram_counts[(w1, w2)]
    else:
        prob = 0
    trigram_probabilities[(w1, w2, w3)] = prob

print("\nProbabilitas masing-masing trigram:")
for (w1, w2, w3), prob in trigram_probabilities.items():
    print(f" P({w3}|{w1},{w2}) = {prob:.2f} ({prob*100:.2f}%)")

# Tambahkan perhitungan Unigram Count (dibutuhkan untuk P(w1) dan P(w2|w1))
unigram_counts = Counter(tokens)
total_tokens = sum(unigram_counts.values())

# 4. Hitung Probabilitas Kalimat Keseluruhan (Model Trigram)
# P(kalimat) = P(w1) * P(w2|w1) * P(w3|w1,w2) * ...

# a. P(w1)
p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens if total_tokens > 0 else 0

# b. P(w2|w1) (Menggunakan Bigram tanpa smoothing)
if len(tokens) > 1:
    count_w1 = unigram_counts.get(tokens[0], 1) # Hindari pembagian dengan 0
    p_w2_w1 = bigram_counts.get((tokens[0], tokens[1]), 0) / count_w1
else:
    p_w2_w1 = 1.0 # Jika hanya 1 kata

p_kalimat = p_w1 * p_w2_w1 # Inisialisasi dengan P(w1) * P(w2|w1)

# Daftar bagian rumus untuk ditampilkan
prob_str_parts = [f"P({tokens[0]})={p_w1:.2f}"]
if len(tokens) > 1:
    prob_str_parts.append(f"P({tokens[1]}|{tokens[0]})={p_w2_w1:.2f}")

# c. Perkalian Trigram P(wi | wi-2, wi-1) untuk i >= 3
for i in range(len(tokens) - 2):
    triplet = (tokens[i], tokens[i+1], tokens[i+2])
    p = trigram_probabilities.get(triplet, 0)
    p_kalimat *= p
    prob_str_parts.append(f"P({triplet[2]}|{triplet[0]},{triplet[1]})={p:.2f}")

prob_str = " x ".join(prob_str_parts)

print("\nProbabilitas Keseluruhan Kalimat (Model Trigram):")
print(f" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)")
.virtual_documents/N-Gram.ipynb (Normal file, 209 lines added)
@@ -0,0 +1,209 @@
from collections import Counter
from IPython.display import clear_output
import math

# 1. Input Kalimat dan Tokenisasi
kalimat = input("Masukkan kalimat: ").strip()

# Bersihkan output (khusus lingkungan notebook)
try:
    clear_output()
except:
    pass

print(f"Corpus: {kalimat}")

# Tokenize
tokens = kalimat.lower().split()
print(f"Tokens ({len(tokens)}): {tokens}")

# 2. Hitung Frekuensi Unigram
unigram_counts = Counter(tokens)
total_tokens = sum(unigram_counts.values())

print("\nFrekuensi Unigram dalam kalimat")
for pair, count in unigram_counts.items():
    print(f" ('{pair}'): {count}")
print(f"\nTotal unigram dalam 1 kalimat: {total_tokens}")

# 3. Hitung Probabilitas Unigram: P(wi) = Count(wi) / Total Kata
unigram_probabilities = {}
for word, count in unigram_counts.items():
    prob = count / total_tokens
    unigram_probabilities[word] = prob

print("\nProbabilitas masing-masing unigram:")
for word, prob in unigram_probabilities.items():
    print(f" P({word}) = {prob:.2f} ({prob*100:.2f}%)")

# 4. Hitung Probabilitas Kalimat Keseluruhan (P(kalimat) = P(w1) * P(w2) * ...)
p_kalimat = 1
prob_parts = []

# Loop untuk menghitung probabilitas total dan membangun string rumus detail
for word in tokens:
    prob_value = unigram_probabilities[word]
    p_kalimat *= prob_value
    # Format: P(word)=prob_value
    prob_parts.append(f"P({word})={prob_value:.2f}")

# Gabungkan bagian-bagian rumus untuk mendapatkan prob_str detail
prob_str = " x ".join(prob_parts)

print("\nProbabilitas Keseluruhan Kalimat (Model Unigram):")
print(f" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.4f} ({p_kalimat*100:.2f}%)")


from collections import Counter
from IPython.display import clear_output
import math

# 1. Input Kalimat dan Tokenisasi
kalimat = input("Masukkan kalimat: ").strip()

# Bersihkan output (khusus lingkungan notebook)
try:
    clear_output()
except:
    pass

print(f"Corpus: {kalimat}")

# Tokenisasi
tokens = kalimat.lower().split()
print(f"Tokens ({len(tokens)}): {tokens}")

# 2. Hitung Frekuensi Unigram dan Bigram
unigram_counts = Counter(tokens)
bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]
bigram_counts = Counter(bigrams)

print("\nFrekuensi Bigram dalam kalimat:")
for pair, count in bigram_counts.items():
    print(f" {pair}: {count}")
print(f"\nTotal bigram dalam 1 kalimat: {sum(bigram_counts.values())}")

# 3. Hitung Probabilitas Bigram: P(w2 | w1) = Count(w1,w2) / Count(w1)
bigram_probabilities = {}
for (w1, w2), count in bigram_counts.items():
    prob = count / unigram_counts[w1]
    bigram_probabilities[(w1, w2)] = prob

print("\nProbabilitas masing-masing bigram:")
for (w1, w2), prob in bigram_probabilities.items():
    print(f" P({w2}|{w1}) = {prob:.2f} ({prob*100:.2f}%)")

# 4. Hitung Probabilitas Kalimat Keseluruhan (Model Bigram)
# P(kalimat) = P(w1) * P(w2|w1) * P(w3|w2) * ...
total_tokens = sum(unigram_counts.values())
p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens # P(w1)
p_kalimat = p_w1 # Inisialisasi dengan P(w1)

prob_str_parts = [f"P({tokens[0]})={p_w1:.2f}"] # Tambahkan P(w1) ke rumus

for i in range(1, len(tokens)):
    pair = (tokens[i-1], tokens[i])
    p = bigram_probabilities.get(pair, 0)
    p_kalimat *= p
    prob_str_parts.append(f"P({pair[1]}|{pair[0]})={p:.2f}")

# Gabungkan rumus perkalian untuk ditampilkan
prob_str = " x ".join(prob_str_parts)

print("\nProbabilitas Keseluruhan Kalimat (Model Bigram):")
print(f" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)")


from collections import Counter
from IPython.display import clear_output
import math

# 1. Input Kalimat dan Tokenisasi
kalimat = input("Masukkan kalimat: ").strip()

# Bersihkan output (khusus lingkungan notebook)
try:
    clear_output()
except:
    pass

print(f"Corpus: {kalimat}")

# Tokenisasi
tokens = kalimat.lower().split()
print(f"Tokens ({len(tokens)}): {tokens}")

# 2. Hitung Frekuensi Bigram dan Trigram
bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]
trigrams = [(tokens[i], tokens[i+1], tokens[i+2]) for i in range(len(tokens) - 2)]

bigram_counts = Counter(bigrams)
trigram_counts = Counter(trigrams)

print("\nFrekuensi Trigram dalam kalimat:")
for tg, count in trigram_counts.items():
    print(f" {tg}: {count}")
print(f"\nTotal trigram dalam 1 kalimat: {sum(trigram_counts.values())}")

# 3. Hitung Probabilitas Trigram: P(w3 | w1, w2) = Count(w1,w2,w3) / Count(w1,w2)
trigram_probabilities = {}
for (w1, w2, w3), count in trigram_counts.items():
    # Hindari pembagian dengan nol (jika ada bigram yang tidak muncul)
    if bigram_counts[(w1, w2)] > 0:
        prob = count / bigram_counts[(w1, w2)]
    else:
        prob = 0
    trigram_probabilities[(w1, w2, w3)] = prob

print("\nProbabilitas masing-masing trigram:")
for (w1, w2, w3), prob in trigram_probabilities.items():
    print(f" P({w3}|{w1},{w2}) = {prob:.2f} ({prob*100:.2f}%)")

# Tambahkan perhitungan Unigram Count (dibutuhkan untuk P(w1) dan P(w2|w1))
unigram_counts = Counter(tokens)
total_tokens = sum(unigram_counts.values())

# 4. Hitung Probabilitas Kalimat Keseluruhan (Model Trigram)
# P(kalimat) = P(w1) * P(w2|w1) * P(w3|w1,w2) * ...

# a. P(w1)
p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens if total_tokens > 0 else 0

# b. P(w2|w1) (Menggunakan Bigram tanpa smoothing)
if len(tokens) > 1:
    count_w1 = unigram_counts.get(tokens[0], 1) # Hindari pembagian dengan 0
    p_w2_w1 = bigram_counts.get((tokens[0], tokens[1]), 0) / count_w1
else:
    p_w2_w1 = 1.0 # Jika hanya 1 kata

p_kalimat = p_w1 * p_w2_w1 # Inisialisasi dengan P(w1) * P(w2|w1)

# Daftar bagian rumus untuk ditampilkan
prob_str_parts = [f"P({tokens[0]})={p_w1:.2f}"]
if len(tokens) > 1:
    prob_str_parts.append(f"P({tokens[1]}|{tokens[0]})={p_w2_w1:.2f}")

# c. Perkalian Trigram P(wi | wi-2, wi-1) untuk i >= 3
for i in range(len(tokens) - 2):
    triplet = (tokens[i], tokens[i+1], tokens[i+2])
    p = trigram_probabilities.get(triplet, 0)
    p_kalimat *= p
    prob_str_parts.append(f"P({triplet[2]}|{triplet[0]},{triplet[1]})={p:.2f}")

prob_str = " x ".join(prob_str_parts)

print("\nProbabilitas Keseluruhan Kalimat (Model Trigram):")
print(f" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)")
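Note: in the notation of the comments above (# P(kalimat) = P(w1) * P(w2|w1) * P(w3|w1,w2) * ...), the trigram section approximates the sentence probability via the chain rule with a second-order Markov assumption, which can be written as

P(w_1 \dots w_n) \approx P(w_1)\, P(w_2 \mid w_1) \prod_{i=3}^{n} P(w_i \mid w_{i-2}, w_{i-1})

where each conditional is estimated by a count ratio, e.g. P(w_3 \mid w_1, w_2) = \mathrm{Count}(w_1, w_2, w_3) / \mathrm{Count}(w_1, w_2), exactly as the code computes it.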
.virtual_documents/Untitled.ipynb (Normal file, 1 line added)
@@ -0,0 +1 @@

NLP/Fitur_Ekstraksi_BOW.ipynb (Normal file, 310 lines added)
@@ -0,0 +1,310 @@
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "qBYcPYAb059g",
        "outputId": "9f57b704-da1b-4495-d366-24c30586dc76"
      },
      "outputs": [
        {
          "name": "stdin",
          "output_type": "stream",
          "text": [
            "Masukkan jumlah dokumen yang ingin dimasukkan: 3\n"
          ]
        }
      ],
      "source": [
        "# Input jumlah dokumen\n",
        "import pandas as pd\n",
        "n = int(input(\"Masukkan jumlah dokumen yang ingin dimasukkan: \"))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "mo-yt5Ob1N8j",
        "outputId": "362ac3e0-d84b-4014-db96-cc3b10ecdb32"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Masukkan teks untuk dokumen ke-1: saya belajar nlp di kampus\n",
            "Masukkan teks untuk dokumen ke-2: saya suka belajar ai\n",
            "Masukkan teks untuk dokumen ke-3: mahasiswa belajar data science dan nlp\n",
            "\n",
            "=== Dokumen yang Dimasukkan ===\n",
            "Doc 1: saya belajar nlp di kampus\n",
            "Doc 2: saya suka belajar ai\n",
            "Doc 3: mahasiswa belajar data science dan nlp\n"
          ]
        }
      ],
      "source": [
        "# Input teks dokumen satu per satu\n",
        "documents = []\n",
        "for i in range(n):\n",
        " teks = input(f\"Masukkan teks untuk dokumen ke-{i+1}: \")\n",
        " documents.append(teks)\n",
        "\n",
        "print(\"\\n=== Dokumen yang Dimasukkan ===\")\n",
        "for i, doc in enumerate(documents):\n",
        " print(f\"Doc {i+1}: {doc}\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "FkmxRAFq1oDK",
        "outputId": "62c4508e-1725-4f30-fbdb-4de8072498b2"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\n",
            "=== Hasil Tokenisasi ===\n",
            "Doc 1: ['saya', 'belajar', 'nlp', 'di', 'kampus']\n",
            "Doc 2: ['saya', 'suka', 'belajar', 'ai']\n",
            "Doc 3: ['mahasiswa', 'belajar', 'data', 'science', 'dan', 'nlp']\n"
          ]
        }
      ],
      "source": [
        "# Tahap Tokenisasi\n",
        "tokenized_docs = []\n",
        "for doc in documents:\n",
        " tokens = doc.lower().split()\n",
        " tokenized_docs.append(tokens)\n",
        "\n",
        "print(\"\\n=== Hasil Tokenisasi ===\")\n",
        "for i, tokens in enumerate(tokenized_docs):\n",
        " print(f\"Doc {i+1}: {tokens}\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 4,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ybC1Vo2C_c3q",
        "outputId": "fa31c57e-5364-4ded-fcd0-54d0db46c34b"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\n",
            "=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\n",
            "['saya', 'belajar', 'nlp', 'di', 'kampus', 'saya', 'suka', 'belajar', 'ai', 'mahasiswa', 'belajar', 'data', 'science', 'dan', 'nlp']\n",
            "Jumlah total kata dalam seluruh dokumen: 15\n"
          ]
        }
      ],
      "source": [
        "# Pembuatan Corpus\n",
        "corpus_all = [word for doc in tokenized_docs for word in doc]\n",
        "\n",
        "print(\"\\n=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\")\n",
        "print(corpus_all)\n",
        "print(f\"Jumlah total kata dalam seluruh dokumen: {len(corpus_all)}\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 5,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "s6S-Ma4R1xuq",
        "outputId": "98c3685b-1798-4038-d17e-6e45ca419b51"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\n",
            "=== Vocabulary (Kata Unik) ===\n",
            "['ai', 'belajar', 'dan', 'data', 'di', 'kampus', 'mahasiswa', 'nlp', 'saya', 'science', 'suka']\n",
            "Jumlah kata unik (vocabulary size): 11\n",
            "\n",
            "=== Vocabulary (Kata Unik) ===\n",
            " 1. ai\n",
            " 2. belajar\n",
            " 3. dan\n",
            " 4. data\n",
            " 5. di\n",
            " 6. kampus\n",
            " 7. mahasiswa\n",
            " 8. nlp\n",
            " 9. saya\n",
            "10. science\n",
            "11. suka\n",
            "\n",
            "Jumlah kata unik (vocabulary size): 11\n"
          ]
        }
      ],
      "source": [
        "# Pembuatan Vocabulary\n",
        "vocabulary = sorted(set(corpus_all))\n",
        "\n",
        "print(\"\\n=== Vocabulary (Kata Unik) ===\")\n",
        "print(vocabulary)\n",
        "print(f\"Jumlah kata unik (vocabulary size): {len(vocabulary)}\")\n",
        "\n",
        "\n",
        "vocabulary = sorted(set(corpus_all))\n",
        "\n",
        "print(\"\\n=== Vocabulary (Kata Unik) ===\")\n",
        "for idx, word in enumerate(vocabulary, start=1):\n",
        " print(f\"{idx:>2}. {word}\")\n",
        "print(f\"\\nJumlah kata unik (vocabulary size): {len(vocabulary)}\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 6,
      "metadata": {
        "id": "ShevCTva2Fg9"
      },
      "outputs": [],
      "source": [
        "# Representasi Numerik (Matriks BoW)\n",
        "bow_matrix = []\n",
        "for doc in tokenized_docs:\n",
        " vector = [doc.count(word) for word in vocabulary]\n",
        " bow_matrix.append(vector)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 7,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "-yB6D2pY2M0E",
        "outputId": "b6b2f4d3-da8b-4aee-e9ce-034def4d5cf7"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\n",
            "=== Matriks Bag of Words ===\n",
            " ai belajar dan data di kampus mahasiswa nlp saya science suka\n",
            "D1 0 1 0 0 1 1 0 1 1 0 0\n",
            "D2 1 1 0 0 0 0 0 0 1 0 1\n",
            "D3 0 1 1 1 0 0 1 1 0 1 0\n"
          ]
        }
      ],
      "source": [
        "df_bow = pd.DataFrame(bow_matrix, columns=vocabulary)\n",
        "df_bow.index = [f\"D{i}\" for i in range(1, len(documents)+1)] # ubah label indeks jadi D1, D2, D3\n",
        "\n",
        "print(\"\\n=== Matriks Bag of Words ===\")\n",
        "print(df_bow)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 8,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "8ruf5vKL2rGD",
        "outputId": "65a4674e-1c01-4833-ec55-f66f77b8b6c2"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "\n",
            "=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===\n",
            " Kata Frekuensi\n",
            "0 belajar 3\n",
            "1 nlp 2\n",
            "2 saya 2\n",
            "3 dan 1\n",
            "4 ai 1\n",
            "5 data 1\n",
            "6 di 1\n",
            "7 mahasiswa 1\n",
            "8 kampus 1\n",
            "9 science 1\n",
            "10 suka 1\n",
            "Frekuensi kata: 11\n"
          ]
        }
      ],
      "source": [
        "# Membuat Tabel Frekuensi Kata (Total dari seluruh dokumen)\n",
        "word_frequencies = df_bow.sum().sort_values(ascending=False).reset_index()\n",
        "word_frequencies.columns = [\"Kata\", \"Frekuensi\"]\n",
        "\n",
        "print(\"\\n=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===\")\n",
        "print(word_frequencies)\n",
        "print(f\"Frekuensi kata: {len(word_frequencies)}\")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "NQjExannHuj0"
      },
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3 (ipykernel)",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.12.2"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
}
380
NLP/N-Gram.ipynb
Normal file
380
NLP/N-Gram.ipynb
Normal file
@ -0,0 +1,380 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"id": "JVPdWpz3hhbj"
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {
|
||||||
|
"id": "4Mvva3v65h1v"
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"# **UNIGRAM**"
|
||||||
|
]
|
||||||
|
},
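{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cell below uses maximum-likelihood estimates: for a sentence of $N$ tokens, each unigram probability is\n",
"\n",
"$$P(w_i) = \\frac{C(w_i)}{N}$$\n",
"\n",
"and the unigram model scores the whole sentence as $P(w_1 \\dots w_n) = \\prod_{i=1}^{n} P(w_i)$, which is what the code computes."
]
},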
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "1cub_VJnUJMl",
"outputId": "1889eb61-4f3f-4780-f42e-02368076cce3"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Corpus: saya suka makan nasi\n",
"Tokens (4): ['saya', 'suka', 'makan', 'nasi']\n",
"\n",
"Unigram frequencies in the sentence\n",
" ('saya'): 1\n",
" ('suka'): 1\n",
" ('makan'): 1\n",
" ('nasi'): 1\n",
"\n",
"Total unigrams in the sentence: 4\n",
"\n",
"Probability of each unigram:\n",
" P(saya) = 0.25 (25.00%)\n",
" P(suka) = 0.25 (25.00%)\n",
" P(makan) = 0.25 (25.00%)\n",
" P(nasi) = 0.25 (25.00%)\n",
"\n",
"Overall sentence probability (unigram model):\n",
" P(saya suka makan nasi) = P(saya)=0.25 x P(suka)=0.25 x P(makan)=0.25 x P(nasi)=0.25 = 0.0039 (0.39%)\n"
]
}
],
"source": [
"from collections import Counter\n",
"from IPython.display import clear_output\n",
"\n",
"# 1. Read the sentence and tokenize\n",
"kalimat = input(\"Enter a sentence: \").strip()\n",
"\n",
"# Clear the input prompt (notebook environments only)\n",
"try:\n",
" clear_output()\n",
"except Exception:\n",
" pass\n",
"\n",
"print(f\"Corpus: {kalimat}\")\n",
"\n",
"# Tokenize\n",
"tokens = kalimat.lower().split()\n",
"print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
"\n",
"# 2. Count unigram frequencies\n",
"unigram_counts = Counter(tokens)\n",
"total_tokens = sum(unigram_counts.values())\n",
"\n",
"print(\"\\nUnigram frequencies in the sentence\")\n",
"for word, count in unigram_counts.items():\n",
" print(f\" ('{word}'): {count}\")\n",
"print(f\"\\nTotal unigrams in the sentence: {total_tokens}\")\n",
"\n",
"# 3. Unigram probability: P(wi) = Count(wi) / total tokens\n",
"unigram_probabilities = {}\n",
"for word, count in unigram_counts.items():\n",
" prob = count / total_tokens\n",
" unigram_probabilities[word] = prob\n",
"\n",
"print(\"\\nProbability of each unigram:\")\n",
"for word, prob in unigram_probabilities.items():\n",
" print(f\" P({word}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
"\n",
"# 4. Overall sentence probability: P(sentence) = P(w1) * P(w2) * ...\n",
"p_kalimat = 1\n",
"prob_parts = []\n",
"\n",
"# Accumulate the total probability and build the detailed formula string\n",
"for word in tokens:\n",
" prob_value = unigram_probabilities[word]\n",
" p_kalimat *= prob_value\n",
" # Format: P(word)=prob_value\n",
" prob_parts.append(f\"P({word})={prob_value:.2f}\")\n",
"\n",
"# Join the parts into the detailed formula string\n",
"prob_str = \" x \".join(prob_parts)\n",
"\n",
"print(\"\\nOverall sentence probability (unigram model):\")\n",
"print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.4f} ({p_kalimat*100:.2f}%)\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Vstwt996-FrS"
},
"source": [
"# **BIGRAM**"
]
},
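{
"cell_type": "markdown",
"metadata": {},
"source": [
"The bigram model conditions each word on the one before it:\n",
"\n",
"$$P(w_i \\mid w_{i-1}) = \\frac{C(w_{i-1}, w_i)}{C(w_{i-1})}$$\n",
"\n",
"so the sentence probability becomes $P(w_1) \\prod_{i=2}^{n} P(w_i \\mid w_{i-1})$, which is what the code below computes."
]
},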
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "XRIY4qgTVbjl",
"outputId": "ea6e62ce-45a0-40c9-ca98-1fcc30558479"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Corpus: saya makan nasi dan saya makan roti\n",
"Tokens (7): ['saya', 'makan', 'nasi', 'dan', 'saya', 'makan', 'roti']\n",
"\n",
"Bigram frequencies in the sentence:\n",
" ('saya', 'makan'): 2\n",
" ('makan', 'nasi'): 1\n",
" ('nasi', 'dan'): 1\n",
" ('dan', 'saya'): 1\n",
" ('makan', 'roti'): 1\n",
"\n",
"Total bigrams in the sentence: 6\n",
"\n",
"Probability of each bigram:\n",
" P(makan|saya) = 1.00 (100.00%)\n",
" P(nasi|makan) = 0.50 (50.00%)\n",
" P(dan|nasi) = 1.00 (100.00%)\n",
" P(saya|dan) = 1.00 (100.00%)\n",
" P(roti|makan) = 0.50 (50.00%)\n",
"\n",
"Overall sentence probability (bigram model):\n",
" P(saya makan nasi dan saya makan roti) = P(saya)=0.29 x P(makan|saya)=1.00 x P(nasi|makan)=0.50 x P(dan|nasi)=1.00 x P(saya|dan)=1.00 x P(makan|saya)=1.00 x P(roti|makan)=0.50 = 0.071429 (7.14%)\n"
]
}
],
"source": [
"from collections import Counter\n",
"from IPython.display import clear_output\n",
"\n",
"# 1. Read the sentence and tokenize\n",
"kalimat = input(\"Enter a sentence: \").strip()\n",
"\n",
"# Clear the input prompt (notebook environments only)\n",
"try:\n",
" clear_output()\n",
"except Exception:\n",
" pass\n",
"\n",
"print(f\"Corpus: {kalimat}\")\n",
"\n",
"# Tokenize\n",
"tokens = kalimat.lower().split()\n",
"print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
"\n",
"# 2. Count unigram and bigram frequencies\n",
"unigram_counts = Counter(tokens)\n",
"bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]\n",
"bigram_counts = Counter(bigrams)\n",
"\n",
"print(\"\\nBigram frequencies in the sentence:\")\n",
"for pair, count in bigram_counts.items():\n",
" print(f\" {pair}: {count}\")\n",
"print(f\"\\nTotal bigrams in the sentence: {sum(bigram_counts.values())}\")\n",
"\n",
"# 3. Bigram probability: P(w2 | w1) = Count(w1,w2) / Count(w1)\n",
"bigram_probabilities = {}\n",
"for (w1, w2), count in bigram_counts.items():\n",
" prob = count / unigram_counts[w1]\n",
" bigram_probabilities[(w1, w2)] = prob\n",
"\n",
"print(\"\\nProbability of each bigram:\")\n",
"for (w1, w2), prob in bigram_probabilities.items():\n",
" print(f\" P({w2}|{w1}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
"\n",
"# 4. Overall sentence probability (bigram model)\n",
"# P(sentence) = P(w1) * P(w2|w1) * P(w3|w2) * ...\n",
"total_tokens = sum(unigram_counts.values())\n",
"p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens # P(w1)\n",
"p_kalimat = p_w1 # start from P(w1)\n",
"\n",
"prob_str_parts = [f\"P({tokens[0]})={p_w1:.2f}\"] # add P(w1) to the formula\n",
"\n",
"for i in range(1, len(tokens)):\n",
" pair = (tokens[i-1], tokens[i])\n",
" p = bigram_probabilities.get(pair, 0)\n",
" p_kalimat *= p\n",
" prob_str_parts.append(f\"P({pair[1]}|{pair[0]})={p:.2f}\")\n",
"\n",
"# Join the product formula for display\n",
"prob_str = \" x \".join(prob_str_parts)\n",
"\n",
"print(\"\\nOverall sentence probability (bigram model):\")\n",
"print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)\")"
]
},
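{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A minimal sketch of add-one (Laplace) smoothing, not part of the model\n",
"# above: unseen bigrams make `bigram_probabilities.get(pair, 0)` return 0,\n",
"# which collapses the whole sentence probability to 0. Smoothing avoids\n",
"# that. Reuses `tokens`, `unigram_counts`, and `bigram_counts` from the\n",
"# previous cell.\n",
"V = len(unigram_counts)  # vocabulary size\n",
"\n",
"def smoothed_bigram_prob(w1, w2):\n",
"    # P(w2|w1) = (Count(w1,w2) + 1) / (Count(w1) + V)\n",
"    return (bigram_counts.get((w1, w2), 0) + 1) / (unigram_counts.get(w1, 0) + V)\n",
"\n",
"print(smoothed_bigram_prob(tokens[0], tokens[1]))  # seen bigram\n",
"print(smoothed_bigram_prob(tokens[0], 'xyz'))      # unseen bigram, still > 0"
]
},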
{
"cell_type": "markdown",
"metadata": {
"id": "E6n1IU8X-G9S"
},
"source": [
"# **TRIGRAM**"
]
},
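{
"cell_type": "markdown",
"metadata": {},
"source": [
"The trigram model conditions each word on the two words before it:\n",
"\n",
"$$P(w_i \\mid w_{i-2}, w_{i-1}) = \\frac{C(w_{i-2}, w_{i-1}, w_i)}{C(w_{i-2}, w_{i-1})}$$\n",
"\n",
"and the sentence probability is $P(w_1)\\,P(w_2 \\mid w_1) \\prod_{i=3}^{n} P(w_i \\mid w_{i-2}, w_{i-1})$, which is what the code below computes."
]
},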
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "BIRARsj2FHJg",
"outputId": "968d420e-9370-40e5-e7e1-148e1d351d62"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Corpus: mahasiswa mengerjakan tugas kemudian mahasiswa upload e-learning\n",
"Tokens (7): ['mahasiswa', 'mengerjakan', 'tugas', 'kemudian', 'mahasiswa', 'upload', 'e-learning']\n",
"\n",
"Trigram frequencies in the sentence:\n",
" ('mahasiswa', 'mengerjakan', 'tugas'): 1\n",
" ('mengerjakan', 'tugas', 'kemudian'): 1\n",
" ('tugas', 'kemudian', 'mahasiswa'): 1\n",
" ('kemudian', 'mahasiswa', 'upload'): 1\n",
" ('mahasiswa', 'upload', 'e-learning'): 1\n",
"\n",
"Total trigrams in the sentence: 5\n",
"\n",
"Probability of each trigram:\n",
" P(tugas|mahasiswa,mengerjakan) = 1.00 (100.00%)\n",
" P(kemudian|mengerjakan,tugas) = 1.00 (100.00%)\n",
" P(mahasiswa|tugas,kemudian) = 1.00 (100.00%)\n",
" P(upload|kemudian,mahasiswa) = 1.00 (100.00%)\n",
" P(e-learning|mahasiswa,upload) = 1.00 (100.00%)\n",
"\n",
"Overall sentence probability (trigram model):\n",
" P(mahasiswa mengerjakan tugas kemudian mahasiswa upload e-learning) = P(mahasiswa)=0.29 x P(mengerjakan|mahasiswa)=0.50 x P(tugas|mahasiswa,mengerjakan)=1.00 x P(kemudian|mengerjakan,tugas)=1.00 x P(mahasiswa|tugas,kemudian)=1.00 x P(upload|kemudian,mahasiswa)=1.00 x P(e-learning|mahasiswa,upload)=1.00 = 0.142857 (14.29%)\n"
]
}
],
"source": [
"from collections import Counter\n",
"from IPython.display import clear_output\n",
"\n",
"# 1. Read the sentence and tokenize\n",
"kalimat = input(\"Enter a sentence: \").strip()\n",
"\n",
"# Clear the input prompt (notebook environments only)\n",
"try:\n",
" clear_output()\n",
"except Exception:\n",
" pass\n",
"\n",
"print(f\"Corpus: {kalimat}\")\n",
"\n",
"# Tokenize\n",
"tokens = kalimat.lower().split()\n",
"print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
"\n",
"# 2. Count bigram and trigram frequencies\n",
"bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]\n",
"trigrams = [(tokens[i], tokens[i+1], tokens[i+2]) for i in range(len(tokens) - 2)]\n",
"\n",
"bigram_counts = Counter(bigrams)\n",
"trigram_counts = Counter(trigrams)\n",
"\n",
"print(\"\\nTrigram frequencies in the sentence:\")\n",
"for tg, count in trigram_counts.items():\n",
" print(f\" {tg}: {count}\")\n",
"print(f\"\\nTotal trigrams in the sentence: {sum(trigram_counts.values())}\")\n",
"\n",
"# 3. Trigram probability: P(w3 | w1, w2) = Count(w1,w2,w3) / Count(w1,w2)\n",
"trigram_probabilities = {}\n",
"for (w1, w2, w3), count in trigram_counts.items():\n",
" # Avoid division by zero (in case a bigram never occurs)\n",
" if bigram_counts[(w1, w2)] > 0:\n",
" prob = count / bigram_counts[(w1, w2)]\n",
" else:\n",
" prob = 0\n",
" trigram_probabilities[(w1, w2, w3)] = prob\n",
"\n",
"print(\"\\nProbability of each trigram:\")\n",
"for (w1, w2, w3), prob in trigram_probabilities.items():\n",
" print(f\" P({w3}|{w1},{w2}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
"\n",
"# Also count unigrams (needed for P(w1) and P(w2|w1))\n",
"unigram_counts = Counter(tokens)\n",
"total_tokens = sum(unigram_counts.values())\n",
"\n",
"# 4. Overall sentence probability (trigram model)\n",
"# P(sentence) = P(w1) * P(w2|w1) * P(w3|w1,w2) * ...\n",
"\n",
"# a. P(w1)\n",
"p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens if total_tokens > 0 else 0\n",
"\n",
"# b. P(w2|w1) (bigram, no smoothing)\n",
"if len(tokens) > 1:\n",
" count_w1 = unigram_counts.get(tokens[0], 1) # avoid division by zero\n",
" p_w2_w1 = bigram_counts.get((tokens[0], tokens[1]), 0) / count_w1\n",
"else:\n",
" p_w2_w1 = 1.0 # only one word\n",
"\n",
"p_kalimat = p_w1 * p_w2_w1 # start from P(w1) * P(w2|w1)\n",
"\n",
"# Formula parts for display\n",
"prob_str_parts = [f\"P({tokens[0]})={p_w1:.2f}\"]\n",
"if len(tokens) > 1:\n",
" prob_str_parts.append(f\"P({tokens[1]}|{tokens[0]})={p_w2_w1:.2f}\")\n",
"\n",
"# c. Multiply the trigram terms P(wi | wi-2, wi-1) for i >= 3\n",
"for i in range(len(tokens) - 2):\n",
" triplet = (tokens[i], tokens[i+1], tokens[i+2])\n",
" p = trigram_probabilities.get(triplet, 0)\n",
" p_kalimat *= p\n",
" prob_str_parts.append(f\"P({triplet[2]}|{triplet[0]},{triplet[1]})={p:.2f}\")\n",
"\n",
"prob_str = \" x \".join(prob_str_parts)\n",
"\n",
"print(\"\\nOverall sentence probability (trigram model):\")\n",
"print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)\")\n"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}