Compare commits

...

No commits in common. "master" and "main" have entirely different histories.
master ... main

7 changed files with 3 additions and 1806 deletions


@@ -1,335 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "qBYcPYAb059g",
"outputId": "9f57b704-da1b-4495-d366-24c30586dc76",
"scrolled": true
},
"outputs": [
{
"name": "stdin",
"output_type": "stream",
"text": [
"Masukkan jumlah dokumen yang ingin dimasukkan: 4\n"
]
}
],
"source": [
"# Input jumlah dokumen\n",
"import pandas as pd\n",
"n = int(input(\"Masukkan jumlah dokumen yang ingin dimasukkan: \"))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mo-yt5Ob1N8j",
"outputId": "362ac3e0-d84b-4014-db96-cc3b10ecdb32"
},
"outputs": [
{
"name": "stdin",
"output_type": "stream",
"text": [
"Masukkan teks untuk dokumen ke-1: saya adalah seorang pria\n",
"Masukkan teks untuk dokumen ke-2: saya adalah pria yang memiliki hati\n",
"Masukkan teks untuk dokumen ke-3: hati saya telah terisi satu nama\n",
"Masukkan teks untuk dokumen ke-4: di dalam hati saya terukir nama pasangan saya\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Dokumen yang Dimasukkan ===\n",
"Doc 1: saya adalah seorang pria\n",
"Doc 2: saya adalah pria yang memiliki hati\n",
"Doc 3: hati saya telah terisi satu nama\n",
"Doc 4: di dalam hati saya terukir nama pasangan saya\n"
]
}
],
"source": [
"# Input teks dokumen satu per satu\n",
"documents = []\n",
"for i in range(n):\n",
" teks = input(f\"Masukkan teks untuk dokumen ke-{i+1}: \")\n",
" documents.append(teks)\n",
"\n",
"print(\"\\n=== Dokumen yang Dimasukkan ===\")\n",
"for i, doc in enumerate(documents):\n",
" print(f\"Doc {i+1}: {doc}\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "FkmxRAFq1oDK",
"outputId": "62c4508e-1725-4f30-fbdb-4de8072498b2"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Hasil Tokenisasi ===\n",
"Doc 1: ['saya', 'adalah', 'seorang', 'pria']\n",
"Doc 2: ['saya', 'adalah', 'pria', 'yang', 'memiliki', 'hati']\n",
"Doc 3: ['hati', 'saya', 'telah', 'terisi', 'satu', 'nama']\n",
"Doc 4: ['di', 'dalam', 'hati', 'saya', 'terukir', 'nama', 'pasangan', 'saya']\n"
]
}
],
"source": [
"# Tahap Tokenisasi\n",
"tokenized_docs = []\n",
"for doc in documents:\n",
" tokens = doc.lower().split()\n",
" tokenized_docs.append(tokens)\n",
"\n",
"print(\"\\n=== Hasil Tokenisasi ===\")\n",
"for i, tokens in enumerate(tokenized_docs):\n",
" print(f\"Doc {i+1}: {tokens}\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ybC1Vo2C_c3q",
"outputId": "fa31c57e-5364-4ded-fcd0-54d0db46c34b"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\n",
"['saya', 'adalah', 'seorang', 'pria', 'saya', 'adalah', 'pria', 'yang', 'memiliki', 'hati', 'hati', 'saya', 'telah', 'terisi', 'satu', 'nama', 'di', 'dalam', 'hati', 'saya', 'terukir', 'nama', 'pasangan', 'saya']\n",
"Jumlah total kata dalam seluruh dokumen: 24\n"
]
}
],
"source": [
"# Pembuatan Corpus\n",
"corpus_all = [word for doc in tokenized_docs for word in doc]\n",
"\n",
"print(\"\\n=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\")\n",
"print(corpus_all)\n",
"print(f\"Jumlah total kata dalam seluruh dokumen: {len(corpus_all)}\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "s6S-Ma4R1xuq",
"outputId": "98c3685b-1798-4038-d17e-6e45ca419b51"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Vocabulary (Kata Unik) ===\n",
"['adalah', 'dalam', 'di', 'hati', 'memiliki', 'nama', 'pasangan', 'pria', 'satu', 'saya', 'seorang', 'telah', 'terisi', 'terukir', 'yang']\n",
"Jumlah kata unik (vocabulary size): 15\n",
"\n",
"=== Vocabulary (Kata Unik) ===\n",
" 1. adalah\n",
" 2. dalam\n",
" 3. di\n",
" 4. hati\n",
" 5. memiliki\n",
" 6. nama\n",
" 7. pasangan\n",
" 8. pria\n",
" 9. satu\n",
"10. saya\n",
"11. seorang\n",
"12. telah\n",
"13. terisi\n",
"14. terukir\n",
"15. yang\n",
"\n",
"Jumlah kata unik (vocabulary size): 15\n"
]
}
],
"source": [
"# Pembuatan Vocabulary\n",
"vocabulary = sorted(set(corpus_all))\n",
"\n",
"print(\"\\n=== Vocabulary (Kata Unik) ===\")\n",
"print(vocabulary)\n",
"print(f\"Jumlah kata unik (vocabulary size): {len(vocabulary)}\")\n",
"\n",
"\n",
"vocabulary = sorted(set(corpus_all))\n",
"\n",
"print(\"\\n=== Vocabulary (Kata Unik) ===\")\n",
"for idx, word in enumerate(vocabulary, start=1):\n",
" print(f\"{idx:>2}. {word}\")\n",
"print(f\"\\nJumlah kata unik (vocabulary size): {len(vocabulary)}\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"id": "ShevCTva2Fg9"
},
"outputs": [],
"source": [
"# Representasi Numerik (Matriks BoW)\n",
"bow_matrix = []\n",
"for doc in tokenized_docs:\n",
" vector = [doc.count(word) for word in vocabulary]\n",
" bow_matrix.append(vector)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "-yB6D2pY2M0E",
"outputId": "b6b2f4d3-da8b-4aee-e9ce-034def4d5cf7"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Matriks Bag of Words ===\n",
" adalah dalam di hati memiliki nama pasangan pria satu saya \\\n",
"D1 1 0 0 0 0 0 0 1 0 1 \n",
"D2 1 0 0 1 1 0 0 1 0 1 \n",
"D3 0 0 0 1 0 1 0 0 1 1 \n",
"D4 0 1 1 1 0 1 1 0 0 2 \n",
"\n",
" seorang telah terisi terukir yang \n",
"D1 1 0 0 0 0 \n",
"D2 0 0 0 0 1 \n",
"D3 0 1 1 0 0 \n",
"D4 0 0 0 1 0 \n"
]
}
],
"source": [
"df_bow = pd.DataFrame(bow_matrix, columns=vocabulary)\n",
"df_bow.index = [f\"D{i}\" for i in range(1, len(documents)+1)] # ubah label indeks jadi D1, D2, D3\n",
"\n",
"print(\"\\n=== Matriks Bag of Words ===\")\n",
"print(df_bow)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "8ruf5vKL2rGD",
"outputId": "65a4674e-1c01-4833-ec55-f66f77b8b6c2"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===\n",
" Kata Frekuensi\n",
"0 saya 5\n",
"1 hati 3\n",
"2 nama 2\n",
"3 pria 2\n",
"4 adalah 2\n",
"5 di 1\n",
"6 dalam 1\n",
"7 pasangan 1\n",
"8 memiliki 1\n",
"9 satu 1\n",
"10 seorang 1\n",
"11 telah 1\n",
"12 terisi 1\n",
"13 terukir 1\n",
"14 yang 1\n",
"Frekuensi kata: 15\n"
]
}
],
"source": [
"# Membuat Tabel Frekuensi Kata (Total dari seluruh dokumen)\n",
"word_frequencies = df_bow.sum().sort_values(ascending=False).reset_index()\n",
"word_frequencies.columns = [\"Kata\", \"Frekuensi\"]\n",
"\n",
"print(\"\\n=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===\")\n",
"print(word_frequencies)\n",
"print(f\"Frekuensi kata: {len(word_frequencies)}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "NQjExannHuj0"
},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
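The notebook above builds the bag-of-words matrix by hand with doc.count(word). As a minimal cross-check, assuming scikit-learn is available (the notebook itself never imports it), CountVectorizer reproduces the same matrix; the custom token_pattern keeps one-letter tokens, which the default pattern drops:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# The same four documents recorded in the notebook output above
documents = [
    "saya adalah seorang pria",
    "saya adalah pria yang memiliki hati",
    "hati saya telah terisi satu nama",
    "di dalam hati saya terukir nama pasangan saya",
]

# Features come out sorted alphabetically, matching sorted(set(corpus_all))
vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
X = vectorizer.fit_transform(documents)

df_check = pd.DataFrame(X.toarray(),
                        columns=vectorizer.get_feature_names_out(),
                        index=[f"D{i}" for i in range(1, len(documents) + 1)])
print(df_check)  # should match the "Matriks Bag of Words" above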


@@ -1,176 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "f4a1399a-f23d-4060-a07e-bce5a5c7ddac",
"metadata": {},
"source": [
"# Klasifikasi Teks menggunakan ANN\n",
"## Arif R Dwiyanto\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "53a214ae-c9cf-4d46-925d-068f1685537b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=== Classification Report ===\n",
" precision recall f1-score support\n",
"\n",
" negative 0.00 0.00 0.00 1.0\n",
" positive 0.00 0.00 0.00 1.0\n",
"\n",
" accuracy 0.00 2.0\n",
" macro avg 0.00 0.00 0.00 2.0\n",
"weighted avg 0.00 0.00 0.00 2.0\n",
"\n",
"=== Confusion Matrix ===\n",
"[[0 1]\n",
" [1 0]]\n",
"\n",
"Prediksi untuk: barang buruk, saya kecewa\n",
"Hasil: negative\n"
]
}
],
"source": [
"# ---------------------------------------------------------\n",
"# Klasifikasi Teks dengan TF-IDF + Feedforward Neural Network\n",
"# ---------------------------------------------------------\n",
"\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.neural_network import MLPClassifier\n",
"from sklearn.metrics import classification_report, confusion_matrix\n",
"\n",
"# -----------------------------------------\n",
"# 1. Contoh Dataset\n",
"# -----------------------------------------\n",
"# Anda bisa mengganti dataset ini dengan dataset lain (CSV, JSON, dll)\n",
"\n",
"data = {\n",
" \"text\": [\n",
" \"Saya suka produk ini, luar biasa\",\n",
" \"Layanannya buruk, saya sangat kecewa\",\n",
" \"Pembelian terbaik yang pernah saya lakukan\",\n",
" \"Saya benci produk ini, buang-buang uang\",\n",
" \"Kualitasnya sangat bagus, direkomendasikan\",\n",
" \"Pengalaman buruk, tidak akan membeli lagi\"\n",
" ],\n",
" \"label\": [\"positive\", \"negative\", \"positive\", \"negative\", \"positive\", \"negative\"]\n",
"}\n",
"\n",
"df = pd.DataFrame(data)\n",
"\n",
"# -----------------------------------------\n",
"# 2. Split Train & Test\n",
"# -----------------------------------------\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
" df[\"text\"], df[\"label\"], test_size=0.3, random_state=42\n",
")\n",
"\n",
"# -----------------------------------------\n",
"# 3. TF-IDF Vectorization\n",
"# -----------------------------------------\n",
"tfidf = TfidfVectorizer(max_features=5000)\n",
"X_train_tfidf = tfidf.fit_transform(X_train)\n",
"X_test_tfidf = tfidf.transform(X_test)\n",
"\n",
"# -----------------------------------------\n",
"# 4. Feedforward ANN (MLPClassifier)\n",
"# -----------------------------------------\n",
"model = MLPClassifier(\n",
" hidden_layer_sizes=(256, 64),\n",
" activation='relu',\n",
" solver='adam',\n",
" max_iter=500,\n",
" random_state=42\n",
")\n",
"\n",
"model.fit(X_train_tfidf, y_train)\n",
"\n",
"# -----------------------------------------\n",
"# 5. Evaluasi Model\n",
"# -----------------------------------------\n",
"y_pred = model.predict(X_test_tfidf)\n",
"\n",
"print(\"=== Classification Report ===\")\n",
"print(classification_report(y_test, y_pred))\n",
"\n",
"print(\"=== Confusion Matrix ===\")\n",
"print(confusion_matrix(y_test, y_pred))\n",
"\n",
"# -----------------------------------------\n",
"# 6. Prediksi Teks Baru\n",
"# -----------------------------------------\n",
"#sample_text = [\"barang bagus luar biasa\"]\n",
"sample_text = [\"barang buruk, saya kecewa\"]\n",
"sample_vec = tfidf.transform(sample_text)\n",
"prediction = model.predict(sample_vec)\n",
"\n",
"print(\"\\nPrediksi untuk:\", sample_text[0])\n",
"print(\"Hasil:\", prediction[0])\n"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "9f7d90fe-4af4-446c-9547-c9312bfa6fc7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Prediksi untuk: saya benci barang ini\n",
"Hasil: negative\n"
]
}
],
"source": [
"#sample_text = [\"barang bagus luar biasa\"]\n",
"sample_text = [\"saya benci barang ini\"]\n",
"sample_vec = tfidf.transform(sample_text)\n",
"prediction = model.predict(sample_vec)\n",
"print(\"\\nPrediksi untuk:\", sample_text[0])\n",
"print(\"Hasil:\", prediction[0])\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d4b9a7c2-0f08-43fd-8da8-018d839a4917",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
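The 0.00 scores in the recorded report are an artifact of evaluating on a two-document test split drawn from only six examples, not a property of the model. A steadier read on data this small is leave-one-out cross-validation; the sketch below assumes the same toy dataset and scikit-learn stack as the notebook, and wraps TF-IDF and the MLP in a pipeline so the vectorizer is refit on each training fold:

from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import LeaveOneOut, cross_val_score

# Same six toy documents and labels as in the notebook above
texts = [
    "Saya suka produk ini, luar biasa",
    "Layanannya buruk, saya sangat kecewa",
    "Pembelian terbaik yang pernah saya lakukan",
    "Saya benci produk ini, buang-buang uang",
    "Kualitasnya sangat bagus, direkomendasikan",
    "Pengalaman buruk, tidak akan membeli lagi",
]
labels = ["positive", "negative", "positive", "negative", "positive", "negative"]

# The pipeline refits TF-IDF inside every fold, so no test words leak in
clf = make_pipeline(
    TfidfVectorizer(),
    MLPClassifier(hidden_layer_sizes=(32,), max_iter=500, random_state=42),
)

# Each document serves as the test set exactly once
scores = cross_val_score(clf, texts, labels, cv=LeaveOneOut())
print(f"Leave-one-out accuracy: {scores.mean():.2f} over {len(scores)} folds")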


@@ -1,380 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "JVPdWpz3hhbj"
},
"source": [
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "4Mvva3v65h1v"
},
"source": [
"# **UNIGRAM**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "1cub_VJnUJMl",
"outputId": "1889eb61-4f3f-4780-f42e-02368076cce3"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Corpus: saya suka makan nasi\n",
"Tokens (4): ['saya', 'suka', 'makan', 'nasi']\n",
"\n",
"Frekuensi Unigram dalam kalimat\n",
" ('saya'): 1\n",
" ('suka'): 1\n",
" ('makan'): 1\n",
" ('nasi'): 1\n",
"\n",
"Total unigram dalam 1 kalimat: 4\n",
"\n",
"Probabilitas masing-masing unigram:\n",
" P(saya) = 0.25 (25.00%)\n",
" P(suka) = 0.25 (25.00%)\n",
" P(makan) = 0.25 (25.00%)\n",
" P(nasi) = 0.25 (25.00%)\n",
"\n",
"Probabilitas Keseluruhan Kalimat (Model Unigram):\n",
" P(saya suka makan nasi) = P(saya)=0.25 x P(suka)=0.25 x P(makan)=0.25 x P(nasi)=0.25 = 0.0039 (0.39%)\n"
]
}
],
"source": [
"from collections import Counter\n",
"from IPython.display import clear_output\n",
"import math\n",
"\n",
"# 1. Input Kalimat dan Tokenisasi\n",
"kalimat = input(\"Masukkan kalimat: \").strip()\n",
"\n",
"# Bersihkan output (khusus lingkungan notebook)\n",
"try:\n",
" clear_output()\n",
"except:\n",
" pass\n",
"\n",
"print(f\"Corpus: {kalimat}\")\n",
"\n",
"# Tokenize\n",
"tokens = kalimat.lower().split()\n",
"print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
"\n",
"# 2. Hitung Frekuensi Unigram\n",
"unigram_counts = Counter(tokens)\n",
"total_tokens = sum(unigram_counts.values())\n",
"\n",
"print(\"\\nFrekuensi Unigram dalam kalimat\")\n",
"for pair, count in unigram_counts.items():\n",
" print(f\" ('{pair}'): {count}\")\n",
"print(f\"\\nTotal unigram dalam 1 kalimat: {total_tokens}\")\n",
"\n",
"# 3. Hitung Probabilitas Unigram: P(wi) = Count(wi) / Total Kata\n",
"unigram_probabilities = {}\n",
"for word, count in unigram_counts.items():\n",
" prob = count / total_tokens\n",
" unigram_probabilities[word] = prob\n",
"\n",
"print(\"\\nProbabilitas masing-masing unigram:\")\n",
"for word, prob in unigram_probabilities.items():\n",
" print(f\" P({word}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
"\n",
"# 4. Hitung Probabilitas Kalimat Keseluruhan (P(kalimat) = P(w1) * P(w2) * ...)\n",
"p_kalimat = 1\n",
"prob_parts = []\n",
"\n",
"# Loop untuk menghitung probabilitas total dan membangun string rumus detail\n",
"for word in tokens:\n",
" prob_value = unigram_probabilities[word]\n",
" p_kalimat *= prob_value\n",
" # Format: P(word)=prob_value\n",
" prob_parts.append(f\"P({word})={prob_value:.2f}\")\n",
"\n",
"# Gabungkan bagian-bagian rumus untuk mendapatkan prob_str detail\n",
"prob_str = \" x \".join(prob_parts)\n",
"\n",
"print(\"\\nProbabilitas Keseluruhan Kalimat (Model Unigram):\")\n",
"print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.4f} ({p_kalimat*100:.2f}%)\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Vstwt996-FrS"
},
"source": [
"# **BIGRAM**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "XRIY4qgTVbjl",
"outputId": "ea6e62ce-45a0-40c9-ca98-1fcc30558479"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Corpus: saya makan nasi dan saya makan roti\n",
"Tokens (7): ['saya', 'makan', 'nasi', 'dan', 'saya', 'makan', 'roti']\n",
"\n",
"Frekuensi Bigram dalam kalimat:\n",
" ('saya', 'makan'): 2\n",
" ('makan', 'nasi'): 1\n",
" ('nasi', 'dan'): 1\n",
" ('dan', 'saya'): 1\n",
" ('makan', 'roti'): 1\n",
"\n",
"Total bigram dalam 1 kalimat: 6\n",
"\n",
"Probabilitas masing-masing bigram:\n",
" P(makan|saya) = 1.00 (100.00%)\n",
" P(nasi|makan) = 0.50 (50.00%)\n",
" P(dan|nasi) = 1.00 (100.00%)\n",
" P(saya|dan) = 1.00 (100.00%)\n",
" P(roti|makan) = 0.50 (50.00%)\n",
"\n",
"Probabilitas Keseluruhan Kalimat (Model Bigram):\n",
" P(saya makan nasi dan saya makan roti) = P(saya)=0.29 x P(makan|saya)=1.00 x P(nasi|makan)=0.50 x P(dan|nasi)=1.00 x P(saya|dan)=1.00 x P(makan|saya)=1.00 x P(roti|makan)=0.50 = 0.071429 (7.14%)\n"
]
}
],
"source": [
"from collections import Counter\n",
"from IPython.display import clear_output\n",
"import math\n",
"\n",
"# 1. Input Kalimat dan Tokenisasi\n",
"kalimat = input(\"Masukkan kalimat: \").strip()\n",
"\n",
"# Bersihkan output (khusus lingkungan notebook)\n",
"try:\n",
" clear_output()\n",
"except:\n",
" pass\n",
"\n",
"print(f\"Corpus: {kalimat}\")\n",
"\n",
"# Tokenisasi\n",
"tokens = kalimat.lower().split()\n",
"print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
"\n",
"# 2. Hitung Frekuensi Unigram dan Bigram\n",
"unigram_counts = Counter(tokens)\n",
"bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]\n",
"bigram_counts = Counter(bigrams)\n",
"\n",
"print(\"\\nFrekuensi Bigram dalam kalimat:\")\n",
"for pair, count in bigram_counts.items():\n",
" print(f\" {pair}: {count}\")\n",
"print(f\"\\nTotal bigram dalam 1 kalimat: {sum(bigram_counts.values())}\")\n",
"\n",
"# 3. Hitung Probabilitas Bigram: P(w2 | w1) = Count(w1,w2) / Count(w1)\n",
"bigram_probabilities = {}\n",
"for (w1, w2), count in bigram_counts.items():\n",
" prob = count / unigram_counts[w1]\n",
" bigram_probabilities[(w1, w2)] = prob\n",
"\n",
"print(\"\\nProbabilitas masing-masing bigram:\")\n",
"for (w1, w2), prob in bigram_probabilities.items():\n",
" print(f\" P({w2}|{w1}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
"\n",
"# 4. Hitung Probabilitas Kalimat Keseluruhan (Model Bigram)\n",
"# P(kalimat) = P(w1) * P(w2|w1) * P(w3|w2) * ...\n",
"total_tokens = sum(unigram_counts.values())\n",
"p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens # P(w1)\n",
"p_kalimat = p_w1 # Inisialisasi dengan P(w1)\n",
"\n",
"prob_str_parts = [f\"P({tokens[0]})={p_w1:.2f}\"] # Tambahkan P(w1) ke rumus\n",
"\n",
"for i in range(1, len(tokens)):\n",
" pair = (tokens[i-1], tokens[i])\n",
" p = bigram_probabilities.get(pair, 0)\n",
" p_kalimat *= p\n",
" prob_str_parts.append(f\"P({pair[1]}|{pair[0]})={p:.2f}\")\n",
"\n",
"# Gabungkan rumus perkalian untuk ditampilkan\n",
"prob_str = \" x \".join(prob_str_parts)\n",
"\n",
"print(\"\\nProbabilitas Keseluruhan Kalimat (Model Bigram):\")\n",
"print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "E6n1IU8X-G9S"
},
"source": [
"# **TRIGRAM**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "BIRARsj2FHJg",
"outputId": "968d420e-9370-40e5-e7e1-148e1d351d62"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Corpus: mahasiswa mengerjakan tugas kemudian mahasiswa upload e-learning\n",
"Tokens (7): ['mahasiswa', 'mengerjakan', 'tugas', 'kemudian', 'mahasiswa', 'upload', 'e-learning']\n",
"\n",
"Frekuensi Trigram dalam kalimat:\n",
" ('mahasiswa', 'mengerjakan', 'tugas'): 1\n",
" ('mengerjakan', 'tugas', 'kemudian'): 1\n",
" ('tugas', 'kemudian', 'mahasiswa'): 1\n",
" ('kemudian', 'mahasiswa', 'upload'): 1\n",
" ('mahasiswa', 'upload', 'e-learning'): 1\n",
"\n",
"Total trigram dalam 1 kalimat: 5\n",
"\n",
"Probabilitas masing-masing trigram:\n",
" P(tugas|mahasiswa,mengerjakan) = 1.00 (100.00%)\n",
" P(kemudian|mengerjakan,tugas) = 1.00 (100.00%)\n",
" P(mahasiswa|tugas,kemudian) = 1.00 (100.00%)\n",
" P(upload|kemudian,mahasiswa) = 1.00 (100.00%)\n",
" P(e-learning|mahasiswa,upload) = 1.00 (100.00%)\n",
"\n",
"Probabilitas Keseluruhan Kalimat (Model Trigram):\n",
" P(mahasiswa mengerjakan tugas kemudian mahasiswa upload e-learning) = P(mahasiswa)=0.29 x P(mengerjakan|mahasiswa)=0.50 x P(tugas|mahasiswa,mengerjakan)=1.00 x P(kemudian|mengerjakan,tugas)=1.00 x P(mahasiswa|tugas,kemudian)=1.00 x P(upload|kemudian,mahasiswa)=1.00 x P(e-learning|mahasiswa,upload)=1.00 = 0.142857 (14.29%)\n"
]
}
],
"source": [
"from collections import Counter\n",
"from IPython.display import clear_output\n",
"import math\n",
"\n",
"# 1. Input Kalimat dan Tokenisasi\n",
"kalimat = input(\"Masukkan kalimat: \").strip()\n",
"\n",
"# Bersihkan output (khusus lingkungan notebook)\n",
"try:\n",
" clear_output()\n",
"except:\n",
" pass\n",
"\n",
"print(f\"Corpus: {kalimat}\")\n",
"\n",
"# Tokenisasi\n",
"tokens = kalimat.lower().split()\n",
"print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
"\n",
"# 2. Hitung Frekuensi Bigram dan Trigram\n",
"bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]\n",
"trigrams = [(tokens[i], tokens[i+1], tokens[i+2]) for i in range(len(tokens) - 2)]\n",
"\n",
"bigram_counts = Counter(bigrams)\n",
"trigram_counts = Counter(trigrams)\n",
"\n",
"print(\"\\nFrekuensi Trigram dalam kalimat:\")\n",
"for tg, count in trigram_counts.items():\n",
" print(f\" {tg}: {count}\")\n",
"print(f\"\\nTotal trigram dalam 1 kalimat: {sum(trigram_counts.values())}\")\n",
"\n",
"# 3. Hitung Probabilitas Trigram: P(w3 | w1, w2) = Count(w1,w2,w3) / Count(w1,w2)\n",
"trigram_probabilities = {}\n",
"for (w1, w2, w3), count in trigram_counts.items():\n",
" # Hindari pembagian dengan nol (jika ada bigram yang tidak muncul)\n",
" if bigram_counts[(w1, w2)] > 0:\n",
" prob = count / bigram_counts[(w1, w2)]\n",
" else:\n",
" prob = 0\n",
" trigram_probabilities[(w1, w2, w3)] = prob\n",
"\n",
"print(\"\\nProbabilitas masing-masing trigram:\")\n",
"for (w1, w2, w3), prob in trigram_probabilities.items():\n",
" print(f\" P({w3}|{w1},{w2}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
"\n",
"# Tambahkan perhitungan Unigram Count (dibutuhkan untuk P(w1) dan P(w2|w1))\n",
"unigram_counts = Counter(tokens)\n",
"total_tokens = sum(unigram_counts.values())\n",
"\n",
"# 4. Hitung Probabilitas Kalimat Keseluruhan (Model Trigram)\n",
"# P(kalimat) = P(w1) * P(w2|w1) * P(w3|w1,w2) * ...\n",
"\n",
"# a. P(w1)\n",
"p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens if total_tokens > 0 else 0\n",
"\n",
"# b. P(w2|w1) (Menggunakan Bigram tanpa smoothing)\n",
"if len(tokens) > 1:\n",
" count_w1 = unigram_counts.get(tokens[0], 1) # Hindari pembagian dengan 0\n",
" p_w2_w1 = bigram_counts.get((tokens[0], tokens[1]), 0) / count_w1\n",
"else:\n",
" p_w2_w1 = 1.0 # Jika hanya 1 kata\n",
"\n",
"p_kalimat = p_w1 * p_w2_w1 # Inisialisasi dengan P(w1) * P(w2|w1)\n",
"\n",
"# Daftar bagian rumus untuk ditampilkan\n",
"prob_str_parts = [f\"P({tokens[0]})={p_w1:.2f}\"]\n",
"if len(tokens) > 1:\n",
" prob_str_parts.append(f\"P({tokens[1]}|{tokens[0]})={p_w2_w1:.2f}\")\n",
"\n",
"# c. Perkalian Trigram P(wi | wi-2, wi-1) untuk i >= 3\n",
"for i in range(len(tokens) - 2):\n",
" triplet = (tokens[i], tokens[i+1], tokens[i+2])\n",
" p = trigram_probabilities.get(triplet, 0)\n",
" p_kalimat *= p\n",
" prob_str_parts.append(f\"P({triplet[2]}|{triplet[0]},{triplet[1]})={p:.2f}\")\n",
"\n",
"prob_str = \" x \".join(prob_str_parts)\n",
"\n",
"print(\"\\nProbabilitas Keseluruhan Kalimat (Model Trigram):\")\n",
"print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)\")\n"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
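All three cells above multiply raw probabilities, which underflows toward zero as sentences grow. The usual remedy is to sum log-probabilities instead; a sketch under the same counting scheme as the bigram cell (plain Python, no smoothing, so an unseen bigram still scores zero):

import math
from collections import Counter

def bigram_log_prob(tokens, unigram_counts, bigram_counts):
    # log P(w1) + sum of log P(wi | wi-1), mirroring the bigram cell above
    total = sum(unigram_counts.values())
    logp = math.log(unigram_counts[tokens[0]] / total)
    for w1, w2 in zip(tokens, tokens[1:]):
        count = bigram_counts.get((w1, w2), 0)
        if count == 0:
            return float("-inf")  # unseen bigram; smoothing would be needed
        logp += math.log(count / unigram_counts[w1])
    return logp

# Reproduces the recorded bigram example: P = 0.071429
tokens = "saya makan nasi dan saya makan roti".split()
unigram_counts = Counter(tokens)
bigram_counts = Counter(zip(tokens, tokens[1:]))
logp = bigram_log_prob(tokens, unigram_counts, bigram_counts)
print(f"log P = {logp:.4f}; P = {math.exp(logp):.6f}")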


@@ -1,351 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "qBYcPYAb059g",
"outputId": "9f57b704-da1b-4495-d366-24c30586dc76",
"scrolled": true
},
"outputs": [
{
"name": "stdin",
"output_type": "stream",
"text": [
"Masukkan jumlah dokumen yang ingin dimasukkan: 4\n"
]
}
],
"source": [
"# Input jumlah dokumen\n",
"import pandas as pd\n",
"n = int(input(\"Masukkan jumlah dokumen yang ingin dimasukkan: \"))"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mo-yt5Ob1N8j",
"outputId": "362ac3e0-d84b-4014-db96-cc3b10ecdb32"
},
"outputs": [
{
"name": "stdin",
"output_type": "stream",
"text": [
"Masukkan teks untuk dokumen ke-1: saya adalah seorang pria\n",
"Masukkan teks untuk dokumen ke-2: saya adalah pria yang memiliki hati\n",
"Masukkan teks untuk dokumen ke-3: hati saya telah terisi satu nama\n",
"Masukkan teks untuk dokumen ke-4: di dalam hati saya terukir nama pasangan saya\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Dokumen yang Dimasukkan ===\n",
"Doc 1: saya adalah seorang pria\n",
"Doc 2: saya adalah pria yang memiliki hati\n",
"Doc 3: hati saya telah terisi satu nama\n",
"Doc 4: di dalam hati saya terukir nama pasangan saya\n"
]
}
],
"source": [
"# Input teks dokumen satu per satu\n",
"documents = []\n",
"for i in range(n):\n",
" teks = input(f\"Masukkan teks untuk dokumen ke-{i+1}: \")\n",
" documents.append(teks)\n",
"\n",
"print(\"\\n=== Dokumen yang Dimasukkan ===\")\n",
"for i, doc in enumerate(documents):\n",
" print(f\"Doc {i+1}: {doc}\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "FkmxRAFq1oDK",
"outputId": "62c4508e-1725-4f30-fbdb-4de8072498b2"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Hasil Tokenisasi ===\n",
"Doc 1: ['saya', 'adalah', 'seorang', 'pria']\n",
"Doc 2: ['saya', 'adalah', 'pria', 'yang', 'memiliki', 'hati']\n",
"Doc 3: ['hati', 'saya', 'telah', 'terisi', 'satu', 'nama']\n",
"Doc 4: ['di', 'dalam', 'hati', 'saya', 'terukir', 'nama', 'pasangan', 'saya']\n"
]
}
],
"source": [
"# Tahap Tokenisasi\n",
"tokenized_docs = []\n",
"for doc in documents:\n",
" tokens = doc.lower().split()\n",
" tokenized_docs.append(tokens)\n",
"\n",
"print(\"\\n=== Hasil Tokenisasi ===\")\n",
"for i, tokens in enumerate(tokenized_docs):\n",
" print(f\"Doc {i+1}: {tokens}\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ybC1Vo2C_c3q",
"outputId": "fa31c57e-5364-4ded-fcd0-54d0db46c34b"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\n",
"['saya', 'adalah', 'seorang', 'pria', 'saya', 'adalah', 'pria', 'yang', 'memiliki', 'hati', 'hati', 'saya', 'telah', 'terisi', 'satu', 'nama', 'di', 'dalam', 'hati', 'saya', 'terukir', 'nama', 'pasangan', 'saya']\n",
"Jumlah total kata dalam seluruh dokumen: 24\n"
]
}
],
"source": [
"# Pembuatan Corpus\n",
"corpus_all = [word for doc in tokenized_docs for word in doc]\n",
"\n",
"print(\"\\n=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\")\n",
"print(corpus_all)\n",
"print(f\"Jumlah total kata dalam seluruh dokumen: {len(corpus_all)}\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "s6S-Ma4R1xuq",
"outputId": "98c3685b-1798-4038-d17e-6e45ca419b51"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Vocabulary (Kata Unik) ===\n",
"['adalah', 'dalam', 'di', 'hati', 'memiliki', 'nama', 'pasangan', 'pria', 'satu', 'saya', 'seorang', 'telah', 'terisi', 'terukir', 'yang']\n",
"Jumlah kata unik (vocabulary size): 15\n",
"\n",
"=== Vocabulary (Kata Unik) ===\n",
" 1. adalah\n",
" 2. dalam\n",
" 3. di\n",
" 4. hati\n",
" 5. memiliki\n",
" 6. nama\n",
" 7. pasangan\n",
" 8. pria\n",
" 9. satu\n",
"10. saya\n",
"11. seorang\n",
"12. telah\n",
"13. terisi\n",
"14. terukir\n",
"15. yang\n",
"\n",
"Jumlah kata unik (vocabulary size): 15\n"
]
}
],
"source": [
"# Pembuatan Vocabulary\n",
"vocabulary = sorted(set(corpus_all))\n",
"\n",
"print(\"\\n=== Vocabulary (Kata Unik) ===\")\n",
"print(vocabulary)\n",
"print(f\"Jumlah kata unik (vocabulary size): {len(vocabulary)}\")\n",
"\n",
"\n",
"vocabulary = sorted(set(corpus_all))\n",
"\n",
"print(\"\\n=== Vocabulary (Kata Unik) ===\")\n",
"for idx, word in enumerate(vocabulary, start=1):\n",
" print(f\"{idx:>2}. {word}\")\n",
"print(f\"\\nJumlah kata unik (vocabulary size): {len(vocabulary)}\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"id": "ShevCTva2Fg9"
},
"outputs": [],
"source": [
"# Representasi Numerik (Matriks BoW)\n",
"bow_matrix = []\n",
"for doc in tokenized_docs:\n",
" vector = [doc.count(word) for word in vocabulary]\n",
" bow_matrix.append(vector)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "-yB6D2pY2M0E",
"outputId": "b6b2f4d3-da8b-4aee-e9ce-034def4d5cf7"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Matriks Bag of Words ===\n",
" adalah dalam di hati memiliki nama pasangan pria satu saya \\\n",
"D1 1 0 0 0 0 0 0 1 0 1 \n",
"D2 1 0 0 1 1 0 0 1 0 1 \n",
"D3 0 0 0 1 0 1 0 0 1 1 \n",
"D4 0 1 1 1 0 1 1 0 0 2 \n",
"\n",
" seorang telah terisi terukir yang \n",
"D1 1 0 0 0 0 \n",
"D2 0 0 0 0 1 \n",
"D3 0 1 1 0 0 \n",
"D4 0 0 0 1 0 \n"
]
}
],
"source": [
"df_bow = pd.DataFrame(bow_matrix, columns=vocabulary)\n",
"df_bow.index = [f\"D{i}\" for i in range(1, len(documents)+1)] # ubah label indeks jadi D1, D2, D3\n",
"\n",
"print(\"\\n=== Matriks Bag of Words ===\")\n",
"print(df_bow)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "8ruf5vKL2rGD",
"outputId": "65a4674e-1c01-4833-ec55-f66f77b8b6c2"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===\n",
" Kata Frekuensi\n",
"0 saya 5\n",
"1 hati 3\n",
"2 nama 2\n",
"3 pria 2\n",
"4 adalah 2\n",
"5 di 1\n",
"6 dalam 1\n",
"7 pasangan 1\n",
"8 memiliki 1\n",
"9 satu 1\n",
"10 seorang 1\n",
"11 telah 1\n",
"12 terisi 1\n",
"13 terukir 1\n",
"14 yang 1\n",
"Frekuensi kata: 15\n"
]
}
],
"source": [
"# Membuat Tabel Frekuensi Kata (Total dari seluruh dokumen)\n",
"word_frequencies = df_bow.sum().sort_values(ascending=False).reset_index()\n",
"word_frequencies.columns = [\"Kata\", \"Frekuensi\"]\n",
"\n",
"print(\"\\n=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===\")\n",
"print(word_frequencies)\n",
"print(f\"Frekuensi kata: {len(word_frequencies)}\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"ename": "SyntaxError",
"evalue": "invalid syntax (3022847739.py, line 1)",
"output_type": "error",
"traceback": [
" \u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[31m \u001b[39m\u001b[31mgit remote add origin https://git.lab.ubharajaya.ac.id/202210715229-ALPRIAN-BAHARAJA-SITORUS/Praktikum_NLP\u001b[39m\n ^\n\u001b[31mSyntaxError\u001b[39m\u001b[31m:\u001b[39m invalid syntax\n"
]
}
],
"source": [
"git remote add origin https://git.lab.ubharajaya.ac.id/202210715229-ALPRIAN-BAHARAJA-SITORUS/Praktikum_NLP"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 4
}


@@ -1,169 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "f4a1399a-f23d-4060-a07e-bce5a5c7ddac",
"metadata": {},
"source": [
"# Klasifikasi Teks menggunakan ANN\n",
"## Arif R Dwiyanto\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "53a214ae-c9cf-4d46-925d-068f1685537b",
"metadata": {
"scrolled": true
},
"outputs": [
{
"ename": "ValueError",
"evalue": "All arrays must be of the same length",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mValueError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 28\u001b[39m\n\u001b[32m 11\u001b[39m \u001b[38;5;66;03m# -----------------------------------------\u001b[39;00m\n\u001b[32m 12\u001b[39m \u001b[38;5;66;03m# 1. Contoh Dataset\u001b[39;00m\n\u001b[32m 13\u001b[39m \u001b[38;5;66;03m# -----------------------------------------\u001b[39;00m\n\u001b[32m 14\u001b[39m \u001b[38;5;66;03m# Anda bisa mengganti dataset ini dengan dataset lain (CSV, JSON, dll)\u001b[39;00m\n\u001b[32m 16\u001b[39m data = {\n\u001b[32m 17\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mtext\u001b[39m\u001b[33m\"\u001b[39m: [\n\u001b[32m 18\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mTempat ini sangat nyaman dan bersih.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m (...)\u001b[39m\u001b[32m 25\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mlabel\u001b[39m\u001b[33m\"\u001b[39m: [\u001b[33m\"\u001b[39m\u001b[33mpositive\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mnegative\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mpositive\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mnegative\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mpositive\u001b[39m\u001b[33m\"\u001b[39m, \u001b[33m\"\u001b[39m\u001b[33mnegative\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m 26\u001b[39m }\n\u001b[32m---> \u001b[39m\u001b[32m28\u001b[39m df = \u001b[43mpd\u001b[49m\u001b[43m.\u001b[49m\u001b[43mDataFrame\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 30\u001b[39m \u001b[38;5;66;03m# -----------------------------------------\u001b[39;00m\n\u001b[32m 31\u001b[39m \u001b[38;5;66;03m# 2. Split Train & Test\u001b[39;00m\n\u001b[32m 32\u001b[39m \u001b[38;5;66;03m# -----------------------------------------\u001b[39;00m\n\u001b[32m 33\u001b[39m X_train, X_test, y_train, y_test = train_test_split(\n\u001b[32m 34\u001b[39m df[\u001b[33m\"\u001b[39m\u001b[33mtext\u001b[39m\u001b[33m\"\u001b[39m], df[\u001b[33m\"\u001b[39m\u001b[33mlabel\u001b[39m\u001b[33m\"\u001b[39m], test_size=\u001b[32m0.3\u001b[39m, random_state=\u001b[32m42\u001b[39m\n\u001b[32m 35\u001b[39m )\n",
"\u001b[36mFile \u001b[39m\u001b[32m~\\miniforge3\\Lib\\site-packages\\pandas\\core\\frame.py:782\u001b[39m, in \u001b[36mDataFrame.__init__\u001b[39m\u001b[34m(self, data, index, columns, dtype, copy)\u001b[39m\n\u001b[32m 776\u001b[39m mgr = \u001b[38;5;28mself\u001b[39m._init_mgr(\n\u001b[32m 777\u001b[39m data, axes={\u001b[33m\"\u001b[39m\u001b[33mindex\u001b[39m\u001b[33m\"\u001b[39m: index, \u001b[33m\"\u001b[39m\u001b[33mcolumns\u001b[39m\u001b[33m\"\u001b[39m: columns}, dtype=dtype, copy=copy\n\u001b[32m 778\u001b[39m )\n\u001b[32m 780\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data, \u001b[38;5;28mdict\u001b[39m):\n\u001b[32m 781\u001b[39m \u001b[38;5;66;03m# GH#38939 de facto copy defaults to False only in non-dict cases\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m782\u001b[39m mgr = \u001b[43mdict_to_mgr\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcopy\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcopy\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtyp\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmanager\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 783\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data, ma.MaskedArray):\n\u001b[32m 784\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mnumpy\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mma\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m mrecords\n",
"\u001b[36mFile \u001b[39m\u001b[32m~\\miniforge3\\Lib\\site-packages\\pandas\\core\\internals\\construction.py:503\u001b[39m, in \u001b[36mdict_to_mgr\u001b[39m\u001b[34m(data, index, columns, dtype, typ, copy)\u001b[39m\n\u001b[32m 499\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 500\u001b[39m \u001b[38;5;66;03m# dtype check to exclude e.g. range objects, scalars\u001b[39;00m\n\u001b[32m 501\u001b[39m arrays = [x.copy() \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(x, \u001b[33m\"\u001b[39m\u001b[33mdtype\u001b[39m\u001b[33m\"\u001b[39m) \u001b[38;5;28;01melse\u001b[39;00m x \u001b[38;5;28;01mfor\u001b[39;00m x \u001b[38;5;129;01min\u001b[39;00m arrays]\n\u001b[32m--> \u001b[39m\u001b[32m503\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43marrays_to_mgr\u001b[49m\u001b[43m(\u001b[49m\u001b[43marrays\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindex\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtyp\u001b[49m\u001b[43m=\u001b[49m\u001b[43mtyp\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconsolidate\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcopy\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~\\miniforge3\\Lib\\site-packages\\pandas\\core\\internals\\construction.py:114\u001b[39m, in \u001b[36marrays_to_mgr\u001b[39m\u001b[34m(arrays, columns, index, dtype, verify_integrity, typ, consolidate)\u001b[39m\n\u001b[32m 111\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m verify_integrity:\n\u001b[32m 112\u001b[39m \u001b[38;5;66;03m# figure out the index, if necessary\u001b[39;00m\n\u001b[32m 113\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m index \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m114\u001b[39m index = \u001b[43m_extract_index\u001b[49m\u001b[43m(\u001b[49m\u001b[43marrays\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 115\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 116\u001b[39m index = ensure_index(index)\n",
"\u001b[36mFile \u001b[39m\u001b[32m~\\miniforge3\\Lib\\site-packages\\pandas\\core\\internals\\construction.py:677\u001b[39m, in \u001b[36m_extract_index\u001b[39m\u001b[34m(data)\u001b[39m\n\u001b[32m 675\u001b[39m lengths = \u001b[38;5;28mlist\u001b[39m(\u001b[38;5;28mset\u001b[39m(raw_lengths))\n\u001b[32m 676\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(lengths) > \u001b[32m1\u001b[39m:\n\u001b[32m--> \u001b[39m\u001b[32m677\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[33m\"\u001b[39m\u001b[33mAll arrays must be of the same length\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 679\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m have_dicts:\n\u001b[32m 680\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[32m 681\u001b[39m \u001b[33m\"\u001b[39m\u001b[33mMixing dicts with non-Series may lead to ambiguous ordering.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 682\u001b[39m )\n",
"\u001b[31mValueError\u001b[39m: All arrays must be of the same length"
]
}
],
"source": [
"# ---------------------------------------------------------\n",
"# Klasifikasi Teks dengan TF-IDF + Feedforward Neural Network\n",
"# ---------------------------------------------------------\n",
"\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.neural_network import MLPClassifier\n",
"from sklearn.metrics import classification_report, confusion_matrix\n",
"\n",
"# -----------------------------------------\n",
"# 1. Contoh Dataset\n",
"# -----------------------------------------\n",
"# Anda bisa mengganti dataset ini dengan dataset lain (CSV, JSON, dll)\n",
"\n",
"data = {\n",
" \"text\": [\n",
" \"Tempat ini sangat nyaman dan bersih.\"\n",
" \"Akses menuju ke sana cukup sulit dan membingungkan.\"\n",
" \"Pelayanan staf di sini juga sangat ramah dan cepat tanggap.\"\n",
" \"Lokasi kafe ini strategis dan mudah ditemukan.\"\n",
" \"Suasananya kadang terlalu bising karena sering ada keramaian.\"\n",
" \"Pilihan menu minumannya sangat beragam dan lezat.\"\n",
" ],\n",
" \"label\": [\"positive\", \"negative\", \"positive\", \"negative\", \"positive\", \"negative\"]\n",
"}\n",
"\n",
"df = pd.DataFrame(data)\n",
"\n",
"# -----------------------------------------\n",
"# 2. Split Train & Test\n",
"# -----------------------------------------\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
" df[\"text\"], df[\"label\"], test_size=0.3, random_state=42\n",
")\n",
"\n",
"# -----------------------------------------\n",
"# 3. TF-IDF Vectorization\n",
"# -----------------------------------------\n",
"tfidf = TfidfVectorizer(max_features=5000)\n",
"X_train_tfidf = tfidf.fit_transform(X_train)\n",
"X_test_tfidf = tfidf.transform(X_test)\n",
"\n",
"# -----------------------------------------\n",
"# 4. Feedforward ANN (MLPClassifier)\n",
"# -----------------------------------------\n",
"model = MLPClassifier(\n",
" hidden_layer_sizes=(256, 64),\n",
" activation='relu',\n",
" solver='adam',\n",
" max_iter=500,\n",
" random_state=42\n",
")\n",
"\n",
"model.fit(X_train_tfidf, y_train)\n",
"\n",
"# -----------------------------------------\n",
"# 5. Evaluasi Model\n",
"# -----------------------------------------\n",
"y_pred = model.predict(X_test_tfidf)\n",
"\n",
"print(\"=== Classification Report ===\")\n",
"print(classification_report(y_test, y_pred))\n",
"\n",
"print(\"=== Confusion Matrix ===\")\n",
"print(confusion_matrix(y_test, y_pred))\n",
"\n",
"# -----------------------------------------\n",
"# 6. Prediksi Teks Baru\n",
"# -----------------------------------------\n",
"#sample_text = [\"barang bagus luar biasa\"]\n",
"sample_text = [\"Tempat nyaman, saya suka\"]\n",
"sample_vec = tfidf.transform(sample_text)\n",
"prediction = model.predict(sample_vec)\n",
"\n",
"print(\"\\nPrediksi untuk:\", sample_text[0])\n",
"print(\"Hasil:\", prediction[0])\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9f7d90fe-4af4-446c-9547-c9312bfa6fc7",
"metadata": {},
"outputs": [],
"source": [
"#sample_text = [\"barang bagus luar biasa\"]\n",
"sample_text = [\"Tempat bising saya tidak suka\"]\n",
"sample_vec = tfidf.transform(sample_text)\n",
"prediction = model.predict(sample_vec)\n",
"print(\"\\nPrediksi untuk:\", sample_text[0])\n",
"print(\"Hasil:\", prediction[0])\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0413b4bf-beb1-483b-a081-b540fce1b21c",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "d714bd96-09a0-4439-8286-0cb39e2fb4df",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}


@@ -1,395 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "JVPdWpz3hhbj"
},
"source": [
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "4Mvva3v65h1v"
},
"source": [
"# **UNIGRAM**"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "1cub_VJnUJMl",
"outputId": "1889eb61-4f3f-4780-f42e-02368076cce3"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Corpus: Saya sangat mencintai pacar saya\n",
"Tokens (5): ['saya', 'sangat', 'mencintai', 'pacar', 'saya']\n",
"\n",
"Frekuensi Unigram dalam kalimat\n",
" ('saya'): 2\n",
" ('sangat'): 1\n",
" ('mencintai'): 1\n",
" ('pacar'): 1\n",
"\n",
"Total unigram dalam 1 kalimat: 5\n",
"\n",
"Probabilitas masing-masing unigram:\n",
" P(saya) = 0.40 (40.00%)\n",
" P(sangat) = 0.20 (20.00%)\n",
" P(mencintai) = 0.20 (20.00%)\n",
" P(pacar) = 0.20 (20.00%)\n",
"\n",
"Probabilitas Keseluruhan Kalimat (Model Unigram):\n",
" P(saya sangat mencintai pacar saya) = P(saya)=0.40 x P(sangat)=0.20 x P(mencintai)=0.20 x P(pacar)=0.20 x P(saya)=0.40 = 0.0013 (0.13%)\n"
]
}
],
"source": [
"from collections import Counter\n",
"from IPython.display import clear_output\n",
"import math\n",
"\n",
"# 1. Input Kalimat dan Tokenisasi\n",
"kalimat = input(\"Masukkan kalimat: \").strip()\n",
"\n",
"# Bersihkan output (khusus lingkungan notebook)\n",
"try:\n",
" clear_output()\n",
"except:\n",
" pass\n",
"\n",
"print(f\"Corpus: {kalimat}\")\n",
"\n",
"# Tokenize\n",
"tokens = kalimat.lower().split()\n",
"print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
"\n",
"# 2. Hitung Frekuensi Unigram\n",
"unigram_counts = Counter(tokens)\n",
"total_tokens = sum(unigram_counts.values())\n",
"\n",
"print(\"\\nFrekuensi Unigram dalam kalimat\")\n",
"for pair, count in unigram_counts.items():\n",
" print(f\" ('{pair}'): {count}\")\n",
"print(f\"\\nTotal unigram dalam 1 kalimat: {total_tokens}\")\n",
"\n",
"# 3. Hitung Probabilitas Unigram: P(wi) = Count(wi) / Total Kata\n",
"unigram_probabilities = {}\n",
"for word, count in unigram_counts.items():\n",
" prob = count / total_tokens\n",
" unigram_probabilities[word] = prob\n",
"\n",
"print(\"\\nProbabilitas masing-masing unigram:\")\n",
"for word, prob in unigram_probabilities.items():\n",
" print(f\" P({word}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
"\n",
"# 4. Hitung Probabilitas Kalimat Keseluruhan (P(kalimat) = P(w1) * P(w2) * ...)\n",
"p_kalimat = 1\n",
"prob_parts = []\n",
"\n",
"# Loop untuk menghitung probabilitas total dan membangun string rumus detail\n",
"for word in tokens:\n",
" prob_value = unigram_probabilities[word]\n",
" p_kalimat *= prob_value\n",
" # Format: P(word)=prob_value\n",
" prob_parts.append(f\"P({word})={prob_value:.2f}\")\n",
"\n",
"# Gabungkan bagian-bagian rumus untuk mendapatkan prob_str detail\n",
"prob_str = \" x \".join(prob_parts)\n",
"\n",
"print(\"\\nProbabilitas Keseluruhan Kalimat (Model Unigram):\")\n",
"print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.4f} ({p_kalimat*100:.2f}%)\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Vstwt996-FrS"
},
"source": [
"# **BIGRAM**"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "XRIY4qgTVbjl",
"outputId": "ea6e62ce-45a0-40c9-ca98-1fcc30558479"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Corpus: Saya adalah pemain liga sepak bola terbaik di dunia\n",
"Tokens (9): ['saya', 'adalah', 'pemain', 'liga', 'sepak', 'bola', 'terbaik', 'di', 'dunia']\n",
"\n",
"Frekuensi Bigram dalam kalimat:\n",
" ('saya', 'adalah'): 1\n",
" ('adalah', 'pemain'): 1\n",
" ('pemain', 'liga'): 1\n",
" ('liga', 'sepak'): 1\n",
" ('sepak', 'bola'): 1\n",
" ('bola', 'terbaik'): 1\n",
" ('terbaik', 'di'): 1\n",
" ('di', 'dunia'): 1\n",
"\n",
"Total bigram dalam 1 kalimat: 8\n",
"\n",
"Probabilitas masing-masing bigram:\n",
" P(adalah|saya) = 1.00 (100.00%)\n",
" P(pemain|adalah) = 1.00 (100.00%)\n",
" P(liga|pemain) = 1.00 (100.00%)\n",
" P(sepak|liga) = 1.00 (100.00%)\n",
" P(bola|sepak) = 1.00 (100.00%)\n",
" P(terbaik|bola) = 1.00 (100.00%)\n",
" P(di|terbaik) = 1.00 (100.00%)\n",
" P(dunia|di) = 1.00 (100.00%)\n",
"\n",
"Probabilitas Keseluruhan Kalimat (Model Bigram):\n",
" P(saya adalah pemain liga sepak bola terbaik di dunia) = P(saya)=0.11 x P(adalah|saya)=1.00 x P(pemain|adalah)=1.00 x P(liga|pemain)=1.00 x P(sepak|liga)=1.00 x P(bola|sepak)=1.00 x P(terbaik|bola)=1.00 x P(di|terbaik)=1.00 x P(dunia|di)=1.00 = 0.111111 (11.11%)\n"
]
}
],
"source": [
"from collections import Counter\n",
"from IPython.display import clear_output\n",
"import math\n",
"\n",
"# 1. Input Kalimat dan Tokenisasi\n",
"kalimat = input(\"Masukkan kalimat: \").strip()\n",
"\n",
"# Bersihkan output (khusus lingkungan notebook)\n",
"try:\n",
" clear_output()\n",
"except:\n",
" pass\n",
"\n",
"print(f\"Corpus: {kalimat}\")\n",
"\n",
"# Tokenisasi\n",
"tokens = kalimat.lower().split()\n",
"print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
"\n",
"# 2. Hitung Frekuensi Unigram dan Bigram\n",
"unigram_counts = Counter(tokens)\n",
"bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]\n",
"bigram_counts = Counter(bigrams)\n",
"\n",
"print(\"\\nFrekuensi Bigram dalam kalimat:\")\n",
"for pair, count in bigram_counts.items():\n",
" print(f\" {pair}: {count}\")\n",
"print(f\"\\nTotal bigram dalam 1 kalimat: {sum(bigram_counts.values())}\")\n",
"\n",
"# 3. Hitung Probabilitas Bigram: P(w2 | w1) = Count(w1,w2) / Count(w1)\n",
"bigram_probabilities = {}\n",
"for (w1, w2), count in bigram_counts.items():\n",
" prob = count / unigram_counts[w1]\n",
" bigram_probabilities[(w1, w2)] = prob\n",
"\n",
"print(\"\\nProbabilitas masing-masing bigram:\")\n",
"for (w1, w2), prob in bigram_probabilities.items():\n",
" print(f\" P({w2}|{w1}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
"\n",
"# 4. Hitung Probabilitas Kalimat Keseluruhan (Model Bigram)\n",
"# P(kalimat) = P(w1) * P(w2|w1) * P(w3|w2) * ...\n",
"total_tokens = sum(unigram_counts.values())\n",
"p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens # P(w1)\n",
"p_kalimat = p_w1 # Inisialisasi dengan P(w1)\n",
"\n",
"prob_str_parts = [f\"P({tokens[0]})={p_w1:.2f}\"] # Tambahkan P(w1) ke rumus\n",
"\n",
"for i in range(1, len(tokens)):\n",
" pair = (tokens[i-1], tokens[i])\n",
" p = bigram_probabilities.get(pair, 0)\n",
" p_kalimat *= p\n",
" prob_str_parts.append(f\"P({pair[1]}|{pair[0]})={p:.2f}\")\n",
"\n",
"# Gabungkan rumus perkalian untuk ditampilkan\n",
"prob_str = \" x \".join(prob_str_parts)\n",
"\n",
"print(\"\\nProbabilitas Keseluruhan Kalimat (Model Bigram):\")\n",
"print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "E6n1IU8X-G9S"
},
"source": [
"# **TRIGRAM**"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "BIRARsj2FHJg",
"outputId": "968d420e-9370-40e5-e7e1-148e1d351d62"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Corpus: Saya adalah anak dari ibu dan bapak saya\n",
"Tokens (8): ['saya', 'adalah', 'anak', 'dari', 'ibu', 'dan', 'bapak', 'saya']\n",
"\n",
"Frekuensi Trigram dalam kalimat:\n",
" ('saya', 'adalah', 'anak'): 1\n",
" ('adalah', 'anak', 'dari'): 1\n",
" ('anak', 'dari', 'ibu'): 1\n",
" ('dari', 'ibu', 'dan'): 1\n",
" ('ibu', 'dan', 'bapak'): 1\n",
" ('dan', 'bapak', 'saya'): 1\n",
"\n",
"Total trigram dalam 1 kalimat: 6\n",
"\n",
"Probabilitas masing-masing trigram:\n",
" P(anak|saya,adalah) = 1.00 (100.00%)\n",
" P(dari|adalah,anak) = 1.00 (100.00%)\n",
" P(ibu|anak,dari) = 1.00 (100.00%)\n",
" P(dan|dari,ibu) = 1.00 (100.00%)\n",
" P(bapak|ibu,dan) = 1.00 (100.00%)\n",
" P(saya|dan,bapak) = 1.00 (100.00%)\n",
"\n",
"Probabilitas Keseluruhan Kalimat (Model Trigram):\n",
" P(saya adalah anak dari ibu dan bapak saya) = P(saya)=0.25 x P(adalah|saya)=0.50 x P(anak|saya,adalah)=1.00 x P(dari|adalah,anak)=1.00 x P(ibu|anak,dari)=1.00 x P(dan|dari,ibu)=1.00 x P(bapak|ibu,dan)=1.00 x P(saya|dan,bapak)=1.00 = 0.125000 (12.50%)\n"
]
}
],
"source": [
"from collections import Counter\n",
"from IPython.display import clear_output\n",
"import math\n",
"\n",
"# 1. Input Kalimat dan Tokenisasi\n",
"kalimat = input(\"Masukkan kalimat: \").strip()\n",
"\n",
"# Bersihkan output (khusus lingkungan notebook)\n",
"try:\n",
" clear_output()\n",
"except:\n",
" pass\n",
"\n",
"print(f\"Corpus: {kalimat}\")\n",
"\n",
"# Tokenisasi\n",
"tokens = kalimat.lower().split()\n",
"print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
"\n",
"# 2. Hitung Frekuensi Bigram dan Trigram\n",
"bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]\n",
"trigrams = [(tokens[i], tokens[i+1], tokens[i+2]) for i in range(len(tokens) - 2)]\n",
"\n",
"bigram_counts = Counter(bigrams)\n",
"trigram_counts = Counter(trigrams)\n",
"\n",
"print(\"\\nFrekuensi Trigram dalam kalimat:\")\n",
"for tg, count in trigram_counts.items():\n",
" print(f\" {tg}: {count}\")\n",
"print(f\"\\nTotal trigram dalam 1 kalimat: {sum(trigram_counts.values())}\")\n",
"\n",
"# 3. Hitung Probabilitas Trigram: P(w3 | w1, w2) = Count(w1,w2,w3) / Count(w1,w2)\n",
"trigram_probabilities = {}\n",
"for (w1, w2, w3), count in trigram_counts.items():\n",
" # Hindari pembagian dengan nol (jika ada bigram yang tidak muncul)\n",
" if bigram_counts[(w1, w2)] > 0:\n",
" prob = count / bigram_counts[(w1, w2)]\n",
" else:\n",
" prob = 0\n",
" trigram_probabilities[(w1, w2, w3)] = prob\n",
"\n",
"print(\"\\nProbabilitas masing-masing trigram:\")\n",
"for (w1, w2, w3), prob in trigram_probabilities.items():\n",
" print(f\" P({w3}|{w1},{w2}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
"\n",
"# Tambahkan perhitungan Unigram Count (dibutuhkan untuk P(w1) dan P(w2|w1))\n",
"unigram_counts = Counter(tokens)\n",
"total_tokens = sum(unigram_counts.values())\n",
"\n",
"# 4. Hitung Probabilitas Kalimat Keseluruhan (Model Trigram)\n",
"# P(kalimat) = P(w1) * P(w2|w1) * P(w3|w1,w2) * ...\n",
"\n",
"# a. P(w1)\n",
"p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens if total_tokens > 0 else 0\n",
"\n",
"# b. P(w2|w1) (Menggunakan Bigram tanpa smoothing)\n",
"if len(tokens) > 1:\n",
" count_w1 = unigram_counts.get(tokens[0], 1) # Hindari pembagian dengan 0\n",
" p_w2_w1 = bigram_counts.get((tokens[0], tokens[1]), 0) / count_w1\n",
"else:\n",
" p_w2_w1 = 1.0 # Jika hanya 1 kata\n",
"\n",
"p_kalimat = p_w1 * p_w2_w1 # Inisialisasi dengan P(w1) * P(w2|w1)\n",
"\n",
"# Daftar bagian rumus untuk ditampilkan\n",
"prob_str_parts = [f\"P({tokens[0]})={p_w1:.2f}\"]\n",
"if len(tokens) > 1:\n",
" prob_str_parts.append(f\"P({tokens[1]}|{tokens[0]})={p_w2_w1:.2f}\")\n",
"\n",
"# c. Perkalian Trigram P(wi | wi-2, wi-1) untuk i >= 3\n",
"for i in range(len(tokens) - 2):\n",
" triplet = (tokens[i], tokens[i+1], tokens[i+2])\n",
" p = trigram_probabilities.get(triplet, 0)\n",
" p_kalimat *= p\n",
" prob_str_parts.append(f\"P({triplet[2]}|{triplet[0]},{triplet[1]})={p:.2f}\")\n",
"\n",
"prob_str = \" x \".join(prob_str_parts)\n",
"\n",
"print(\"\\nProbabilitas Keseluruhan Kalimat (Model Trigram):\")\n",
"print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

README.md Normal file

@@ -0,0 +1,3 @@
# Praktikum NLP
Name: Alprian Baharaja Sitorus