Compare commits


No commits in common. "master" and "main" have entirely different histories.
master ... main

4 changed files with 943 additions and 0 deletions


@@ -0,0 +1,318 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "qBYcPYAb059g",
"outputId": "ac27d686-2d15-4b2f-cc13-963fadf3100f"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Masukkan jumlah dokumen yang ingin dimasukkan: 4\n"
]
}
],
"source": [
"# Input jumlah dokumen\n",
"import pandas as pd\n",
"n = int(input(\"Masukkan jumlah dokumen yang ingin dimasukkan: \"))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mo-yt5Ob1N8j",
"outputId": "21da9ff1-2954-4b39-b207-017d03d0294f"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Masukkan teks untuk dokumen ke-1: saya ingin memasak\n",
"Masukkan teks untuk dokumen ke-2: masak ayam goreng sepertinya enak\n",
"Masukkan teks untuk dokumen ke-3: enakan ayam goreng atau ikan goreng\n",
"Masukkan teks untuk dokumen ke-4: dibarengi dengan saus sepertinya akan lezat\n",
"\n",
"=== Dokumen yang Dimasukkan ===\n",
"Doc 1: saya ingin memasak\n",
"Doc 2: masak ayam goreng sepertinya enak\n",
"Doc 3: enakan ayam goreng atau ikan goreng\n",
"Doc 4: dibarengi dengan saus sepertinya akan lezat\n"
]
}
],
"source": [
"# Input teks dokumen satu per satu\n",
"documents = []\n",
"for i in range(n):\n",
" teks = input(f\"Masukkan teks untuk dokumen ke-{i+1}: \")\n",
" documents.append(teks)\n",
"\n",
"print(\"\\n=== Dokumen yang Dimasukkan ===\")\n",
"for i, doc in enumerate(documents):\n",
" print(f\"Doc {i+1}: {doc}\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "FkmxRAFq1oDK",
"outputId": "e451e801-161a-4618-f047-97893cc7a68b"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"=== Hasil Tokenisasi ===\n",
"Doc 1: ['saya', 'ingin', 'memasak']\n",
"Doc 2: ['masak', 'ayam', 'goreng', 'sepertinya', 'enak']\n",
"Doc 3: ['enakan', 'ayam', 'goreng', 'atau', 'ikan', 'goreng']\n",
"Doc 4: ['dibarengi', 'dengan', 'saus', 'sepertinya', 'akan', 'lezat']\n"
]
}
],
"source": [
"# Tahap Tokenisasi\n",
"tokenized_docs = []\n",
"for doc in documents:\n",
" tokens = doc.lower().split()\n",
" tokenized_docs.append(tokens)\n",
"\n",
"print(\"\\n=== Hasil Tokenisasi ===\")\n",
"for i, tokens in enumerate(tokenized_docs):\n",
" print(f\"Doc {i+1}: {tokens}\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ybC1Vo2C_c3q",
"outputId": "f1e97af1-3af9-4dee-b59a-2a8baa79a370"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\n",
"['saya', 'ingin', 'memasak', 'masak', 'ayam', 'goreng', 'sepertinya', 'enak', 'enakan', 'ayam', 'goreng', 'atau', 'ikan', 'goreng', 'dibarengi', 'dengan', 'saus', 'sepertinya', 'akan', 'lezat']\n",
"Jumlah total kata dalam seluruh dokumen: 20\n"
]
}
],
"source": [
"# Pembuatan Corpus\n",
"corpus_all = [word for doc in tokenized_docs for word in doc]\n",
"\n",
"print(\"\\n=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\")\n",
"print(corpus_all)\n",
"print(f\"Jumlah total kata dalam seluruh dokumen: {len(corpus_all)}\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "s6S-Ma4R1xuq",
"outputId": "7643748e-937e-4724-8db0-0a768ad7182f"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"=== Vocabulary (Kata Unik) ===\n",
"['akan', 'atau', 'ayam', 'dengan', 'dibarengi', 'enak', 'enakan', 'goreng', 'ikan', 'ingin', 'lezat', 'masak', 'memasak', 'saus', 'saya', 'sepertinya']\n",
"Jumlah kata unik (vocabulary size): 16\n",
"\n",
"=== Vocabulary (Kata Unik) ===\n",
" 1. akan\n",
" 2. atau\n",
" 3. ayam\n",
" 4. dengan\n",
" 5. dibarengi\n",
" 6. enak\n",
" 7. enakan\n",
" 8. goreng\n",
" 9. ikan\n",
"10. ingin\n",
"11. lezat\n",
"12. masak\n",
"13. memasak\n",
"14. saus\n",
"15. saya\n",
"16. sepertinya\n",
"\n",
"Jumlah kata unik (vocabulary size): 16\n"
]
}
],
"source": [
"# Pembuatan Vocabulary\n",
"vocabulary = sorted(set(corpus_all))\n",
"\n",
"print(\"\\n=== Vocabulary (Kata Unik) ===\")\n",
"print(vocabulary)\n",
"print(f\"Jumlah kata unik (vocabulary size): {len(vocabulary)}\")\n",
"\n",
"\n",
"vocabulary = sorted(set(corpus_all))\n",
"\n",
"print(\"\\n=== Vocabulary (Kata Unik) ===\")\n",
"for idx, word in enumerate(vocabulary, start=1):\n",
" print(f\"{idx:>2}. {word}\")\n",
"print(f\"\\nJumlah kata unik (vocabulary size): {len(vocabulary)}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ShevCTva2Fg9"
},
"outputs": [],
"source": [
"# Representasi Numerik (Matriks BoW)\n",
"bow_matrix = []\n",
"for doc in tokenized_docs:\n",
" vector = [doc.count(word) for word in vocabulary]\n",
" bow_matrix.append(vector)"
]
},
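{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional cross-check, as a sketch (assumes scikit-learn is installed; not\n",
"# part of the original assignment): build the same counts with\n",
"# CountVectorizer, pinning its vocabulary and tokenizer to match ours.\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"\n",
"cv = CountVectorizer(vocabulary=vocabulary, token_pattern=r\"\\S+\")\n",
"bow_sklearn = cv.transform(documents).toarray()\n",
"print(\"Matches the manual BoW matrix:\", (bow_sklearn == bow_matrix).all())"
]
},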
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "-yB6D2pY2M0E",
"outputId": "b6b2f4d3-da8b-4aee-e9ce-034def4d5cf7"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Matriks Bag of Words ===\n",
" ai belajar dan data di kampus mahasiswa nlp saya science suka\n",
"D1 0 1 0 0 1 1 0 1 1 0 0\n",
"D2 1 1 0 0 0 0 0 0 1 0 1\n",
"D3 0 1 1 1 0 0 1 1 0 1 0\n"
]
}
],
"source": [
"df_bow = pd.DataFrame(bow_matrix, columns=vocabulary)\n",
"df_bow.index = [f\"D{i}\" for i in range(1, len(documents)+1)] # ubah label indeks jadi D1, D2, D3\n",
"\n",
"print(\"\\n=== Matriks Bag of Words ===\")\n",
"print(df_bow)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "8ruf5vKL2rGD",
"outputId": "65a4674e-1c01-4833-ec55-f66f77b8b6c2"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===\n",
" Kata Frekuensi\n",
"0 belajar 3\n",
"1 nlp 2\n",
"2 saya 2\n",
"3 dan 1\n",
"4 ai 1\n",
"5 data 1\n",
"6 di 1\n",
"7 mahasiswa 1\n",
"8 kampus 1\n",
"9 science 1\n",
"10 suka 1\n",
"Frekuensi kata: 11\n"
]
}
],
"source": [
"# Membuat Tabel Frekuensi Kata (Total dari seluruh dokumen)\n",
"word_frequencies = df_bow.sum().sort_values(ascending=False).reset_index()\n",
"word_frequencies.columns = [\"Kata\", \"Frekuensi\"]\n",
"\n",
"print(\"\\n=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===\")\n",
"print(word_frequencies)\n",
"print(f\"Frekuensi kata: {len(word_frequencies)}\")"
]
},
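{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Quick sanity check, as a sketch (not part of the original assignment):\n",
"# the same totals computed straight from the flat corpus with Counter.\n",
"from collections import Counter\n",
"\n",
"for word, freq in Counter(corpus_all).most_common():\n",
"    print(f\"{word:>12}: {freq}\")"
]
},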
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "NQjExannHuj0"
},
"outputs": [],
"source": []
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

Klasifikasi_Teks_FNN.ipynb (new file, 218 lines)

@@ -0,0 +1,218 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "f4a1399a-f23d-4060-a07e-bce5a5c7ddac",
"metadata": {
"id": "f4a1399a-f23d-4060-a07e-bce5a5c7ddac"
},
"source": [
"# Klasifikasi Teks menggunakan ANN\n",
"## Fahrizal Setiawan\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "53a214ae-c9cf-4d46-925d-068f1685537b",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "53a214ae-c9cf-4d46-925d-068f1685537b",
"outputId": "f224e8ff-e3a6-49d9-fac9-cafc0202eb4c"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"=== Classification Report ===\n",
" precision recall f1-score support\n",
"\n",
" negative 0.33 1.00 0.50 1\n",
" positive 0.00 0.00 0.00 2\n",
"\n",
" accuracy 0.33 3\n",
" macro avg 0.17 0.50 0.25 3\n",
"weighted avg 0.11 0.33 0.17 3\n",
"\n",
"=== Confusion Matrix ===\n",
"[[1 0]\n",
" [2 0]]\n",
"\n",
"Prediksi untuk: barang buruk, saya kecewa\n",
"Hasil: negative\n"
]
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.12/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
"/usr/local/lib/python3.12/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n",
"/usr/local/lib/python3.12/dist-packages/sklearn/metrics/_classification.py:1565: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
" _warn_prf(average, modifier, f\"{metric.capitalize()} is\", len(result))\n"
]
}
],
"source": [
"# ---------------------------------------------------------\n",
"# Klasifikasi Teks dengan TF-IDF + Feedforward Neural Network\n",
"# ---------------------------------------------------------\n",
"\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.neural_network import MLPClassifier\n",
"from sklearn.metrics import classification_report, confusion_matrix\n",
"\n",
"# -----------------------------------------\n",
"# 1. Contoh Dataset\n",
"# -----------------------------------------\n",
"# Anda bisa mengganti dataset ini dengan dataset lain (CSV, JSON, dll)\n",
"\n",
"data = {\n",
" \"text\": [\n",
" \"Saya suka produk ini, luar biasa\",\n",
" \"Layanannya buruk, saya sangat kecewa\",\n",
" \"Penjual tidak responsif, sangat kecewa\",\n",
" \"Pembelian terbaik yang pernah saya lakukan\",\n",
" \"Saya benci produk ini, buang-buang uang\",\n",
" \"Kualitasnya sangat bagus, direkomendasikan\",\n",
" \"Pengalaman buruk, tidak akan membeli lagi\",\n",
"\n",
" ],\n",
" \"label\": [\n",
" \"positive\",\n",
" \"negative\",\n",
" \"negative\", # Corrected: Was positive, now negative to match sentiment\n",
" \"positive\", # Corrected: Was negative, now positive to match sentiment\n",
" \"negative\", # Corrected: Was positive, now negative to match sentiment\n",
" \"positive\", # Corrected: Was negative, now positive to match sentiment\n",
" \"negative\",\n",
" # Added missing label to match length of 'text' list\n",
" ]\n",
"}\n",
"\n",
"df = pd.DataFrame(data)\n",
"\n",
"# -----------------------------------------\n",
"# 2. Split Train & Test\n",
"# -----------------------------------------\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
" df[\"text\"], df[\"label\"], test_size=0.3, random_state=42\n",
")\n",
"\n",
"# -----------------------------------------\n",
"# 3. TF-IDF Vectorization\n",
"# -----------------------------------------\n",
"tfidf = TfidfVectorizer(max_features=5000)\n",
"X_train_tfidf = tfidf.fit_transform(X_train)\n",
"X_test_tfidf = tfidf.transform(X_test)\n",
"\n",
"# -----------------------------------------\n",
"# 4. Feedforward ANN (MLPClassifier)\n",
"# -----------------------------------------\n",
"model = MLPClassifier(\n",
" hidden_layer_sizes=(256, 64),\n",
" activation='relu',\n",
" solver='adam',\n",
" max_iter=500,\n",
" random_state=42\n",
")\n",
"\n",
"model.fit(X_train_tfidf, y_train)\n",
"\n",
"# -----------------------------------------\n",
"# 5. Evaluasi Model\n",
"# -----------------------------------------\n",
"y_pred = model.predict(X_test_tfidf)\n",
"\n",
"print(\"=== Classification Report ===\")\n",
"print(classification_report(y_test, y_pred))\n",
"\n",
"print(\"=== Confusion Matrix ===\")\n",
"print(confusion_matrix(y_test, y_pred))\n",
"\n",
"# -----------------------------------------\n",
"# 6. Prediksi Teks Baru\n",
"# -----------------------------------------\n",
"#sample_text = [\"barang bagus luar biasa\"]\n",
"sample_text = [\"barang buruk, saya kecewa\"]\n",
"sample_vec = tfidf.transform(sample_text)\n",
"prediction = model.predict(sample_vec)\n",
"\n",
"print(\"\\nPrediksi untuk:\", sample_text[0])\n",
"print(\"Hasil:\", prediction[0])"
]
},
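{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A small sketch (assumes `model` and `tfidf` are the objects fitted above):\n",
"# MLPClassifier also exposes class probabilities, which show how confident\n",
"# the network is rather than just the hard label.\n",
"proba = model.predict_proba(sample_vec)[0]\n",
"for label, p in zip(model.classes_, proba):\n",
"    print(f\"P({label}) = {p:.3f}\")"
]
},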
{
"cell_type": "code",
"execution_count": 12,
"id": "9f7d90fe-4af4-446c-9547-c9312bfa6fc7",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "9f7d90fe-4af4-446c-9547-c9312bfa6fc7",
"outputId": "4a889f91-ff57-459e-8987-43a230489899"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"Prediksi untuk: saya benci barang ini\n",
"Hasil: negative\n"
]
}
],
"source": [
"#sample_text = [\"barang bagus luar biasa\"]\n",
"sample_text = [\"saya benci barang ini\"]\n",
"sample_vec = tfidf.transform(sample_text)\n",
"prediction = model.predict(sample_vec)\n",
"print(\"\\nPrediksi untuk:\", sample_text[0])\n",
"print(\"Hasil:\", prediction[0])\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d4b9a7c2-0f08-43fd-8da8-018d839a4917",
"metadata": {
"id": "d4b9a7c2-0f08-43fd-8da8-018d839a4917"
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.12"
},
"colab": {
"provenance": []
}
},
"nbformat": 4,
"nbformat_minor": 5
}

N_Gram f.ipynb (new file, 394 lines)

@@ -0,0 +1,394 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "4Mvva3v65h1v"
},
"source": [
"# **UNIGRAM**"
]
},
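{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, the cell below estimates each word's probability by relative frequency and, under the unigram independence assumption, scores the whole sentence as a product:\n",
"\n",
"$$P(w_i) = \\frac{\\text{Count}(w_i)}{N}, \\qquad P(w_1 \\dots w_n) = \\prod_{i=1}^{n} P(w_i)$$\n",
"\n",
"where $N$ is the total number of tokens in the sentence.\n"
]
},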
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"id": "1cub_VJnUJMl",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "a712acbd-01e2-4c9e-f2c0-d7d33f3bc9fb"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Corpus: Jangan pernah berhenti belajar, karena hidup tak pernah berhenti mengajarkan\n",
"Tokens (10): ['jangan', 'pernah', 'berhenti', 'belajar,', 'karena', 'hidup', 'tak', 'pernah', 'berhenti', 'mengajarkan']\n",
"\n",
"Frekuensi Unigram dalam kalimat\n",
" ('jangan'): 1\n",
" ('pernah'): 2\n",
" ('berhenti'): 2\n",
" ('belajar,'): 1\n",
" ('karena'): 1\n",
" ('hidup'): 1\n",
" ('tak'): 1\n",
" ('mengajarkan'): 1\n",
"\n",
"Total unigram dalam 1 kalimat: 10\n",
"\n",
"Probabilitas masing-masing unigram:\n",
" P(jangan) = 0.10 (10.00%)\n",
" P(pernah) = 0.20 (20.00%)\n",
" P(berhenti) = 0.20 (20.00%)\n",
" P(belajar,) = 0.10 (10.00%)\n",
" P(karena) = 0.10 (10.00%)\n",
" P(hidup) = 0.10 (10.00%)\n",
" P(tak) = 0.10 (10.00%)\n",
" P(mengajarkan) = 0.10 (10.00%)\n",
"\n",
"Probabilitas Keseluruhan Kalimat (Model Unigram):\n",
" P(jangan pernah berhenti belajar, karena hidup tak pernah berhenti mengajarkan) = P(jangan)=0.10 x P(pernah)=0.20 x P(berhenti)=0.20 x P(belajar,)=0.10 x P(karena)=0.10 x P(hidup)=0.10 x P(tak)=0.10 x P(pernah)=0.20 x P(berhenti)=0.20 x P(mengajarkan)=0.10 = 0.0000 (0.00%)\n"
]
}
],
"source": [
"from collections import Counter\n",
"from IPython.display import clear_output\n",
"import math\n",
"\n",
"# 1. Input Kalimat dan Tokenisasi\n",
"kalimat = input(\"Masukkan kalimat: Jangan pernah berhenti belajar, karena hidup tak pernah berhenti mengajarkan \").strip()\n",
"\n",
"# Bersihkan output (khusus lingkungan notebook)\n",
"try:\n",
" clear_output()\n",
"except:\n",
" pass\n",
"\n",
"print(f\"Corpus: {kalimat}\")\n",
"\n",
"# Tokenize\n",
"tokens = kalimat.lower().split()\n",
"print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
"\n",
"# 2. Hitung Frekuensi Unigram\n",
"unigram_counts = Counter(tokens)\n",
"total_tokens = sum(unigram_counts.values())\n",
"\n",
"print(\"\\nFrekuensi Unigram dalam kalimat\")\n",
"for pair, count in unigram_counts.items():\n",
" print(f\" ('{pair}'): {count}\")\n",
"print(f\"\\nTotal unigram dalam 1 kalimat: {total_tokens}\")\n",
"\n",
"# 3. Hitung Probabilitas Unigram: P(wi) = Count(wi) / Total Kata\n",
"unigram_probabilities = {}\n",
"for word, count in unigram_counts.items():\n",
" prob = count / total_tokens\n",
" unigram_probabilities[word] = prob\n",
"\n",
"print(\"\\nProbabilitas masing-masing unigram:\")\n",
"for word, prob in unigram_probabilities.items():\n",
" print(f\" P({word}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
"\n",
"# 4. Hitung Probabilitas Kalimat Keseluruhan (P(kalimat) = P(w1) * P(w2) * ...)\n",
"p_kalimat = 1\n",
"prob_parts = []\n",
"\n",
"# Loop untuk menghitung probabilitas total dan membangun string rumus detail\n",
"for word in tokens:\n",
" prob_value = unigram_probabilities[word]\n",
" p_kalimat *= prob_value\n",
" # Format: P(word)=prob_value\n",
" prob_parts.append(f\"P({word})={prob_value:.2f}\")\n",
"\n",
"# Gabungkan bagian-bagian rumus untuk mendapatkan prob_str detail\n",
"prob_str = \" x \".join(prob_parts)\n",
"\n",
"print(\"\\nProbabilitas Keseluruhan Kalimat (Model Unigram):\")\n",
"print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.4f} ({p_kalimat*100:.2f}%)\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Vstwt996-FrS"
},
"source": [
"# **BIGRAM**"
]
},
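{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, the bigram model below conditions each word on its predecessor, using maximum-likelihood counts, and chains the estimates:\n",
"\n",
"$$P(w_i \\mid w_{i-1}) = \\frac{\\text{Count}(w_{i-1}, w_i)}{\\text{Count}(w_{i-1})}, \\qquad P(w_1 \\dots w_n) = P(w_1) \\prod_{i=2}^{n} P(w_i \\mid w_{i-1})$$\n"
]
},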
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "XRIY4qgTVbjl",
"outputId": "4eff35ea-8a13-4b4a-fd8f-e0f3518c1add"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Corpus: ilmu adalah cahaya, dan belajar adalah menyalakan lentera dalam kegelapan\n",
"Tokens (10): ['ilmu', 'adalah', 'cahaya,', 'dan', 'belajar', 'adalah', 'menyalakan', 'lentera', 'dalam', 'kegelapan']\n",
"\n",
"Frekuensi Bigram dalam kalimat:\n",
" ('ilmu', 'adalah'): 1\n",
" ('adalah', 'cahaya,'): 1\n",
" ('cahaya,', 'dan'): 1\n",
" ('dan', 'belajar'): 1\n",
" ('belajar', 'adalah'): 1\n",
" ('adalah', 'menyalakan'): 1\n",
" ('menyalakan', 'lentera'): 1\n",
" ('lentera', 'dalam'): 1\n",
" ('dalam', 'kegelapan'): 1\n",
"\n",
"Total bigram dalam 1 kalimat: 9\n",
"\n",
"Probabilitas masing-masing bigram:\n",
" P(adalah|ilmu) = 1.00 (100.00%)\n",
" P(cahaya,|adalah) = 0.50 (50.00%)\n",
" P(dan|cahaya,) = 1.00 (100.00%)\n",
" P(belajar|dan) = 1.00 (100.00%)\n",
" P(adalah|belajar) = 1.00 (100.00%)\n",
" P(menyalakan|adalah) = 0.50 (50.00%)\n",
" P(lentera|menyalakan) = 1.00 (100.00%)\n",
" P(dalam|lentera) = 1.00 (100.00%)\n",
" P(kegelapan|dalam) = 1.00 (100.00%)\n",
"\n",
"Probabilitas Keseluruhan Kalimat (Model Bigram):\n",
" P(ilmu adalah cahaya, dan belajar adalah menyalakan lentera dalam kegelapan) = P(ilmu)=0.10 x P(adalah|ilmu)=1.00 x P(cahaya,|adalah)=0.50 x P(dan|cahaya,)=1.00 x P(belajar|dan)=1.00 x P(adalah|belajar)=1.00 x P(menyalakan|adalah)=0.50 x P(lentera|menyalakan)=1.00 x P(dalam|lentera)=1.00 x P(kegelapan|dalam)=1.00 = 0.025000 (2.50%)\n"
]
}
],
"source": [
"from collections import Counter\n",
"from IPython.display import clear_output\n",
"import math\n",
"\n",
"# 1. Input Kalimat dan Tokenisasi\n",
"kalimat = input(\"Masukkan kalimat: Ilmu adalah cahaya, dan belajar adalah menyalakan lentera dalam kegelapan \").strip()\n",
"\n",
"# Bersihkan output (khusus lingkungan notebook)\n",
"try:\n",
" clear_output()\n",
"except:\n",
" pass\n",
"\n",
"print(f\"Corpus: {kalimat}\")\n",
"\n",
"# Tokenisasi\n",
"tokens = kalimat.lower().split()\n",
"print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
"\n",
"# 2. Hitung Frekuensi Unigram dan Bigram\n",
"unigram_counts = Counter(tokens)\n",
"bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]\n",
"bigram_counts = Counter(bigrams)\n",
"\n",
"print(\"\\nFrekuensi Bigram dalam kalimat:\")\n",
"for pair, count in bigram_counts.items():\n",
" print(f\" {pair}: {count}\")\n",
"print(f\"\\nTotal bigram dalam 1 kalimat: {sum(bigram_counts.values())}\")\n",
"\n",
"# 3. Hitung Probabilitas Bigram: P(w2 | w1) = Count(w1,w2) / Count(w1)\n",
"bigram_probabilities = {}\n",
"for (w1, w2), count in bigram_counts.items():\n",
" prob = count / unigram_counts[w1]\n",
" bigram_probabilities[(w1, w2)] = prob\n",
"\n",
"print(\"\\nProbabilitas masing-masing bigram:\")\n",
"for (w1, w2), prob in bigram_probabilities.items():\n",
" print(f\" P({w2}|{w1}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
"\n",
"# 4. Hitung Probabilitas Kalimat Keseluruhan (Model Bigram)\n",
"# P(kalimat) = P(w1) * P(w2|w1) * P(w3|w2) * ...\n",
"total_tokens = sum(unigram_counts.values())\n",
"p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens # P(w1)\n",
"p_kalimat = p_w1 # Inisialisasi dengan P(w1)\n",
"\n",
"prob_str_parts = [f\"P({tokens[0]})={p_w1:.2f}\"] # Tambahkan P(w1) ke rumus\n",
"\n",
"for i in range(1, len(tokens)):\n",
" pair = (tokens[i-1], tokens[i])\n",
" p = bigram_probabilities.get(pair, 0)\n",
" p_kalimat *= p\n",
" prob_str_parts.append(f\"P({pair[1]}|{pair[0]})={p:.2f}\")\n",
"\n",
"# Gabungkan rumus perkalian untuk ditampilkan\n",
"prob_str = \" x \".join(prob_str_parts)\n",
"\n",
"print(\"\\nProbabilitas Keseluruhan Kalimat (Model Bigram):\")\n",
"print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "E6n1IU8X-G9S"
},
"source": [
"# **TRIGRAM**"
]
},
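{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, the trigram model conditions each word on the two preceding words:\n",
"\n",
"$$P(w_i \\mid w_{i-2}, w_{i-1}) = \\frac{\\text{Count}(w_{i-2}, w_{i-1}, w_i)}{\\text{Count}(w_{i-2}, w_{i-1})}$$\n",
"\n",
"and the sentence probability chains $P(w_1)$, $P(w_2 \\mid w_1)$, and the trigram terms, exactly as the code below computes.\n"
]
},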
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "BIRARsj2FHJg",
"outputId": "6e09b998-b787-4c91-a710-57a809bf2223"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Corpus: belajar adalah kunci membuka pintu kesuksesan\n",
"Tokens (6): ['belajar', 'adalah', 'kunci', 'membuka', 'pintu', 'kesuksesan']\n",
"\n",
"Frekuensi Trigram dalam kalimat:\n",
" ('belajar', 'adalah', 'kunci'): 1\n",
" ('adalah', 'kunci', 'membuka'): 1\n",
" ('kunci', 'membuka', 'pintu'): 1\n",
" ('membuka', 'pintu', 'kesuksesan'): 1\n",
"\n",
"Total trigram dalam 1 kalimat: 4\n",
"\n",
"Probabilitas masing-masing trigram:\n",
" P(kunci|belajar,adalah) = 1.00 (100.00%)\n",
" P(membuka|adalah,kunci) = 1.00 (100.00%)\n",
" P(pintu|kunci,membuka) = 1.00 (100.00%)\n",
" P(kesuksesan|membuka,pintu) = 1.00 (100.00%)\n",
"\n",
"Probabilitas Keseluruhan Kalimat (Model Trigram):\n",
" P(belajar adalah kunci membuka pintu kesuksesan) = P(belajar)=0.17 x P(adalah|belajar)=1.00 x P(kunci|belajar,adalah)=1.00 x P(membuka|adalah,kunci)=1.00 x P(pintu|kunci,membuka)=1.00 x P(kesuksesan|membuka,pintu)=1.00 = 0.166667 (16.67%)\n"
]
}
],
"source": [
"from collections import Counter\n",
"from IPython.display import clear_output\n",
"import math\n",
"\n",
"# 1. Input Kalimat dan Tokenisasi\n",
"kalimat = input(\"Masukkan kalimat: Belajar adalah kunci membuka pintu kesuksesan\").strip()\n",
"\n",
"# Bersihkan output (khusus lingkungan notebook)\n",
"try:\n",
" clear_output()\n",
"except:\n",
" pass\n",
"\n",
"print(f\"Corpus: {kalimat}\")\n",
"\n",
"# Tokenisasi\n",
"tokens = kalimat.lower().split()\n",
"print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
"\n",
"# 2. Hitung Frekuensi Bigram dan Trigram\n",
"bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]\n",
"trigrams = [(tokens[i], tokens[i+1], tokens[i+2]) for i in range(len(tokens) - 2)]\n",
"\n",
"bigram_counts = Counter(bigrams)\n",
"trigram_counts = Counter(trigrams)\n",
"\n",
"print(\"\\nFrekuensi Trigram dalam kalimat:\")\n",
"for tg, count in trigram_counts.items():\n",
" print(f\" {tg}: {count}\")\n",
"print(f\"\\nTotal trigram dalam 1 kalimat: {sum(trigram_counts.values())}\")\n",
"\n",
"# 3. Hitung Probabilitas Trigram: P(w3 | w1, w2) = Count(w1,w2,w3) / Count(w1,w2)\n",
"trigram_probabilities = {}\n",
"for (w1, w2, w3), count in trigram_counts.items():\n",
" # Hindari pembagian dengan nol (jika ada bigram yang tidak muncul)\n",
" if bigram_counts[(w1, w2)] > 0:\n",
" prob = count / bigram_counts[(w1, w2)]\n",
" else:\n",
" prob = 0\n",
" trigram_probabilities[(w1, w2, w3)] = prob\n",
"\n",
"print(\"\\nProbabilitas masing-masing trigram:\")\n",
"for (w1, w2, w3), prob in trigram_probabilities.items():\n",
" print(f\" P({w3}|{w1},{w2}) = {prob:.2f} ({prob*100:.2f}%)\")\n",
"\n",
"# Tambahkan perhitungan Unigram Count (dibutuhkan untuk P(w1) dan P(w2|w1))\n",
"unigram_counts = Counter(tokens)\n",
"total_tokens = sum(unigram_counts.values())\n",
"\n",
"# 4. Hitung Probabilitas Kalimat Keseluruhan (Model Trigram)\n",
"# P(kalimat) = P(w1) * P(w2|w1) * P(w3|w1,w2) * ...\n",
"\n",
"# a. P(w1)\n",
"p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens if total_tokens > 0 else 0\n",
"\n",
"# b. P(w2|w1) (Menggunakan Bigram tanpa smoothing)\n",
"if len(tokens) > 1:\n",
" count_w1 = unigram_counts.get(tokens[0], 1) # Hindari pembagian dengan 0\n",
" p_w2_w1 = bigram_counts.get((tokens[0], tokens[1]), 0) / count_w1\n",
"else:\n",
" p_w2_w1 = 1.0 # Jika hanya 1 kata\n",
"\n",
"p_kalimat = p_w1 * p_w2_w1 # Inisialisasi dengan P(w1) * P(w2|w1)\n",
"\n",
"# Daftar bagian rumus untuk ditampilkan\n",
"prob_str_parts = [f\"P({tokens[0]})={p_w1:.2f}\"]\n",
"if len(tokens) > 1:\n",
" prob_str_parts.append(f\"P({tokens[1]}|{tokens[0]})={p_w2_w1:.2f}\")\n",
"\n",
"# c. Perkalian Trigram P(wi | wi-2, wi-1) untuk i >= 3\n",
"for i in range(len(tokens) - 2):\n",
" triplet = (tokens[i], tokens[i+1], tokens[i+2])\n",
" p = trigram_probabilities.get(triplet, 0)\n",
" p_kalimat *= p\n",
" prob_str_parts.append(f\"P({triplet[2]}|{triplet[0]},{triplet[1]})={p:.2f}\")\n",
"\n",
"prob_str = \" x \".join(prob_str_parts)\n",
"\n",
"print(\"\\nProbabilitas Keseluruhan Kalimat (Model Trigram):\")\n",
"print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.6f} ({p_kalimat*100:.2f}%)\")\n"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

README.md (new file, 13 lines)

@@ -0,0 +1,13 @@
# repo repo repo
My name is Fahrizal Setiawan.
I want to become a hero like Deku-san.
Why? Because the hero Himmel would have done the same.