Update tiga file praktikum NLP (Fatah): - Fitur Ekstraksi BOW - Klasifikasi Teks TF-IDF + ANN
Tugas praktikum sudah saya edit sesuai instruksi dosen. Signed-off-by: 202210715288 FATAH SABILA ROSYAD <202210715288@mhs.ubharajaya.ac.id>
This commit is contained in:
parent
887e4d3725
commit
e871ab77ad
742
Fitur_Ekstraksi_BOW.ipynb
Normal file
742
Fitur_Ekstraksi_BOW.ipynb
Normal file
@ -0,0 +1,742 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Fitur Ekstraksi Bag-of-Words (BOW)\n",
|
||||
"\n",
|
||||
"**Nama:** Fatah Sabila Rosyad \n",
|
||||
"**NIM:** 202210715288 \n",
|
||||
"**Kelas:** F7B2 \n",
|
||||
"**MK:** NLP \n",
|
||||
"\n",
|
||||
"**Tujuan praktikum:** \n",
|
||||
"Melakukan ekstraksi fitur teks menggunakan Bag-of-Words dengan variasi parameter, yaitu: \n",
|
||||
"- Mengubah contoh teks \n",
|
||||
"- Mengubah jumlah fitur (`max_features`) \n",
|
||||
"- Menggunakan rentang n-gram baru (`ngram_range = (1,3)`) \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdin",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Masukkan jumlah dokumen yang ingin dimasukkan: 3\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Input jumlah dokumen\n",
|
||||
"n = int(input(\"Masukkan jumlah dokumen yang ingin dimasukkan: \"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "mo-yt5Ob1N8j",
|
||||
"outputId": "362ac3e0-d84b-4014-db96-cc3b10ecdb32"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdin",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Masukkan teks untuk dokumen ke-1: Saya Fatah Sabila Rosyad, mahasiswa informatika yang sedang mempelajari NLP dan machine learning\n",
|
||||
"Masukkan teks untuk dokumen ke-2: Fatah melakukan eksperimen Bag-of-Words untuk melihat bagaimana fitur teks direpresentasikan sebagai angka\n",
|
||||
"Masukkan teks untuk dokumen ke-3: Pada tahap ini Fatah menggunakan n-gram untuk menguji kombinasi kata yang sering muncul bersama\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"=== Dokumen yang Dimasukkan ===\n",
|
||||
"Doc 1: Saya Fatah Sabila Rosyad, mahasiswa informatika yang sedang mempelajari NLP dan machine learning\n",
|
||||
"Doc 2: Fatah melakukan eksperimen Bag-of-Words untuk melihat bagaimana fitur teks direpresentasikan sebagai angka\n",
|
||||
"Doc 3: Pada tahap ini Fatah menggunakan n-gram untuk menguji kombinasi kata yang sering muncul bersama\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Input teks dokumen satu per satu\n",
|
||||
"documents = []\n",
|
||||
"for i in range(n):\n",
|
||||
" teks = input(f\"Masukkan teks untuk dokumen ke-{i+1}: \")\n",
|
||||
" documents.append(teks)\n",
|
||||
"\n",
|
||||
"print(\"\\n=== Dokumen yang Dimasukkan ===\")\n",
|
||||
"for i, doc in enumerate(documents):\n",
|
||||
" print(f\"Doc {i+1}: {doc}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "FkmxRAFq1oDK",
|
||||
"outputId": "62c4508e-1725-4f30-fbdb-4de8072498b2"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"=== Hasil Tokenisasi ===\n",
|
||||
"Doc 1: ['saya', 'fatah', 'sabila', 'rosyad,', 'mahasiswa', 'informatika', 'yang', 'sedang', 'mempelajari', 'nlp', 'dan', 'machine', 'learning']\n",
|
||||
"Doc 2: ['fatah', 'melakukan', 'eksperimen', 'bag-of-words', 'untuk', 'melihat', 'bagaimana', 'fitur', 'teks', 'direpresentasikan', 'sebagai', 'angka']\n",
|
||||
"Doc 3: ['pada', 'tahap', 'ini', 'fatah', 'menggunakan', 'n-gram', 'untuk', 'menguji', 'kombinasi', 'kata', 'yang', 'sering', 'muncul', 'bersama']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Tahap Tokenisasi\n",
|
||||
"tokenized_docs = []\n",
|
||||
"for doc in documents:\n",
|
||||
" tokens = doc.lower().split()\n",
|
||||
" tokenized_docs.append(tokens)\n",
|
||||
"\n",
|
||||
"print(\"\\n=== Hasil Tokenisasi ===\")\n",
|
||||
"for i, tokens in enumerate(tokenized_docs):\n",
|
||||
" print(f\"Doc {i+1}: {tokens}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "ybC1Vo2C_c3q",
|
||||
"outputId": "fa31c57e-5364-4ded-fcd0-54d0db46c34b"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\n",
|
||||
"['saya', 'fatah', 'sabila', 'rosyad,', 'mahasiswa', 'informatika', 'yang', 'sedang', 'mempelajari', 'nlp', 'dan', 'machine', 'learning', 'fatah', 'melakukan', 'eksperimen', 'bag-of-words', 'untuk', 'melihat', 'bagaimana', 'fitur', 'teks', 'direpresentasikan', 'sebagai', 'angka', 'pada', 'tahap', 'ini', 'fatah', 'menggunakan', 'n-gram', 'untuk', 'menguji', 'kombinasi', 'kata', 'yang', 'sering', 'muncul', 'bersama']\n",
|
||||
"Jumlah total kata dalam seluruh dokumen: 39\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Pembuatan Corpus\n",
|
||||
"corpus_all = [word for doc in tokenized_docs for word in doc]\n",
|
||||
"\n",
|
||||
"print(\"\\n=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\")\n",
|
||||
"print(corpus_all)\n",
|
||||
"print(f\"Jumlah total kata dalam seluruh dokumen: {len(corpus_all)}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"=== Hasil BOW Manual (Frekuensi Kata) ===\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'fatah': 3,\n",
|
||||
" 'yang': 2,\n",
|
||||
" 'untuk': 2,\n",
|
||||
" 'saya': 1,\n",
|
||||
" 'sabila': 1,\n",
|
||||
" 'rosyad,': 1,\n",
|
||||
" 'mahasiswa': 1,\n",
|
||||
" 'informatika': 1,\n",
|
||||
" 'sedang': 1,\n",
|
||||
" 'mempelajari': 1,\n",
|
||||
" 'nlp': 1,\n",
|
||||
" 'dan': 1,\n",
|
||||
" 'machine': 1,\n",
|
||||
" 'learning': 1,\n",
|
||||
" 'melakukan': 1,\n",
|
||||
" 'eksperimen': 1,\n",
|
||||
" 'bag-of-words': 1,\n",
|
||||
" 'melihat': 1,\n",
|
||||
" 'bagaimana': 1,\n",
|
||||
" 'fitur': 1,\n",
|
||||
" 'teks': 1,\n",
|
||||
" 'direpresentasikan': 1,\n",
|
||||
" 'sebagai': 1,\n",
|
||||
" 'angka': 1,\n",
|
||||
" 'pada': 1,\n",
|
||||
" 'tahap': 1,\n",
|
||||
" 'ini': 1,\n",
|
||||
" 'menggunakan': 1,\n",
|
||||
" 'n-gram': 1,\n",
|
||||
" 'menguji': 1,\n",
|
||||
" 'kombinasi': 1,\n",
|
||||
" 'kata': 1,\n",
|
||||
" 'sering': 1,\n",
|
||||
" 'muncul': 1,\n",
|
||||
" 'bersama': 1}"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# 4) Membuat Bag-of-Words manual (frekuensi kata)\n",
|
||||
"bow_manual = {}\n",
|
||||
"for tokens in tokenized_docs:\n",
|
||||
" for token in tokens:\n",
|
||||
" bow_manual[token] = bow_manual.get(token, 0) + 1\n",
|
||||
"\n",
|
||||
"print(\"\\n=== Hasil BOW Manual (Frekuensi Kata) ===\")\n",
|
||||
"# Tampilkan dictionary secara sorted by frequency (desc)\n",
|
||||
"bow_sorted = dict(sorted(bow_manual.items(), key=lambda x: x[1], reverse=True))\n",
|
||||
"bow_sorted"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "s6S-Ma4R1xuq",
|
||||
"outputId": "98c3685b-1798-4038-d17e-6e45ca419b51"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"=== Vocabulary (Kata Unik) ===\n",
|
||||
"['angka', 'bag-of-words', 'bagaimana', 'bersama', 'dan', 'direpresentasikan', 'eksperimen', 'fatah', 'fitur', 'informatika', 'ini', 'kata', 'kombinasi', 'learning', 'machine', 'mahasiswa', 'melakukan', 'melihat', 'mempelajari', 'menggunakan', 'menguji', 'muncul', 'n-gram', 'nlp', 'pada', 'rosyad,', 'sabila', 'saya', 'sebagai', 'sedang', 'sering', 'tahap', 'teks', 'untuk', 'yang']\n",
|
||||
"Jumlah kata unik (vocabulary size): 35\n",
|
||||
"\n",
|
||||
"=== Vocabulary (Kata Unik) ===\n",
|
||||
" 1. angka\n",
|
||||
" 2. bag-of-words\n",
|
||||
" 3. bagaimana\n",
|
||||
" 4. bersama\n",
|
||||
" 5. dan\n",
|
||||
" 6. direpresentasikan\n",
|
||||
" 7. eksperimen\n",
|
||||
" 8. fatah\n",
|
||||
" 9. fitur\n",
|
||||
"10. informatika\n",
|
||||
"11. ini\n",
|
||||
"12. kata\n",
|
||||
"13. kombinasi\n",
|
||||
"14. learning\n",
|
||||
"15. machine\n",
|
||||
"16. mahasiswa\n",
|
||||
"17. melakukan\n",
|
||||
"18. melihat\n",
|
||||
"19. mempelajari\n",
|
||||
"20. menggunakan\n",
|
||||
"21. menguji\n",
|
||||
"22. muncul\n",
|
||||
"23. n-gram\n",
|
||||
"24. nlp\n",
|
||||
"25. pada\n",
|
||||
"26. rosyad,\n",
|
||||
"27. sabila\n",
|
||||
"28. saya\n",
|
||||
"29. sebagai\n",
|
||||
"30. sedang\n",
|
||||
"31. sering\n",
|
||||
"32. tahap\n",
|
||||
"33. teks\n",
|
||||
"34. untuk\n",
|
||||
"35. yang\n",
|
||||
"\n",
|
||||
"Jumlah kata unik (vocabulary size): 35\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Pembuatan Vocabulary\n",
|
||||
"vocabulary = sorted(set(corpus_all))\n",
|
||||
"\n",
|
||||
"print(\"\\n=== Vocabulary (Kata Unik) ===\")\n",
|
||||
"print(vocabulary)\n",
|
||||
"print(f\"Jumlah kata unik (vocabulary size): {len(vocabulary)}\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"vocabulary = sorted(set(corpus_all))\n",
|
||||
"\n",
|
||||
"print(\"\\n=== Vocabulary (Kata Unik) ===\")\n",
|
||||
"for idx, word in enumerate(vocabulary, start=1):\n",
|
||||
" print(f\"{idx:>2}. {word}\")\n",
|
||||
"print(f\"\\nJumlah kata unik (vocabulary size): {len(vocabulary)}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Jumlah dokumen: 3\n",
|
||||
"Jumlah tokenized_docs: 3\n",
|
||||
"Jumlah kata di vocabulary (unique): 35\n",
|
||||
"Contoh 10 kata pertama vocabulary: ['angka', 'bag-of-words', 'bagaimana', 'bersama', 'dan', 'direpresentasikan', 'eksperimen', 'fatah', 'fitur', 'informatika']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(\"Jumlah dokumen:\", len(documents))\n",
|
||||
"print(\"Jumlah tokenized_docs:\", len(tokenized_docs))\n",
|
||||
"print(\"Jumlah kata di vocabulary (unique):\", len(vocabulary))\n",
|
||||
"print(\"Contoh 10 kata pertama vocabulary:\", vocabulary[:10])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"=== Info CountVectorizer ===\n",
|
||||
"n-gram range: (1, 3)\n",
|
||||
"max_features: 500\n",
|
||||
"Jumlah fitur (vocabulary size): 110\n",
|
||||
"\n",
|
||||
"Contoh 20 fitur pertama:\n",
|
||||
"['angka' 'bag' 'bag of' 'bag of words' 'bagaimana' 'bagaimana fitur'\n",
|
||||
" 'bagaimana fitur teks' 'bersama' 'dan' 'dan machine'\n",
|
||||
" 'dan machine learning' 'direpresentasikan' 'direpresentasikan sebagai'\n",
|
||||
" 'direpresentasikan sebagai angka' 'eksperimen' 'eksperimen bag'\n",
|
||||
" 'eksperimen bag of' 'fatah' 'fatah melakukan'\n",
|
||||
" 'fatah melakukan eksperimen']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# BOW modern: CountVectorizer dengan n-gram dan max_features \n",
|
||||
"from sklearn.feature_extraction.text import CountVectorizer\n",
|
||||
"\n",
|
||||
"# Ubah di sini bila mau nilai lain:\n",
|
||||
"NGRAM_RANGE = (1, 3) # ubah n-gram (contoh: (1,2) atau (1,4))\n",
|
||||
"MAX_FEATURES = 500 # ubah jumlah fitur (contoh: 200, 1000)\n",
|
||||
"\n",
|
||||
"vectorizer = CountVectorizer(ngram_range=NGRAM_RANGE, max_features=MAX_FEATURES)\n",
|
||||
"X = vectorizer.fit_transform(documents)\n",
|
||||
"\n",
|
||||
"print(\"\\n=== Info CountVectorizer ===\")\n",
|
||||
"print(\"n-gram range:\", NGRAM_RANGE)\n",
|
||||
"print(\"max_features:\", MAX_FEATURES)\n",
|
||||
"print(\"Jumlah fitur (vocabulary size):\", len(vectorizer.vocabulary_))\n",
|
||||
"print(\"\\nContoh 20 fitur pertama:\")\n",
|
||||
"print(vectorizer.get_feature_names_out()[:20])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>angka</th>\n",
|
||||
" <th>bag</th>\n",
|
||||
" <th>bag of</th>\n",
|
||||
" <th>bag of words</th>\n",
|
||||
" <th>bagaimana</th>\n",
|
||||
" <th>bagaimana fitur</th>\n",
|
||||
" <th>bagaimana fitur teks</th>\n",
|
||||
" <th>bersama</th>\n",
|
||||
" <th>dan</th>\n",
|
||||
" <th>dan machine</th>\n",
|
||||
" <th>...</th>\n",
|
||||
" <th>untuk menguji</th>\n",
|
||||
" <th>untuk menguji kombinasi</th>\n",
|
||||
" <th>words</th>\n",
|
||||
" <th>words untuk</th>\n",
|
||||
" <th>words untuk melihat</th>\n",
|
||||
" <th>yang</th>\n",
|
||||
" <th>yang sedang</th>\n",
|
||||
" <th>yang sedang mempelajari</th>\n",
|
||||
" <th>yang sering</th>\n",
|
||||
" <th>yang sering muncul</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>3 rows × 110 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" angka bag bag of bag of words bagaimana bagaimana fitur \\\n",
|
||||
"0 0 0 0 0 0 0 \n",
|
||||
"1 1 1 1 1 1 1 \n",
|
||||
"2 0 0 0 0 0 0 \n",
|
||||
"\n",
|
||||
" bagaimana fitur teks bersama dan dan machine ... untuk menguji \\\n",
|
||||
"0 0 0 1 1 ... 0 \n",
|
||||
"1 1 0 0 0 ... 0 \n",
|
||||
"2 0 1 0 0 ... 1 \n",
|
||||
"\n",
|
||||
" untuk menguji kombinasi words words untuk words untuk melihat yang \\\n",
|
||||
"0 0 0 0 0 1 \n",
|
||||
"1 0 1 1 1 0 \n",
|
||||
"2 1 0 0 0 1 \n",
|
||||
"\n",
|
||||
" yang sedang yang sedang mempelajari yang sering yang sering muncul \n",
|
||||
"0 1 1 0 0 \n",
|
||||
"1 0 0 0 0 \n",
|
||||
"2 0 0 1 1 \n",
|
||||
"\n",
|
||||
"[3 rows x 110 columns]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# 6) Konversi hasil ke DataFrame\n",
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"df_features = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())\n",
|
||||
"display(df_features) # di Jupyter ini akan tampil tabel\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {
|
||||
"id": "ShevCTva2Fg9"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Representasi Numerik (Matriks BoW)\n",
|
||||
"bow_matrix = []\n",
|
||||
"for doc in tokenized_docs:\n",
|
||||
" vector = [doc.count(word) for word in vocabulary]\n",
|
||||
" bow_matrix.append(vector)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>angka</th>\n",
|
||||
" <th>bag-of-words</th>\n",
|
||||
" <th>bagaimana</th>\n",
|
||||
" <th>bersama</th>\n",
|
||||
" <th>dan</th>\n",
|
||||
" <th>direpresentasikan</th>\n",
|
||||
" <th>eksperimen</th>\n",
|
||||
" <th>fatah</th>\n",
|
||||
" <th>fitur</th>\n",
|
||||
" <th>informatika</th>\n",
|
||||
" <th>...</th>\n",
|
||||
" <th>rosyad,</th>\n",
|
||||
" <th>sabila</th>\n",
|
||||
" <th>saya</th>\n",
|
||||
" <th>sebagai</th>\n",
|
||||
" <th>sedang</th>\n",
|
||||
" <th>sering</th>\n",
|
||||
" <th>tahap</th>\n",
|
||||
" <th>teks</th>\n",
|
||||
" <th>untuk</th>\n",
|
||||
" <th>yang</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>3 rows × 35 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" angka bag-of-words bagaimana bersama dan direpresentasikan \\\n",
|
||||
"0 0 0 0 0 1 0 \n",
|
||||
"1 1 1 1 0 0 1 \n",
|
||||
"2 0 0 0 1 0 0 \n",
|
||||
"\n",
|
||||
" eksperimen fatah fitur informatika ... rosyad, sabila saya sebagai \\\n",
|
||||
"0 0 1 0 1 ... 1 1 1 0 \n",
|
||||
"1 1 1 1 0 ... 0 0 0 1 \n",
|
||||
"2 0 1 0 0 ... 0 0 0 0 \n",
|
||||
"\n",
|
||||
" sedang sering tahap teks untuk yang \n",
|
||||
"0 1 0 0 0 0 1 \n",
|
||||
"1 0 0 0 1 1 0 \n",
|
||||
"2 0 1 1 0 1 1 \n",
|
||||
"\n",
|
||||
"[3 rows x 35 columns]"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"Shape (dokumen x fitur): (3, 35)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"# Buat matrix manual berdasarkan vocabulary yang sudah kamu buat\n",
|
||||
"bow_matrix = []\n",
|
||||
"for tokens in tokenized_docs:\n",
|
||||
" # hitung frekuensi tiap kata pada vocabulary pada dokumen ini\n",
|
||||
" vector = [tokens.count(word) for word in vocabulary] # tokens adalah list kata\n",
|
||||
" bow_matrix.append(vector)\n",
|
||||
"\n",
|
||||
"# Konversi ke DataFrame agar rapi saat ditampilkan\n",
|
||||
"df_bow_manual = pd.DataFrame(bow_matrix, columns=vocabulary)\n",
|
||||
"display(df_bow_manual) # di Jupyter ini akan tampil tabel\n",
|
||||
"print(\"\\nShape (dokumen x fitur):\", df_bow_manual.shape)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
360
Klasifikasi Teks FNN.ipynb
Normal file
360
Klasifikasi Teks FNN.ipynb
Normal file
@ -0,0 +1,360 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f4a1399a-f23d-4060-a07e-bce5a5c7ddac",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Klasifikasi Teks menggunakan ANN (TF-IDF + FNN)\n",
|
||||
"\n",
|
||||
"**Nama:** Fatah Sabila Rosyad \n",
|
||||
"**NIM:** 202210715288 \n",
|
||||
"**Kelas:** F7B2 \n",
|
||||
"**MK:** NLP \n",
|
||||
"\n",
|
||||
"**Tujuan praktikum:**\n",
|
||||
"Menerapkan klasifikasi teks sentimen sederhana menggunakan TF-IDF dan Feedforward Neural Network (MLPClassifier), dengan:\n",
|
||||
"- Mengubah contoh teks (menggunakan kalimat yang dibuat sendiri)\n",
|
||||
"- Mengubah parameter TF-IDF (`max_features`, `ngram_range`)\n",
|
||||
"- Mengubah arsitektur dan parameter model ANN (`hidden_layer_sizes`, `max_iter`, `learning_rate_init`)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "4c395092-326a-4abc-b308-067392277cfa",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# ---------------------------------------------------------\n",
|
||||
"# Klasifikasi Teks dengan TF-IDF + Feedforward Neural Network\n",
|
||||
"# ---------------------------------------------------------\n",
|
||||
"\n",
|
||||
"import pandas as pd\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||
"from sklearn.neural_network import MLPClassifier\n",
|
||||
"from sklearn.metrics import classification_report, confusion_matrix"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "4ac91b0c-e6af-4766-8933-db10ebf69140",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>text</th>\n",
|
||||
" <th>label</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>Saya Fatah Sabila Rosyad merasa sangat puas de...</td>\n",
|
||||
" <td>positive</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>Sebagai pelanggan, Fatah kecewa karena pelayan...</td>\n",
|
||||
" <td>negative</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>Pengalaman belanja Fatah kali ini menyenangkan...</td>\n",
|
||||
" <td>positive</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>Fatah benci produk ini karena mudah rusak dan ...</td>\n",
|
||||
" <td>negative</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>Menurut Fatah kualitas produk ini sangat bagus...</td>\n",
|
||||
" <td>positive</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>5</th>\n",
|
||||
" <td>Fatah tidak akan membeli lagi di sini karena p...</td>\n",
|
||||
" <td>negative</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" text label\n",
|
||||
"0 Saya Fatah Sabila Rosyad merasa sangat puas de... positive\n",
|
||||
"1 Sebagai pelanggan, Fatah kecewa karena pelayan... negative\n",
|
||||
"2 Pengalaman belanja Fatah kali ini menyenangkan... positive\n",
|
||||
"3 Fatah benci produk ini karena mudah rusak dan ... negative\n",
|
||||
"4 Menurut Fatah kualitas produk ini sangat bagus... positive\n",
|
||||
"5 Fatah tidak akan membeli lagi di sini karena p... negative"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# -----------------------------------------\n",
|
||||
"# 1. Contoh Dataset (teks buatan Fatah)\n",
|
||||
"# -----------------------------------------\n",
|
||||
"\n",
|
||||
"data = {\n",
|
||||
" \"text\": [\n",
|
||||
" \"Saya Fatah Sabila Rosyad merasa sangat puas dengan kualitas produk ini.\",\n",
|
||||
" \"Sebagai pelanggan, Fatah kecewa karena pelayanan toko sangat lambat.\",\n",
|
||||
" \"Pengalaman belanja Fatah kali ini menyenangkan, proses cepat dan barang sesuai.\",\n",
|
||||
" \"Fatah benci produk ini karena mudah rusak dan tidak sesuai deskripsi.\",\n",
|
||||
" \"Menurut Fatah kualitas produk ini sangat bagus dan layak direkomendasikan.\",\n",
|
||||
" \"Fatah tidak akan membeli lagi di sini karena pelayanan buruk dan respon yang lambat.\"\n",
|
||||
" ],\n",
|
||||
" \"label\": [\"positive\", \"negative\", \"positive\", \"negative\", \"positive\", \"negative\"]\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"df = pd.DataFrame(data)\n",
|
||||
"df\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "6dab8e80-c225-4de8-aecc-8b457153c3ee",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Jumlah data latih : 3\n",
|
||||
"Jumlah data uji : 3\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# -----------------------------------------\n",
|
||||
"# 2. Split Train & Test (PERUBAHAN: test_size & random_state)\n",
|
||||
"# -----------------------------------------\n",
|
||||
"X_train, X_test, y_train, y_test = train_test_split(\n",
|
||||
" df[\"text\"],\n",
|
||||
" df[\"label\"],\n",
|
||||
" test_size=0.34, # semula 0.3\n",
|
||||
" random_state=7 # semula 42\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(\"Jumlah data latih :\", len(X_train))\n",
|
||||
"print(\"Jumlah data uji :\", len(X_test))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "2cb05f0c-b497-4e9e-87bc-25d167f0c0ee",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Shape X_train_tfidf: (3, 52)\n",
|
||||
"Shape X_test_tfidf : (3, 52)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# -----------------------------------------\n",
|
||||
"# 3. TF-IDF Vectorization (PERUBAHAN PARAMETER)\n",
|
||||
"# -----------------------------------------\n",
|
||||
"\n",
|
||||
"tfidf = TfidfVectorizer(\n",
|
||||
" max_features=1000, # semula 5000\n",
|
||||
" ngram_range=(1, 2) # tambahan: gunakan unigram + bigram\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"X_train_tfidf = tfidf.fit_transform(X_train)\n",
|
||||
"X_test_tfidf = tfidf.transform(X_test)\n",
|
||||
"\n",
|
||||
"print(\"Shape X_train_tfidf:\", X_train_tfidf.shape)\n",
|
||||
"print(\"Shape X_test_tfidf :\", X_test_tfidf.shape)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "0cb99708-b1bd-43a7-84b9-4e4925bf2914",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Model selesai dilatih.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# -----------------------------------------\n",
|
||||
"# 4. Feedforward ANN (MLPClassifier) (PERUBAHAN PARAMETER)\n",
|
||||
"# -----------------------------------------\n",
|
||||
"\n",
|
||||
"model = MLPClassifier(\n",
|
||||
" hidden_layer_sizes=(128, 32), # semula (256, 64)\n",
|
||||
" activation='relu',\n",
|
||||
" solver='adam',\n",
|
||||
" learning_rate_init=0.001, # tambahan\n",
|
||||
" max_iter=300, # semula 500\n",
|
||||
" random_state=7\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"model.fit(X_train_tfidf, y_train)\n",
|
||||
"print(\"Model selesai dilatih.\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "d388afdf-0f08-48ea-92d1-e03390dee1d9",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"=== Classification Report ===\n",
|
||||
" precision recall f1-score support\n",
|
||||
"\n",
|
||||
" negative 1.00 0.50 0.67 2\n",
|
||||
" positive 0.50 1.00 0.67 1\n",
|
||||
"\n",
|
||||
" accuracy 0.67 3\n",
|
||||
" macro avg 0.75 0.75 0.67 3\n",
|
||||
"weighted avg 0.83 0.67 0.67 3\n",
|
||||
"\n",
|
||||
"=== Confusion Matrix ===\n",
|
||||
"[[1 1]\n",
|
||||
" [0 1]]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# -----------------------------------------\n",
|
||||
"# 5. Evaluasi Model\n",
|
||||
"# -----------------------------------------\n",
|
||||
"\n",
|
||||
"y_pred = model.predict(X_test_tfidf)\n",
|
||||
"\n",
|
||||
"print(\"=== Classification Report ===\")\n",
|
||||
"print(classification_report(y_test, y_pred))\n",
|
||||
"\n",
|
||||
"print(\"=== Confusion Matrix ===\")\n",
|
||||
"print(confusion_matrix(y_test, y_pred))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "64141093-c8fd-4118-aaf3-6e48454c5e76",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Prediksi untuk: Menurut Fatah, pengalaman belanja kali ini sangat memuaskan.\n",
|
||||
"Hasil: positive\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# -----------------------------------------\n",
|
||||
"# 6. Prediksi Teks Baru (contoh 1 - positif)\n",
|
||||
"# -----------------------------------------\n",
|
||||
"sample_text = [\"Menurut Fatah, pengalaman belanja kali ini sangat memuaskan.\"]\n",
|
||||
"sample_vec = tfidf.transform(sample_text)\n",
|
||||
"prediction = model.predict(sample_vec)\n",
|
||||
"\n",
|
||||
"print(\"Prediksi untuk:\", sample_text[0])\n",
|
||||
"print(\"Hasil:\", prediction[0])\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "d4bf8434-fe3b-4a88-a294-207fa731de7d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Prediksi untuk: Saya Fatah merasa kecewa karena layanan toko sangat buruk.\n",
|
||||
"Hasil: negative\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# -----------------------------------------\n",
|
||||
"# 6. Prediksi Teks Baru (contoh 2 - negatif)\n",
|
||||
"# -----------------------------------------\n",
|
||||
"sample_text = [\"Saya Fatah merasa kecewa karena layanan toko sangat buruk.\"]\n",
|
||||
"sample_vec = tfidf.transform(sample_text)\n",
|
||||
"prediction = model.predict(sample_vec)\n",
|
||||
"\n",
|
||||
"print(\"Prediksi untuk:\", sample_text[0])\n",
|
||||
"print(\"Hasil:\", prediction[0])\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "724617be-aa1d-41bd-ad39-e6517fbcf837",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
362
N-Gram.ipynb
Normal file
362
N-Gram.ipynb
Normal file
@ -0,0 +1,362 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Model Unigram, Bigram, dan Trigram\n",
|
||||
"\n",
|
||||
"**Nama:** Fatah Sabila Rosyad \n",
|
||||
"**NIM:** 202210715288 \n",
|
||||
"**Kelas:** F7B2 \n",
|
||||
"\n",
|
||||
"**Tujuan praktikum:** \n",
|
||||
"Memahami cara menghitung frekuensi dan probabilitas kalimat menggunakan model Unigram, Bigram, dan Trigram pada teks bahasa Indonesia.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from collections import Counter\n",
|
||||
"from IPython.display import clear_output\n",
|
||||
"import math"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Corpus: fatah suka olahraga lari dan suka olahraga badminton\n",
|
||||
"Tokens (8): ['fatah', 'suka', 'olahraga', 'lari', 'dan', 'suka', 'olahraga', 'badminton']\n",
|
||||
"\n",
|
||||
"Frekuensi Unigram dalam kalimat:\n",
|
||||
" ('fatah'): 1\n",
|
||||
" ('suka'): 2\n",
|
||||
" ('olahraga'): 2\n",
|
||||
" ('lari'): 1\n",
|
||||
" ('dan'): 1\n",
|
||||
" ('badminton'): 1\n",
|
||||
"\n",
|
||||
"Total unigram dalam 1 kalimat: 8\n",
|
||||
"\n",
|
||||
"Probabilitas masing-masing unigram:\n",
|
||||
" P(fatah) = 0.1250 (12.50%)\n",
|
||||
" P(suka) = 0.2500 (25.00%)\n",
|
||||
" P(olahraga) = 0.2500 (25.00%)\n",
|
||||
" P(lari) = 0.1250 (12.50%)\n",
|
||||
" P(dan) = 0.1250 (12.50%)\n",
|
||||
" P(badminton) = 0.1250 (12.50%)\n",
|
||||
"\n",
|
||||
"Probabilitas Keseluruhan Kalimat (Model Unigram):\n",
|
||||
" P(fatah suka olahraga lari dan suka olahraga badminton) = P(fatah)=0.1250 x P(suka)=0.2500 x P(olahraga)=0.2500 x P(lari)=0.1250 x P(dan)=0.1250 x P(suka)=0.2500 x P(olahraga)=0.2500 x P(badminton)=0.1250 = 0.00000095 (0.000095%)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# ================= UNIGRAM =================\n",
|
||||
"\n",
|
||||
"# 1. Input Kalimat dan Tokenisasi\n",
|
||||
"kalimat = input(\"Masukkan kalimat: \").strip()\n",
|
||||
"\n",
|
||||
"# Bersihkan output (khusus lingkungan notebook)\n",
|
||||
"try:\n",
|
||||
" clear_output()\n",
|
||||
"except:\n",
|
||||
" pass\n",
|
||||
"\n",
|
||||
"print(f\"Corpus: {kalimat}\")\n",
|
||||
"\n",
|
||||
"# Tokenize\n",
|
||||
"tokens = kalimat.lower().split()\n",
|
||||
"print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
|
||||
"\n",
|
||||
"# 2. Hitung Frekuensi Unigram\n",
|
||||
"unigram_counts = Counter(tokens)\n",
|
||||
"total_tokens = sum(unigram_counts.values())\n",
|
||||
"\n",
|
||||
"print(\"\\nFrekuensi Unigram dalam kalimat:\")\n",
|
||||
"for pair, count in unigram_counts.items():\n",
|
||||
" print(f\" ('{pair}'): {count}\")\n",
|
||||
"print(f\"\\nTotal unigram dalam 1 kalimat: {total_tokens}\")\n",
|
||||
"\n",
|
||||
"# 3. Hitung Probabilitas Unigram: P(wi) = Count(wi) / Total Kata\n",
|
||||
"unigram_probabilities = {}\n",
|
||||
"for word, count in unigram_counts.items():\n",
|
||||
" prob = count / total_tokens\n",
|
||||
" unigram_probabilities[word] = prob\n",
|
||||
"\n",
|
||||
"print(\"\\nProbabilitas masing-masing unigram:\")\n",
|
||||
"for word, prob in unigram_probabilities.items():\n",
|
||||
" print(f\" P({word}) = {prob:.4f} ({prob*100:.2f}%)\")\n",
|
||||
"\n",
|
||||
"# 4. Probabilitas Kalimat Keseluruhan (P(w1) * P(w2) * ...)\n",
|
||||
"p_kalimat = 1\n",
|
||||
"prob_parts = []\n",
|
||||
"\n",
|
||||
"for word in tokens:\n",
|
||||
" prob_value = unigram_probabilities[word]\n",
|
||||
" p_kalimat *= prob_value\n",
|
||||
" prob_parts.append(f\"P({word})={prob_value:.4f}\")\n",
|
||||
"\n",
|
||||
"prob_str = \" x \".join(prob_parts)\n",
|
||||
"\n",
|
||||
"print(\"\\nProbabilitas Keseluruhan Kalimat (Model Unigram):\")\n",
|
||||
"print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.8f} ({p_kalimat*100:.6f}%)\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Corpus: Fatah sedang belajar model bigram untuk menghitung probabilitas kalimat dan fatah sangat suka belajar\n",
|
||||
"Tokens (14): ['fatah', 'sedang', 'belajar', 'model', 'bigram', 'untuk', 'menghitung', 'probabilitas', 'kalimat', 'dan', 'fatah', 'sangat', 'suka', 'belajar']\n",
|
||||
"\n",
|
||||
"Frekuensi Bigram dalam kalimat:\n",
|
||||
" ('fatah', 'sedang'): 1\n",
|
||||
" ('sedang', 'belajar'): 1\n",
|
||||
" ('belajar', 'model'): 1\n",
|
||||
" ('model', 'bigram'): 1\n",
|
||||
" ('bigram', 'untuk'): 1\n",
|
||||
" ('untuk', 'menghitung'): 1\n",
|
||||
" ('menghitung', 'probabilitas'): 1\n",
|
||||
" ('probabilitas', 'kalimat'): 1\n",
|
||||
" ('kalimat', 'dan'): 1\n",
|
||||
" ('dan', 'fatah'): 1\n",
|
||||
" ('fatah', 'sangat'): 1\n",
|
||||
" ('sangat', 'suka'): 1\n",
|
||||
" ('suka', 'belajar'): 1\n",
|
||||
"\n",
|
||||
"Total bigram dalam 1 kalimat: 13\n",
|
||||
"\n",
|
||||
"Probabilitas masing-masing bigram:\n",
|
||||
" P(sedang|fatah) = 0.5000 (50.00%)\n",
|
||||
" P(belajar|sedang) = 1.0000 (100.00%)\n",
|
||||
" P(model|belajar) = 0.5000 (50.00%)\n",
|
||||
" P(bigram|model) = 1.0000 (100.00%)\n",
|
||||
" P(untuk|bigram) = 1.0000 (100.00%)\n",
|
||||
" P(menghitung|untuk) = 1.0000 (100.00%)\n",
|
||||
" P(probabilitas|menghitung) = 1.0000 (100.00%)\n",
|
||||
" P(kalimat|probabilitas) = 1.0000 (100.00%)\n",
|
||||
" P(dan|kalimat) = 1.0000 (100.00%)\n",
|
||||
" P(fatah|dan) = 1.0000 (100.00%)\n",
|
||||
" P(sangat|fatah) = 0.5000 (50.00%)\n",
|
||||
" P(suka|sangat) = 1.0000 (100.00%)\n",
|
||||
" P(belajar|suka) = 1.0000 (100.00%)\n",
|
||||
"\n",
|
||||
"Probabilitas Keseluruhan Kalimat (Model Bigram):\n",
|
||||
" P(fatah sedang belajar model bigram untuk menghitung probabilitas kalimat dan fatah sangat suka belajar) = P(fatah)=0.1429 x P(sedang|fatah)=0.5000 x P(belajar|sedang)=1.0000 x P(model|belajar)=0.5000 x P(bigram|model)=1.0000 x P(untuk|bigram)=1.0000 x P(menghitung|untuk)=1.0000 x P(probabilitas|menghitung)=1.0000 x P(kalimat|probabilitas)=1.0000 x P(dan|kalimat)=1.0000 x P(fatah|dan)=1.0000 x P(sangat|fatah)=0.5000 x P(suka|sangat)=1.0000 x P(belajar|suka)=1.0000 = 0.01785714 (1.785714%)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# ================= BIGRAM =================\n",
|
||||
"\n",
|
||||
"kalimat = input(\"Masukkan kalimat: \").strip()\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" clear_output()\n",
|
||||
"except:\n",
|
||||
" pass\n",
|
||||
"\n",
|
||||
"print(f\"Corpus: {kalimat}\")\n",
|
||||
"\n",
|
||||
"# Tokenisasi\n",
|
||||
"tokens = kalimat.lower().split()\n",
|
||||
"print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
|
||||
"\n",
|
||||
"# 2. Frekuensi Unigram dan Bigram\n",
|
||||
"unigram_counts = Counter(tokens)\n",
|
||||
"bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]\n",
|
||||
"bigram_counts = Counter(bigrams)\n",
|
||||
"\n",
|
||||
"print(\"\\nFrekuensi Bigram dalam kalimat:\")\n",
|
||||
"for pair, count in bigram_counts.items():\n",
|
||||
" print(f\" {pair}: {count}\")\n",
|
||||
"print(f\"\\nTotal bigram dalam 1 kalimat: {sum(bigram_counts.values())}\")\n",
|
||||
"\n",
|
||||
"# 3. Probabilitas Bigram: P(w2 | w1) = Count(w1,w2) / Count(w1)\n",
|
||||
"bigram_probabilities = {}\n",
|
||||
"for (w1, w2), count in bigram_counts.items():\n",
|
||||
" prob = count / unigram_counts[w1]\n",
|
||||
" bigram_probabilities[(w1, w2)] = prob\n",
|
||||
"\n",
|
||||
"print(\"\\nProbabilitas masing-masing bigram:\")\n",
|
||||
"for (w1, w2), prob in bigram_probabilities.items():\n",
|
||||
" print(f\" P({w2}|{w1}) = {prob:.4f} ({prob*100:.2f}%)\")\n",
|
||||
"\n",
|
||||
"# 4. Probabilitas Kalimat (Model Bigram)\n",
|
||||
"total_tokens = sum(unigram_counts.values())\n",
|
||||
"p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens\n",
|
||||
"p_kalimat = p_w1\n",
|
||||
"\n",
|
||||
"prob_str_parts = [f\"P({tokens[0]})={p_w1:.4f}\"]\n",
|
||||
"\n",
|
||||
"for i in range(1, len(tokens)):\n",
|
||||
" pair = (tokens[i-1], tokens[i])\n",
|
||||
" p = bigram_probabilities.get(pair, 0)\n",
|
||||
" p_kalimat *= p\n",
|
||||
" prob_str_parts.append(f\"P({pair[1]}|{pair[0]})={p:.4f}\")\n",
|
||||
"\n",
|
||||
"prob_str = \" x \".join(prob_str_parts)\n",
|
||||
"\n",
|
||||
"print(\"\\nProbabilitas Keseluruhan Kalimat (Model Bigram):\")\n",
|
||||
"print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.8f} ({p_kalimat*100:.6f}%)\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Corpus: Pada praktikum ini Fatah sedang mempelajari model trigram untuk kalimat bahasa Indonesia\n",
|
||||
"Tokens (12): ['pada', 'praktikum', 'ini', 'fatah', 'sedang', 'mempelajari', 'model', 'trigram', 'untuk', 'kalimat', 'bahasa', 'indonesia']\n",
|
||||
"\n",
|
||||
"Frekuensi Trigram dalam kalimat:\n",
|
||||
" ('pada', 'praktikum', 'ini'): 1\n",
|
||||
" ('praktikum', 'ini', 'fatah'): 1\n",
|
||||
" ('ini', 'fatah', 'sedang'): 1\n",
|
||||
" ('fatah', 'sedang', 'mempelajari'): 1\n",
|
||||
" ('sedang', 'mempelajari', 'model'): 1\n",
|
||||
" ('mempelajari', 'model', 'trigram'): 1\n",
|
||||
" ('model', 'trigram', 'untuk'): 1\n",
|
||||
" ('trigram', 'untuk', 'kalimat'): 1\n",
|
||||
" ('untuk', 'kalimat', 'bahasa'): 1\n",
|
||||
" ('kalimat', 'bahasa', 'indonesia'): 1\n",
|
||||
"\n",
|
||||
"Total trigram dalam 1 kalimat: 10\n",
|
||||
"\n",
|
||||
"Probabilitas masing-masing trigram:\n",
|
||||
" P(ini|pada,praktikum) = 1.0000 (100.00%)\n",
|
||||
" P(fatah|praktikum,ini) = 1.0000 (100.00%)\n",
|
||||
" P(sedang|ini,fatah) = 1.0000 (100.00%)\n",
|
||||
" P(mempelajari|fatah,sedang) = 1.0000 (100.00%)\n",
|
||||
" P(model|sedang,mempelajari) = 1.0000 (100.00%)\n",
|
||||
" P(trigram|mempelajari,model) = 1.0000 (100.00%)\n",
|
||||
" P(untuk|model,trigram) = 1.0000 (100.00%)\n",
|
||||
" P(kalimat|trigram,untuk) = 1.0000 (100.00%)\n",
|
||||
" P(bahasa|untuk,kalimat) = 1.0000 (100.00%)\n",
|
||||
" P(indonesia|kalimat,bahasa) = 1.0000 (100.00%)\n",
|
||||
"\n",
|
||||
"Probabilitas Keseluruhan Kalimat (Model Trigram):\n",
|
||||
" P(pada praktikum ini fatah sedang mempelajari model trigram untuk kalimat bahasa indonesia) = P(pada)=0.0833 x P(praktikum|pada)=1.0000 x P(ini|pada,praktikum)=1.0000 x P(fatah|praktikum,ini)=1.0000 x P(sedang|ini,fatah)=1.0000 x P(mempelajari|fatah,sedang)=1.0000 x P(model|sedang,mempelajari)=1.0000 x P(trigram|mempelajari,model)=1.0000 x P(untuk|model,trigram)=1.0000 x P(kalimat|trigram,untuk)=1.0000 x P(bahasa|untuk,kalimat)=1.0000 x P(indonesia|kalimat,bahasa)=1.0000 = 0.08333333 (8.333333%)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# ================= TRIGRAM =================\n",
|
||||
"\n",
|
||||
"kalimat = input(\"Masukkan kalimat: \").strip()\n",
|
||||
"\n",
|
||||
"try:\n",
|
||||
" clear_output()\n",
|
||||
"except:\n",
|
||||
" pass\n",
|
||||
"\n",
|
||||
"print(f\"Corpus: {kalimat}\")\n",
|
||||
"\n",
|
||||
"# Tokenisasi\n",
|
||||
"tokens = kalimat.lower().split()\n",
|
||||
"print(f\"Tokens ({len(tokens)}): {tokens}\")\n",
|
||||
"\n",
|
||||
"# 2. Frekuensi Bigram dan Trigram\n",
|
||||
"bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens) - 1)]\n",
|
||||
"trigrams = [(tokens[i], tokens[i+1], tokens[i+2]) for i in range(len(tokens) - 2)]\n",
|
||||
"\n",
|
||||
"bigram_counts = Counter(bigrams)\n",
|
||||
"trigram_counts = Counter(trigrams)\n",
|
||||
"\n",
|
||||
"print(\"\\nFrekuensi Trigram dalam kalimat:\")\n",
|
||||
"for tg, count in trigram_counts.items():\n",
|
||||
" print(f\" {tg}: {count}\")\n",
|
||||
"print(f\"\\nTotal trigram dalam 1 kalimat: {sum(trigram_counts.values())}\")\n",
|
||||
"\n",
|
||||
"# 3. Probabilitas Trigram: P(w3 | w1, w2)\n",
|
||||
"trigram_probabilities = {}\n",
|
||||
"for (w1, w2, w3), count in trigram_counts.items():\n",
|
||||
" if bigram_counts[(w1, w2)] > 0:\n",
|
||||
" prob = count / bigram_counts[(w1, w2)]\n",
|
||||
" else:\n",
|
||||
" prob = 0\n",
|
||||
" trigram_probabilities[(w1, w2, w3)] = prob\n",
|
||||
"\n",
|
||||
"print(\"\\nProbabilitas masing-masing trigram:\")\n",
|
||||
"for (w1, w2, w3), prob in trigram_probabilities.items():\n",
|
||||
" print(f\" P({w3}|{w1},{w2}) = {prob:.4f} ({prob*100:.2f}%)\")\n",
|
||||
"\n",
|
||||
"# 4. Probabilitas Kalimat (Model Trigram)\n",
|
||||
"\n",
|
||||
"unigram_counts = Counter(tokens)\n",
|
||||
"total_tokens = sum(unigram_counts.values())\n",
|
||||
"\n",
|
||||
"# P(w1)\n",
|
||||
"p_w1 = unigram_counts.get(tokens[0], 0) / total_tokens if total_tokens > 0 else 0\n",
|
||||
"\n",
|
||||
"# P(w2|w1)\n",
|
||||
"if len(tokens) > 1:\n",
|
||||
" count_w1 = unigram_counts.get(tokens[0], 1)\n",
|
||||
" p_w2_w1 = bigram_counts.get((tokens[0], tokens[1]), 0) / count_w1\n",
|
||||
"else:\n",
|
||||
" p_w2_w1 = 1.0\n",
|
||||
"\n",
|
||||
"p_kalimat = p_w1 * p_w2_w1\n",
|
||||
"\n",
|
||||
"prob_str_parts = [f\"P({tokens[0]})={p_w1:.4f}\"]\n",
|
||||
"if len(tokens) > 1:\n",
|
||||
" prob_str_parts.append(f\"P({tokens[1]}|{tokens[0]})={p_w2_w1:.4f}\")\n",
|
||||
"\n",
|
||||
"# Perkalian trigram untuk i >= 3\n",
|
||||
"for i in range(len(tokens) - 2):\n",
|
||||
" triplet = (tokens[i], tokens[i+1], tokens[i+2])\n",
|
||||
" p = trigram_probabilities.get(triplet, 0)\n",
|
||||
" p_kalimat *= p\n",
|
||||
" prob_str_parts.append(f\"P({triplet[2]}|{triplet[0]},{triplet[1]})={p:.4f}\")\n",
|
||||
"\n",
|
||||
"prob_str = \" x \".join(prob_str_parts)\n",
|
||||
"\n",
|
||||
"print(\"\\nProbabilitas Keseluruhan Kalimat (Model Trigram):\")\n",
|
||||
"print(f\" P({' '.join(tokens)}) = {prob_str} = {p_kalimat:.8f} ({p_kalimat*100:.6f}%)\")\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.5"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user