Tugas praktikum Sudah saya edit sesuai instruksi dosen. Signed-off-by: 202210715288 FATAH SABILA ROSYAD <202210715288@mhs.ubharajaya.ac.id>
743 lines
23 KiB
Plaintext
743 lines
23 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
"# Ekstraksi Fitur Bag-of-Words (BOW)\n",
|
||
"\n",
|
||
"**Nama:** Fatah Sabila Rosyad \n",
|
||
"**NIM:** **202210715288** \n",
|
||
"**Kelas:** F7B2 \n",
|
||
"**MK:** NLP \n",
|
||
"\n",
|
||
"**Tujuan praktikum:** \n",
|
||
"Melakukan ekstraksi fitur teks menggunakan Bag-of-Words dengan variasi parameter, yaitu: \n",
|
||
"- Mengubah contoh teks \n",
|
||
"- Mengubah jumlah fitur (`max_features`) \n",
|
||
"- Menggunakan rentang n-gram baru (`ngram_range = (1,3)`) \n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdin",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Masukkan jumlah dokumen yang ingin dimasukkan: 3\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
"# Read the number of documents; re-prompt until a positive integer is entered.\n",
"# A non-numeric answer makes int() raise ValueError, and n <= 0 would leave\n",
"# `documents` empty for every later cell.\n",
"while True:\n",
"    try:\n",
"        n = int(input(\"Masukkan jumlah dokumen yang ingin dimasukkan: \"))\n",
"    except ValueError:\n",
"        print(\"Input harus berupa bilangan bulat, silakan coba lagi.\")\n",
"        continue\n",
"    if n >= 1:\n",
"        break\n",
"    print(\"Jumlah dokumen minimal 1, silakan coba lagi.\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "mo-yt5Ob1N8j",
|
||
"outputId": "362ac3e0-d84b-4014-db96-cc3b10ecdb32"
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdin",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Masukkan teks untuk dokumen ke-1: Saya Fatah Sabila Rosyad, mahasiswa informatika yang sedang mempelajari NLP dan machine learning\n",
|
||
"Masukkan teks untuk dokumen ke-2: Fatah melakukan eksperimen Bag-of-Words untuk melihat bagaimana fitur teks direpresentasikan sebagai angka\n",
|
||
"Masukkan teks untuk dokumen ke-3: Pada tahap ini Fatah menggunakan n-gram untuk menguji kombinasi kata yang sering muncul bersama\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\n",
|
||
"=== Dokumen yang Dimasukkan ===\n",
|
||
"Doc 1: Saya Fatah Sabila Rosyad, mahasiswa informatika yang sedang mempelajari NLP dan machine learning\n",
|
||
"Doc 2: Fatah melakukan eksperimen Bag-of-Words untuk melihat bagaimana fitur teks direpresentasikan sebagai angka\n",
|
||
"Doc 3: Pada tahap ini Fatah menggunakan n-gram untuk menguji kombinasi kata yang sering muncul bersama\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
"# Prompt for the text of each document in order (prompts are numbered from 1)\n",
"documents = [\n",
"    input(f\"Masukkan teks untuk dokumen ke-{i+1}: \")\n",
"    for i in range(n)\n",
"]\n",
"\n",
"# Echo the collected documents back, one numbered line per document\n",
"print(\"\\n=== Dokumen yang Dimasukkan ===\")\n",
"numbered_docs = [f\"Doc {i+1}: {doc}\" for i, doc in enumerate(documents)]\n",
"for line in numbered_docs:\n",
"    print(line)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "FkmxRAFq1oDK",
|
||
"outputId": "62c4508e-1725-4f30-fbdb-4de8072498b2"
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\n",
|
||
"=== Hasil Tokenisasi ===\n",
|
||
"Doc 1: ['saya', 'fatah', 'sabila', 'rosyad,', 'mahasiswa', 'informatika', 'yang', 'sedang', 'mempelajari', 'nlp', 'dan', 'machine', 'learning']\n",
|
||
"Doc 2: ['fatah', 'melakukan', 'eksperimen', 'bag-of-words', 'untuk', 'melihat', 'bagaimana', 'fitur', 'teks', 'direpresentasikan', 'sebagai', 'angka']\n",
|
||
"Doc 3: ['pada', 'tahap', 'ini', 'fatah', 'menggunakan', 'n-gram', 'untuk', 'menguji', 'kombinasi', 'kata', 'yang', 'sering', 'muncul', 'bersama']\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
"# Tokenization: lowercase each document, split on whitespace, then trim\n",
"# punctuation attached to the ends of a word (\"rosyad,\" -> \"rosyad\").\n",
"# Inner hyphens are kept, so \"bag-of-words\" and \"n-gram\" stay single tokens.\n",
"import string\n",
"\n",
"tokenized_docs = []\n",
"for doc in documents:\n",
"    trimmed = (word.strip(string.punctuation) for word in doc.lower().split())\n",
"    # Skip tokens that consisted only of punctuation and are now empty\n",
"    tokenized_docs.append([token for token in trimmed if token])\n",
"\n",
"print(\"\\n=== Hasil Tokenisasi ===\")\n",
"for i, tokens in enumerate(tokenized_docs):\n",
"    print(f\"Doc {i+1}: {tokens}\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "ybC1Vo2C_c3q",
|
||
"outputId": "fa31c57e-5364-4ded-fcd0-54d0db46c34b"
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\n",
|
||
"=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\n",
|
||
"['saya', 'fatah', 'sabila', 'rosyad,', 'mahasiswa', 'informatika', 'yang', 'sedang', 'mempelajari', 'nlp', 'dan', 'machine', 'learning', 'fatah', 'melakukan', 'eksperimen', 'bag-of-words', 'untuk', 'melihat', 'bagaimana', 'fitur', 'teks', 'direpresentasikan', 'sebagai', 'angka', 'pada', 'tahap', 'ini', 'fatah', 'menggunakan', 'n-gram', 'untuk', 'menguji', 'kombinasi', 'kata', 'yang', 'sering', 'muncul', 'bersama']\n",
|
||
"Jumlah total kata dalam seluruh dokumen: 39\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
"# Build the corpus: every token of every document, flattened in document order\n",
"corpus_all = []\n",
"for doc_tokens in tokenized_docs:\n",
"    corpus_all.extend(doc_tokens)\n",
"\n",
"# Show the flattened corpus followed by its total token count\n",
"print(\"\\n=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\")\n",
"print(corpus_all)\n",
"print(f\"Jumlah total kata dalam seluruh dokumen: {len(corpus_all)}\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\n",
|
||
"=== Hasil BOW Manual (Frekuensi Kata) ===\n"
|
||
]
|
||
},
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"{'fatah': 3,\n",
|
||
" 'yang': 2,\n",
|
||
" 'untuk': 2,\n",
|
||
" 'saya': 1,\n",
|
||
" 'sabila': 1,\n",
|
||
" 'rosyad,': 1,\n",
|
||
" 'mahasiswa': 1,\n",
|
||
" 'informatika': 1,\n",
|
||
" 'sedang': 1,\n",
|
||
" 'mempelajari': 1,\n",
|
||
" 'nlp': 1,\n",
|
||
" 'dan': 1,\n",
|
||
" 'machine': 1,\n",
|
||
" 'learning': 1,\n",
|
||
" 'melakukan': 1,\n",
|
||
" 'eksperimen': 1,\n",
|
||
" 'bag-of-words': 1,\n",
|
||
" 'melihat': 1,\n",
|
||
" 'bagaimana': 1,\n",
|
||
" 'fitur': 1,\n",
|
||
" 'teks': 1,\n",
|
||
" 'direpresentasikan': 1,\n",
|
||
" 'sebagai': 1,\n",
|
||
" 'angka': 1,\n",
|
||
" 'pada': 1,\n",
|
||
" 'tahap': 1,\n",
|
||
" 'ini': 1,\n",
|
||
" 'menggunakan': 1,\n",
|
||
" 'n-gram': 1,\n",
|
||
" 'menguji': 1,\n",
|
||
" 'kombinasi': 1,\n",
|
||
" 'kata': 1,\n",
|
||
" 'sering': 1,\n",
|
||
" 'muncul': 1,\n",
|
||
" 'bersama': 1}"
|
||
]
|
||
},
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
"# Manual Bag-of-Words: count how often each token occurs across all documents\n",
"bow_manual = {}\n",
"for tokens in tokenized_docs:\n",
"    for token in tokens:\n",
"        if token in bow_manual:\n",
"            bow_manual[token] += 1\n",
"        else:\n",
"            bow_manual[token] = 1\n",
"\n",
"print(\"\\n=== Hasil BOW Manual (Frekuensi Kata) ===\")\n",
"# Order by frequency, highest first. The sort is stable, so tokens with equal\n",
"# counts keep their first-seen order. The bare last expression displays the dict.\n",
"ranked_items = sorted(bow_manual.items(), key=lambda pair: -pair[1])\n",
"bow_sorted = dict(ranked_items)\n",
"bow_sorted"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "s6S-Ma4R1xuq",
|
||
"outputId": "98c3685b-1798-4038-d17e-6e45ca419b51"
|
||
},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\n",
|
||
"=== Vocabulary (Kata Unik) ===\n",
|
||
"['angka', 'bag-of-words', 'bagaimana', 'bersama', 'dan', 'direpresentasikan', 'eksperimen', 'fatah', 'fitur', 'informatika', 'ini', 'kata', 'kombinasi', 'learning', 'machine', 'mahasiswa', 'melakukan', 'melihat', 'mempelajari', 'menggunakan', 'menguji', 'muncul', 'n-gram', 'nlp', 'pada', 'rosyad,', 'sabila', 'saya', 'sebagai', 'sedang', 'sering', 'tahap', 'teks', 'untuk', 'yang']\n",
|
||
"Jumlah kata unik (vocabulary size): 35\n",
|
||
"\n",
|
||
"=== Vocabulary (Kata Unik) ===\n",
|
||
" 1. angka\n",
|
||
" 2. bag-of-words\n",
|
||
" 3. bagaimana\n",
|
||
" 4. bersama\n",
|
||
" 5. dan\n",
|
||
" 6. direpresentasikan\n",
|
||
" 7. eksperimen\n",
|
||
" 8. fatah\n",
|
||
" 9. fitur\n",
|
||
"10. informatika\n",
|
||
"11. ini\n",
|
||
"12. kata\n",
|
||
"13. kombinasi\n",
|
||
"14. learning\n",
|
||
"15. machine\n",
|
||
"16. mahasiswa\n",
|
||
"17. melakukan\n",
|
||
"18. melihat\n",
|
||
"19. mempelajari\n",
|
||
"20. menggunakan\n",
|
||
"21. menguji\n",
|
||
"22. muncul\n",
|
||
"23. n-gram\n",
|
||
"24. nlp\n",
|
||
"25. pada\n",
|
||
"26. rosyad,\n",
|
||
"27. sabila\n",
|
||
"28. saya\n",
|
||
"29. sebagai\n",
|
||
"30. sedang\n",
|
||
"31. sering\n",
|
||
"32. tahap\n",
|
||
"33. teks\n",
|
||
"34. untuk\n",
|
||
"35. yang\n",
|
||
"\n",
|
||
"Jumlah kata unik (vocabulary size): 35\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
"# Build the vocabulary: the unique tokens of the corpus in sorted order\n",
"vocabulary = sorted(set(corpus_all))\n",
"\n",
"# Right-align the running number to the width of the largest index so the\n",
"# numbered list stays aligned for any vocabulary size.\n",
"index_width = len(str(len(vocabulary)))\n",
"\n",
"print(\"\\n=== Vocabulary (Kata Unik) ===\")\n",
"for idx, word in enumerate(vocabulary, start=1):\n",
"    print(f\"{idx:>{index_width}}. {word}\")\n",
"print(f\"\\nJumlah kata unik (vocabulary size): {len(vocabulary)}\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Jumlah dokumen: 3\n",
|
||
"Jumlah tokenized_docs: 3\n",
|
||
"Jumlah kata di vocabulary (unique): 35\n",
|
||
"Contoh 10 kata pertama vocabulary: ['angka', 'bag-of-words', 'bagaimana', 'bersama', 'dan', 'direpresentasikan', 'eksperimen', 'fatah', 'fitur', 'informatika']\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
"# Quick sanity summary of the objects built so far: (label, value) pairs,\n",
"# printed one per line in this order\n",
"summary_rows = [\n",
"    (\"Jumlah dokumen:\", len(documents)),\n",
"    (\"Jumlah tokenized_docs:\", len(tokenized_docs)),\n",
"    (\"Jumlah kata di vocabulary (unique):\", len(vocabulary)),\n",
"    (\"Contoh 10 kata pertama vocabulary:\", vocabulary[:10]),\n",
"]\n",
"for label, value in summary_rows:\n",
"    print(label, value)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\n",
|
||
"=== Info CountVectorizer ===\n",
|
||
"n-gram range: (1, 3)\n",
|
||
"max_features: 500\n",
|
||
"Jumlah fitur (vocabulary size): 110\n",
|
||
"\n",
|
||
"Contoh 20 fitur pertama:\n",
|
||
"['angka' 'bag' 'bag of' 'bag of words' 'bagaimana' 'bagaimana fitur'\n",
|
||
" 'bagaimana fitur teks' 'bersama' 'dan' 'dan machine'\n",
|
||
" 'dan machine learning' 'direpresentasikan' 'direpresentasikan sebagai'\n",
|
||
" 'direpresentasikan sebagai angka' 'eksperimen' 'eksperimen bag'\n",
|
||
" 'eksperimen bag of' 'fatah' 'fatah melakukan'\n",
|
||
" 'fatah melakukan eksperimen']\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
"# Bag-of-Words with scikit-learn: CountVectorizer with n-grams and a feature cap\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"\n",
"# Tunable settings\n",
"NGRAM_RANGE = (1, 3)  # n-gram sizes to extract, e.g. (1, 2) or (1, 4)\n",
"MAX_FEATURES = 500  # cap on the number of features, e.g. 200 or 1000\n",
"\n",
"vectorizer = CountVectorizer(max_features=MAX_FEATURES, ngram_range=NGRAM_RANGE)\n",
"X = vectorizer.fit_transform(documents)\n",
"feature_names = vectorizer.get_feature_names_out()\n",
"\n",
"# Report the settings used and a sample of the learned features\n",
"print(\"\\n=== Info CountVectorizer ===\")\n",
"print(\"n-gram range:\", NGRAM_RANGE)\n",
"print(\"max_features:\", MAX_FEATURES)\n",
"print(\"Jumlah fitur (vocabulary size):\", len(vectorizer.vocabulary_))\n",
"print(\"\\nContoh 20 fitur pertama:\")\n",
"print(feature_names[:20])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>angka</th>\n",
|
||
" <th>bag</th>\n",
|
||
" <th>bag of</th>\n",
|
||
" <th>bag of words</th>\n",
|
||
" <th>bagaimana</th>\n",
|
||
" <th>bagaimana fitur</th>\n",
|
||
" <th>bagaimana fitur teks</th>\n",
|
||
" <th>bersama</th>\n",
|
||
" <th>dan</th>\n",
|
||
" <th>dan machine</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>untuk menguji</th>\n",
|
||
" <th>untuk menguji kombinasi</th>\n",
|
||
" <th>words</th>\n",
|
||
" <th>words untuk</th>\n",
|
||
" <th>words untuk melihat</th>\n",
|
||
" <th>yang</th>\n",
|
||
" <th>yang sedang</th>\n",
|
||
" <th>yang sedang mempelajari</th>\n",
|
||
" <th>yang sering</th>\n",
|
||
" <th>yang sering muncul</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>3 rows × 110 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" angka bag bag of bag of words bagaimana bagaimana fitur \\\n",
|
||
"0 0 0 0 0 0 0 \n",
|
||
"1 1 1 1 1 1 1 \n",
|
||
"2 0 0 0 0 0 0 \n",
|
||
"\n",
|
||
" bagaimana fitur teks bersama dan dan machine ... untuk menguji \\\n",
|
||
"0 0 0 1 1 ... 0 \n",
|
||
"1 1 0 0 0 ... 0 \n",
|
||
"2 0 1 0 0 ... 1 \n",
|
||
"\n",
|
||
" untuk menguji kombinasi words words untuk words untuk melihat yang \\\n",
|
||
"0 0 0 0 0 1 \n",
|
||
"1 0 1 1 1 0 \n",
|
||
"2 1 0 0 0 1 \n",
|
||
"\n",
|
||
" yang sedang yang sedang mempelajari yang sering yang sering muncul \n",
|
||
"0 1 1 0 0 \n",
|
||
"1 0 0 0 0 \n",
|
||
"2 0 0 1 1 \n",
|
||
"\n",
|
||
"[3 rows x 110 columns]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
"# Turn the CountVectorizer count matrix into a DataFrame: one row per\n",
"# document, one column per feature name\n",
"import pandas as pd\n",
"\n",
"feature_columns = vectorizer.get_feature_names_out()\n",
"dense_counts = X.toarray()\n",
"df_features = pd.DataFrame(dense_counts, columns=feature_columns)\n",
"display(df_features)  # renders as a table in Jupyter\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"metadata": {
|
||
"id": "ShevCTva2Fg9"
|
||
},
|
||
"outputs": [],
|
||
"source": [
"# Numeric representation (BoW matrix): one row per document, one column per\n",
"# vocabulary word, holding that word's count in the document.\n",
"from collections import Counter\n",
"\n",
"bow_matrix = []\n",
"for doc in tokenized_docs:\n",
"    # Count the document's tokens once instead of rescanning the token list\n",
"    # for every vocabulary word; a Counter returns 0 for absent words.\n",
"    counts = Counter(doc)\n",
"    bow_matrix.append([counts[word] for word in vocabulary])"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>angka</th>\n",
|
||
" <th>bag-of-words</th>\n",
|
||
" <th>bagaimana</th>\n",
|
||
" <th>bersama</th>\n",
|
||
" <th>dan</th>\n",
|
||
" <th>direpresentasikan</th>\n",
|
||
" <th>eksperimen</th>\n",
|
||
" <th>fatah</th>\n",
|
||
" <th>fitur</th>\n",
|
||
" <th>informatika</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>rosyad,</th>\n",
|
||
" <th>sabila</th>\n",
|
||
" <th>saya</th>\n",
|
||
" <th>sebagai</th>\n",
|
||
" <th>sedang</th>\n",
|
||
" <th>sering</th>\n",
|
||
" <th>tahap</th>\n",
|
||
" <th>teks</th>\n",
|
||
" <th>untuk</th>\n",
|
||
" <th>yang</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>3 rows × 35 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" angka bag-of-words bagaimana bersama dan direpresentasikan \\\n",
|
||
"0 0 0 0 0 1 0 \n",
|
||
"1 1 1 1 0 0 1 \n",
|
||
"2 0 0 0 1 0 0 \n",
|
||
"\n",
|
||
" eksperimen fatah fitur informatika ... rosyad, sabila saya sebagai \\\n",
|
||
"0 0 1 0 1 ... 1 1 1 0 \n",
|
||
"1 1 1 1 0 ... 0 0 0 1 \n",
|
||
"2 0 1 0 0 ... 0 0 0 0 \n",
|
||
"\n",
|
||
" sedang sering tahap teks untuk yang \n",
|
||
"0 1 0 0 0 0 1 \n",
|
||
"1 0 0 0 1 1 0 \n",
|
||
"2 0 1 1 0 1 1 \n",
|
||
"\n",
|
||
"[3 rows x 35 columns]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"\n",
|
||
"Shape (dokumen x fitur): (3, 35)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
"import pandas as pd\n",
"\n",
"# Tabulate the manual BoW matrix (`bow_matrix`, built in the preceding cell)\n",
"# instead of rebuilding it here; columns are labelled with the vocabulary words.\n",
"df_bow_manual = pd.DataFrame(bow_matrix, columns=vocabulary)\n",
"display(df_bow_manual)  # renders as a table in Jupyter\n",
"print(\"\\nShape (dokumen x fitur):\", df_bow_manual.shape)\n"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"colab": {
|
||
"provenance": []
|
||
},
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.13.5"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 4
|
||
}
|