Upload files to "/"
This commit is contained in:
parent
46409198f2
commit
86c32a9499
786
Fitur_Ekstraksi_BOW_(wildanul_jannah).ipynb
Normal file
786
Fitur_Ekstraksi_BOW_(wildanul_jannah).ipynb
Normal file
@ -0,0 +1,786 @@
|
||||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
}
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {
|
||||
"id": "vY2aVoMNVWho",
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/",
|
||||
"height": 106
|
||||
},
|
||||
"outputId": "e457b721-8441-4e02-a4c4-9eb0665de961"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "error",
|
||||
"ename": "SyntaxError",
|
||||
"evalue": "unterminated string literal (detected at line 344) (ipython-input-4182680070.py, line 344)",
|
||||
"traceback": [
|
||||
"\u001b[0;36m File \u001b[0;32m\"/tmp/ipython-input-4182680070.py\"\u001b[0;36m, line \u001b[0;32m344\u001b[0m\n\u001b[0;31m \" 'wildanul jannah melakukan' ,]\\\"n\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m unterminated string literal (detected at line 344)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"{\n",
|
||||
" \"cells\": [\n",
|
||||
" {\n",
|
||||
" \"cell_type\": \"markdown\",\n",
|
||||
" \"metadata\": {},\n",
|
||||
" \"source\": [\n",
|
||||
" \"# Fitur Ekstraksi Bag-of-Words (BOW)\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"**Nama:** Wildanul Jannah \\n\",\n",
|
||||
" \"**NIM:** **202210715061** \\n\",\n",
|
||||
" \"**Kelas:** F7B2 \\n\",\n",
|
||||
" \"**MK:** NLP \\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"**Tujuan praktikum:** \\n\",\n",
|
||||
" \"Melakukan ekstraksi fitur teks menggunakan Bag-of-Words dengan variasi parameter, yaitu: \\n\",\n",
|
||||
" \"- Mengubah contoh teks \\n\",\n",
|
||||
" \"- Mengubah jumlah fitur (`max_features`) \\n\",\n",
|
||||
" \"- Menggunakan rentang n-gram baru (`ngram_range = (1,3)`) \\n\"\n",
|
||||
" ]\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"cell_type\": \"code\",\n",
|
||||
" \"execution_count\": 6,\n",
|
||||
" \"metadata\": {},\n",
|
||||
" \"outputs\": [\n",
|
||||
" {\n",
|
||||
" \"name\": \"stdin\",\n",
|
||||
" \"output_type\": \"stream\",\n",
|
||||
" \"text\": [\n",
|
||||
" \"Masukkan jumlah dokumen yang ingin dimasukkan: 3\\n\"\n",
|
||||
" ]\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" \"source\": [\n",
|
||||
" \"# Input jumlah dokumen\\n\",\n",
|
||||
" \"n = int(input(\\\"Masukkan jumlah dokumen yang ingin dimasukkan: \\\"))\"\n",
|
||||
" ]\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"cell_type\": \"code\",\n",
|
||||
" \"execution_count\": 7,\n",
|
||||
" \"metadata\": {\n",
|
||||
" \"colab\": {\n",
|
||||
" \"base_uri\": \"https://localhost:8080/\"\n",
|
||||
" },\n",
|
||||
" \"id\": \"mo-yt5Ob1N8j\",\n",
|
||||
" \"outputId\": \"362ac3e0-d84b-4014-db96-cc3b10ecdb32\"\n",
|
||||
" },\n",
|
||||
" \"outputs\": [\n",
|
||||
" {\n",
|
||||
" \"name\": \"stdin\",\n",
|
||||
" \"output_type\": \"stream\",\n",
|
||||
" \"text\": [\n",
|
||||
" \"Masukkan teks untuk dokumen ke-1: Saya Wildanul Jannah, mahasiswa informatika yang sedang mempelajari dasar pemrosesan bahasa alami\\n\",\n",
|
||||
" \"Masukkan teks untuk dokumen ke-2: Wildanul Jannah melakukan analisis teks menggunakan metode Bag of Words untuk mengubah kata menjadi data numerik\\n\",\n",
|
||||
" \"Masukkan teks untuk dokumen ke-3: Dalam percobaan ini Wildanul Jannah mengamati kemunculan kata untuk memahami representasi fitur teks\\n\"\n",
|
||||
" ]\n",
|
||||
"\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"name\": \"stdout\",\n",
|
||||
" \"output_type\": \"stream\",\n",
|
||||
" \"text\": [\n",
|
||||
" \"\\n\",\n",
|
||||
" \"=== Dokumen yang Dimasukkan ===\\n\",\n",
|
||||
" \"Doc 1: Saya Wildanul Jannah, mahasiswa informatika yang sedang mempelajari dasar pemrosesan bahasa alami\\n\",\n",
|
||||
" \"Doc 2: Wildanul Jannah melakukan analisis teks menggunakan metode Bag of Words untuk mengubah kata menjadi data numerik\\n\",\n",
|
||||
" \"Doc 3: Dalam percobaan ini Wildanul Jannah mengamati kemunculan kata untuk memahami representasi fitur teks\\n\"\n",
|
||||
" ]\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" \"source\": [\n",
|
||||
" \"# Input teks dokumen satu per satu\\n\",\n",
|
||||
" \"documents = []\\n\",\n",
|
||||
" \"for i in range(n):\\n\",\n",
|
||||
" \" teks = input(f\\\"Masukkan teks untuk dokumen ke-{i+1}: \\\")\\n\",\n",
|
||||
" \" documents.append(teks)\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"print(\\\"\\\\n=== Dokumen yang Dimasukkan ===\\\")\\n\",\n",
|
||||
" \"for i, doc in enumerate(documents):\\n\",\n",
|
||||
" \" print(f\\\"Doc {i+1}: {doc}\\\")\"\n",
|
||||
" ]\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"cell_type\": \"code\",\n",
|
||||
" \"execution_count\": 8,\n",
|
||||
" \"metadata\": {\n",
|
||||
" \"colab\": {\n",
|
||||
" \"base_uri\": \"https://localhost:8080/\"\n",
|
||||
" },\n",
|
||||
" \"id\": \"FkmxRAFq1oDK\",\n",
|
||||
" \"outputId\": \"62c4508e-1725-4f30-fbdb-4de8072498b2\"\n",
|
||||
" },\n",
|
||||
" \"outputs\": [\n",
|
||||
" {\n",
|
||||
" \"name\": \"stdout\",\n",
|
||||
" \"output_type\": \"stream\",\n",
|
||||
" \"text\": [\n",
|
||||
" \"\\n\",\n",
|
||||
" \"=== Hasil Tokenisasi ===\\n\",\n",
|
||||
"\"Doc 1: ['saya', 'wildanul', 'jannah,', 'mahasiswa', 'informatika', 'yang', 'sedang', 'mempelajari', 'nlp', 'dan', 'machine', 'learning']\\n\",\n",
|
||||
"\"Doc 2: ['wildanul', 'jannah', 'melakukan', 'eksperimen', 'bag-of-words', 'untuk', 'melihat', 'bagaimana', 'fitur', 'teks', 'direpresentasikan', 'sebagai', 'angka']\\n\",\n",
|
||||
"\"Doc 3: ['pada', 'tahap', 'ini', 'wildanul', 'jannah', 'menggunakan', 'n-gram', 'untuk', 'menguji', 'kombinasi', 'kata', 'yang', 'sering', 'muncul', 'bersama']\\n\"\n",
|
||||
" ]\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" \"source\": [\n",
|
||||
" \"# Tahap Tokenisasi\\n\",\n",
|
||||
" \"tokenized_docs = []\\n\",\n",
|
||||
" \"for doc in documents:\\n\",\n",
|
||||
" \" tokens = doc.lower().split()\\n\",\n",
|
||||
" \" tokenized_docs.append(tokens)\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"print(\\\"\\\\n=== Hasil Tokenisasi ===\\\")\\n\",\n",
|
||||
" \"for i, tokens in enumerate(tokenized_docs):\\n\",\n",
|
||||
" \" print(f\\\"Doc {i+1}: {tokens}\\\")\"\n",
|
||||
" ]\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"cell_type\": \"code\",\n",
|
||||
" \"execution_count\": 9,\n",
|
||||
" \"metadata\": {\n",
|
||||
" \"colab\": {\n",
|
||||
" \"base_uri\": \"https://localhost:8080/\"\n",
|
||||
" },\n",
|
||||
" \"id\": \"ybC1Vo2C_c3q\",\n",
|
||||
" \"outputId\": \"fa31c57e-5364-4ded-fcd0-54d0db46c34b\"\n",
|
||||
" },\n",
|
||||
" \"outputs\": [\n",
|
||||
" {\n",
|
||||
" \"name\": \"stdout\",\n",
|
||||
" \"output_type\": \"stream\",\n",
|
||||
" \"text\": [\n",
|
||||
" \"\\n\",\n",
|
||||
" \"=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\\n\",\n",
|
||||
" \"['saya', 'wildanul', 'jannah,', 'mahasiswa', 'informatika', 'yang', 'sedang', 'mempelajari', 'nlp', 'dan', 'machine', 'learning', 'wildanul', 'jannah', 'melakukan', 'eksperimen', 'bag-of-words', 'untuk', 'melihat', 'bagaimana', 'fitur', 'teks', 'direpresentasikan', 'sebagai', 'angka','pada', 'tahap', 'ini', 'wildanul', 'jannah', 'menggunakan', 'n-gram', 'untuk', 'menguji', 'kombinasi', 'kata', 'yang', 'sering', 'muncul', 'bersama']\\n\",\n",
|
||||
" \"Jumlah total kata dalam seluruh dokumen: 39\\n\"\n",
|
||||
" ]\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" \"source\": [\n",
|
||||
" \"# Pembuatan Corpus\\n\",\n",
|
||||
" \"corpus_all = [word for doc in tokenized_docs for word in doc]\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"print(\\\"\\\\n=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\\\")\\n\",\n",
|
||||
" \"print(corpus_all)\\n\",\n",
|
||||
" \"print(f\\\"Jumlah total kata dalam seluruh dokumen: {len(corpus_all)}\\\")\"\n",
|
||||
" ]\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"cell_type\": \"code\",\n",
|
||||
" \"execution_count\": 10,\n",
|
||||
" \"metadata\": {},\n",
|
||||
" \"outputs\": [\n",
|
||||
" {\n",
|
||||
" \"name\": \"stdout\",\n",
|
||||
" \"output_type\": \"stream\",\n",
|
||||
" \"text\": [\n",
|
||||
" \"\\n\",\n",
|
||||
" \"=== Hasil BOW Manual (Frekuensi Kata) ===\\n\"\n",
|
||||
" ]\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"data\": {\n",
|
||||
" \"text/plain\": [\n",
|
||||
" {\n",
|
||||
" \" 'wildanul': 3,\\n\",\n",
|
||||
" \" 'jannah': 3,\\n\",\n",
|
||||
" \" 'yang': 2,\\n\",\n",
|
||||
" \" 'untuk': 2,\\n\",\n",
|
||||
" \" 'saya': 1,\\n\",\n",
|
||||
" \" 'mahasiswa': 1,\\n\",\n",
|
||||
" \" 'informatika': 1,\\n\",\n",
|
||||
" \" 'sedang': 1,\\n\",\n",
|
||||
" \" 'mempelajari': 1,\\n\",\n",
|
||||
" \" 'nlp': 1,\\n\",\n",
|
||||
" \" 'dan': 1,\\n\",\n",
|
||||
" \" 'machine': 1,\\n\",\n",
|
||||
" \" 'learning': 1,\\n\",\n",
|
||||
" \" 'melakukan': 1,\\n\",\n",
|
||||
" \" 'eksperimen': 1,\\n\",\n",
|
||||
" \" 'bag-of-words': 1,\\n\",\n",
|
||||
" \" 'melihat': 1,\\n\",\n",
|
||||
" \" 'bagaimana': 1,\\n\",\n",
|
||||
" \" 'fitur': 1,\\n\",\n",
|
||||
" \" 'teks': 1,\\n\",\n",
|
||||
" \" 'direpresentasikan': 1,\\n\",\n",
|
||||
" \" 'sebagai': 1,\\n\",\n",
|
||||
" \" 'angka': 1,\\n\",\n",
|
||||
" \" 'pada': 1,\\n\",\n",
|
||||
" \" 'tahap': 1,\\n\",\n",
|
||||
" \" 'ini': 1,\\n\",\n",
|
||||
" \" 'menggunakan': 1,\\n\",\n",
|
||||
" \" 'n-gram': 1,\\n\",\n",
|
||||
" \" 'menguji': 1,\\n\",\n",
|
||||
" \" 'kombinasi': 1,\\n\",\n",
|
||||
" \" 'kata': 1,\\n\",\n",
|
||||
" \" 'sering': 1,\\n\",\n",
|
||||
" \" 'muncul': 1,\\n\",\n",
|
||||
" \" 'bersama': 1\\n\",\n",
|
||||
"}\n",
|
||||
" ]\n",
|
||||
" },\n",
|
||||
" \"execution_count\": 10,\n",
|
||||
" \"metadata\": {},\n",
|
||||
" \"output_type\": \"execute_result\"\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" \"source\": [\n",
|
||||
" \"# 4) Membuat Bag-of-Words manual (frekuensi kata)\\n\",\n",
|
||||
" \"bow_manual = {}\\n\",\n",
|
||||
" \"for tokens in tokenized_docs:\\n\",\n",
|
||||
" \" for token in tokens:\\n\",\n",
|
||||
" \" bow_manual[token] = bow_manual.get(token, 0) + 1\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"print(\\\"\\\\n=== Hasil BOW Manual (Frekuensi Kata) ===\\\")\\n\",\n",
|
||||
" \"# Tampilkan dictionary secara sorted by frequency (desc)\\n\",\n",
|
||||
" \"bow_sorted = dict(sorted(bow_manual.items(), key=lambda x: x[1], reverse=True))\\n\",\n",
|
||||
" \"bow_sorted\"\n",
|
||||
" ]\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"cell_type\": \"code\",\n",
|
||||
" \"execution_count\": 12,\n",
|
||||
" \"metadata\": {\n",
|
||||
" \"colab\": {\n",
|
||||
" \"base_uri\": \"https://localhost:8080/\"\n",
|
||||
" },\n",
|
||||
" \"id\": \"s6S-Ma4R1xuq\",\n",
|
||||
" \"outputId\": \"98c3685b-1798-4038-d17e-6e45ca419b51\"\n",
|
||||
" },\n",
|
||||
" \"outputs\": [\n",
|
||||
" {\n",
|
||||
" \"name\": \"stdout\",\n",
|
||||
" \"output_type\": \"stream\",\n",
|
||||
" \"text\": [\n",
|
||||
" \"\\n\",\n",
|
||||
" \"=== Vocabulary (Kata Unik) ===\\n\",\n",
|
||||
" \"['angka', 'bag-of-words', 'bagaimana', 'bersama', 'dan', 'direpresentasikan','eksperimen', 'fitur', 'informatika', 'ini', 'jannah', 'jannah,', 'kata','kombinasi', 'learning', 'machine', 'mahasiswa', 'melakukan', 'melihat','mempelajari', 'menggunakan', 'menguji', 'muncul', 'n-gram', 'nlp', 'pada', 'saya', 'sebagai', 'sedang', 'sering', 'tahap', 'teks', 'untuk', 'wildanul','yang']\\n\",\n",
|
||||
" \"Jumlah kata unik (vocabulary size): 35\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"=== Vocabulary (Kata Unik) ===\\n\",\n",
|
||||
" \" 1. angka\\n\",\n",
|
||||
" \" 2. bag-of-words\\n\",\n",
|
||||
" \" 3. bagaimana\\n\",\n",
|
||||
" \" 4. bersama\\n\",\n",
|
||||
" \" 5. dan\\n\",\n",
|
||||
" \" 6. direpresentasikan\\n\",\n",
|
||||
" \" 7. eksperimen\\n\",\n",
|
||||
" \" 8. fitur\\n\",\n",
|
||||
" \" 9. informatika\\n\",\n",
|
||||
" \"10. ini\\n\",\n",
|
||||
" \"11. jannah\\n\",\n",
|
||||
" \"12. jannah,\\n\",\n",
|
||||
" \"13. kata\\n\",\n",
|
||||
" \"14. kombinasi\\n\",\n",
|
||||
" \"15. learning\\n\",\n",
|
||||
" \"16. machine\\n\",\n",
|
||||
" \"17. melakukan\\n\",\n",
|
||||
" \"18. melihat\\n\",\n",
|
||||
" \"19. mempelajari\\n\",\n",
|
||||
" \"20. menggunakan\\n\",\n",
|
||||
" \"21. menguji\\n\",\n",
|
||||
" \"22. muncul\\n\",\n",
|
||||
" \"23. n-gram\\n\",\n",
|
||||
" \"24. nlp\\n\",\n",
|
||||
" \"25. pada\\n\",\n",
|
||||
" \"26. saya\\n\",\n",
|
||||
" \"27. sebagai\\n\",\n",
|
||||
" \"28. sedang\\n\",\n",
|
||||
" \"29. sering\\n\",\n",
|
||||
" \"30. tahap\\n\",\n",
|
||||
" \"31. teks\\n\",\n",
|
||||
" \"32. untuk\\n\",\n",
|
||||
" \"33. wildanul\\n\",\n",
|
||||
" \"34. untuk\\n\",\n",
|
||||
" \"35. yang\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"Jumlah kata unik (vocabulary size): 35\\n\"\n",
|
||||
" ]\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" \"source\": [\n",
|
||||
" \"# Pembuatan Vocabulary\\n\",\n",
|
||||
" \"vocabulary = sorted(set(corpus_all))\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"print(\\\"\\\\n=== Vocabulary (Kata Unik) ===\\\")\\n\",\n",
|
||||
" \"print(vocabulary)\\n\",\n",
|
||||
" \"print(f\\\"Jumlah kata unik (vocabulary size): {len(vocabulary)}\\\")\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"vocabulary = sorted(set(corpus_all))\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"print(\\\"\\\\n=== Vocabulary (Kata Unik) ===\\\")\\n\",\n",
|
||||
" \"for idx, word in enumerate(vocabulary, start=1):\\n\",\n",
|
||||
" \" print(f\\\"{idx:>2}. {word}\\\")\\n\",\n",
|
||||
" \"print(f\\\"\\\\nJumlah kata unik (vocabulary size): {len(vocabulary)}\\\")\"\n",
|
||||
" ]\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"cell_type\": \"code\",\n",
|
||||
" \"execution_count\": 13,\n",
|
||||
" \"metadata\": {},\n",
|
||||
" \"outputs\": [\n",
|
||||
" {\n",
|
||||
" \"name\": \"stdout\",\n",
|
||||
" \"output_type\": \"stream\",\n",
|
||||
" \"text\": [\n",
|
||||
" \"Jumlah dokumen: 3\\n\",\n",
|
||||
" \"Jumlah tokenized_docs: 3\\n\",\n",
|
||||
" \"Jumlah kata di vocabulary (unique): 35\\n\",\n",
|
||||
" \"Contoh 10 kata pertama vocabulary: ['angka', 'bag-of-words', 'bagaimana', 'bersama', 'dan', 'direpresentasikan', 'eksperimen', 'fitur', 'informatika', 'ini']\\n\"\n",
|
||||
" ]\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" \"source\": [\n",
|
||||
" \"print(\\\"Jumlah dokumen:\\\", len(documents))\\n\",\n",
|
||||
" \"print(\\\"Jumlah tokenized_docs:\\\", len(tokenized_docs))\\n\",\n",
|
||||
" \"print(\\\"Jumlah kata di vocabulary (unique):\\\", len(vocabulary))\\n\",\n",
|
||||
" \"print(\\\"Contoh 10 kata pertama vocabulary:\\\", vocabulary[:10])\"\n",
|
||||
" ]\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"cell_type\": \"code\",\n",
|
||||
" \"execution_count\": 11,\n",
|
||||
" \"metadata\": {},\n",
|
||||
" \"outputs\": [\n",
|
||||
" {\n",
|
||||
" \"name\": \"stdout\",\n",
|
||||
" \"output_type\": \"stream\",\n",
|
||||
" \"text\": [\n",
|
||||
" \"\\n\",\n",
|
||||
" \"=== Info CountVectorizer ===\\n\",\n",
|
||||
" \"n-gram range: (1, 3)\\n\",\n",
|
||||
" \"max_features: 500\\n\",\n",
|
||||
" \"Jumlah fitur (vocabulary size): 110\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"Contoh 20 fitur pertama:\\n\",\n",
|
||||
" \"['angka' 'bag' 'bag of' 'bag of words' 'bagaimana' 'bagaimana fitur'\\n\",\n",
|
||||
" \" 'bagaimana fitur teks' 'bersama' 'dan' 'dan machine'\\n\",\n",
|
||||
" \" 'dan machine learning' 'direpresentasikan' 'direpresentasikan sebagai'\\n\",\n",
|
||||
" \" 'direpresentasikan sebagai angka' 'eksperimen' 'eksperimen bag'\\n\",\n",
|
||||
" \" 'eksperimen bag of' 'wildanul' 'wildanul jannah'\\n\",\n",
|
||||
" \" 'wildanul jannah melakukan']\\n\"\n",
|
||||
" ]\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" \"source\": [\n",
|
||||
" \"# BOW modern: CountVectorizer dengan n-gram dan max_features \\n\",\n",
|
||||
" \"from sklearn.feature_extraction.text import CountVectorizer\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"# Ubah di sini bila mau nilai lain:\\n\",\n",
|
||||
" \"NGRAM_RANGE = (1, 3) # ubah n-gram (contoh: (1,2) atau (1,4))\\n\",\n",
|
||||
" \"MAX_FEATURES = 500 # ubah jumlah fitur (contoh: 200, 1000)\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"vectorizer = CountVectorizer(ngram_range=NGRAM_RANGE, max_features=MAX_FEATURES)\\n\",\n",
|
||||
" \"X = vectorizer.fit_transform(documents)\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"print(\\\"\\\\n=== Info CountVectorizer ===\\\")\\n\",\n",
|
||||
" \"print(\\\"n-gram range:\\\", NGRAM_RANGE)\\n\",\n",
|
||||
" \"print(\\\"max_features:\\\", MAX_FEATURES)\\n\",\n",
|
||||
" \"print(\\\"Jumlah fitur (vocabulary size):\\\", len(vectorizer.vocabulary_))\\n\",\n",
|
||||
" \"print(\\\"\\\\nContoh 20 fitur pertama:\\\")\\n\",\n",
|
||||
" \"print(vectorizer.get_feature_names_out()[:20])\"\n",
|
||||
" ]\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"cell_type\": \"code\",\n",
|
||||
" \"execution_count\": 12,\n",
|
||||
" \"metadata\": {},\n",
|
||||
" \"outputs\": [\n",
|
||||
" {\n",
|
||||
" \"data\": {\n",
|
||||
" \"text/html\": [\n",
|
||||
" \"<div>\\n\",\n",
|
||||
" \"<style scoped>\\n\",\n",
|
||||
" \" .dataframe tbody tr th:only-of-type {\\n\",\n",
|
||||
" \" vertical-align: middle;\\n\",\n",
|
||||
" \" }\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \" .dataframe tbody tr th {\\n\",\n",
|
||||
" \" vertical-align: top;\\n\",\n",
|
||||
" \" }\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \" .dataframe thead th {\\n\",\n",
|
||||
" \" text-align: right;\\n\",\n",
|
||||
" \" }\\n\",\n",
|
||||
" \"</style>\\n\",\n",
|
||||
" \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n",
|
||||
" \" <thead>\\n\",\n",
|
||||
" \" <tr style=\\\"text-align: right;\\\">\\n\",\n",
|
||||
" \" <th></th>\\n\",\n",
|
||||
" \" <th>angka</th>\\n\",\n",
|
||||
" \" <th>bag</th>\\n\",\n",
|
||||
" \" <th>bag of</th>\\n\",\n",
|
||||
" \" <th>bag of words</th>\\n\",\n",
|
||||
" \" <th>bagaimana</th>\\n\",\n",
|
||||
" \" <th>bagaimana fitur</th>\\n\",\n",
|
||||
" \" <th>bagaimana fitur teks</th>\\n\",\n",
|
||||
" \" <th>bersama</th>\\n\",\n",
|
||||
" \" <th>dan</th>\\n\",\n",
|
||||
" \" <th>dan machine</th>\\n\",\n",
|
||||
" \" <th>...</th>\\n\",\n",
|
||||
" \" <th>untuk menguji</th>\\n\",\n",
|
||||
" \" <th>untuk menguji kombinasi</th>\\n\",\n",
|
||||
" \" <th>words</th>\\n\",\n",
|
||||
" \" <th>words untuk</th>\\n\",\n",
|
||||
" \" <th>words untuk melihat</th>\\n\",\n",
|
||||
" \" <th>yang</th>\\n\",\n",
|
||||
" \" <th>yang sedang</th>\\n\",\n",
|
||||
" \" <th>yang sedang mempelajari</th>\\n\",\n",
|
||||
" \" <th>yang sering</th>\\n\",\n",
|
||||
" \" <th>yang sering muncul</th>\\n\",\n",
|
||||
" \" </tr>\\n\",\n",
|
||||
" \" </thead>\\n\",\n",
|
||||
" \" <tbody>\\n\",\n",
|
||||
" \" <tr>\\n\",\n",
|
||||
" \" <th>0</th>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>...</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" </tr>\\n\",\n",
|
||||
" \" <tr>\\n\",\n",
|
||||
" \" <th>1</th>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>...</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" </tr>\\n\",\n",
|
||||
" \" <tr>\\n\",\n",
|
||||
" \" <th>2</th>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>...</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" </tr>\\n\",\n",
|
||||
" \" </tbody>\\n\",\n",
|
||||
" \"</table>\\n\",\n",
|
||||
" \"<p>3 rows × 110 columns</p>\\n\",\n",
|
||||
" \"</div>\"\n",
|
||||
" ],\n",
|
||||
" \"text/plain\": [\n",
|
||||
" \" angka bag bag of bag of words bagaimana bagaimana fitur \\\\\\n\",\n",
|
||||
" \"0 0 0 0 0 0 0 \\n\",\n",
|
||||
" \"1 1 1 1 1 1 1 \\n\",\n",
|
||||
" \"2 0 0 0 0 0 0 \\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \" bagaimana fitur teks bersama dan dan machine ... untuk menguji \\\\\\n\",\n",
|
||||
" \"0 0 0 1 1 ... 0 \\n\",\n",
|
||||
" \"1 1 0 0 0 ... 0 \\n\",\n",
|
||||
" \"2 0 1 0 0 ... 1 \\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \" untuk menguji kombinasi words words untuk words untuk melihat yang \\\\\\n\",\n",
|
||||
" \"0 0 0 0 0 1 \\n\",\n",
|
||||
" \"1 0 1 1 1 0 \\n\",\n",
|
||||
" \"2 1 0 0 0 1 \\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \" yang sedang yang sedang mempelajari yang sering yang sering muncul \\n\",\n",
|
||||
" \"0 1 1 0 0 \\n\",\n",
|
||||
" \"1 0 0 0 0 \\n\",\n",
|
||||
" \"2 0 0 1 1 \\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"[3 rows x 110 columns]\"\n",
|
||||
" ]\n",
|
||||
" },\n",
|
||||
" \"metadata\": {},\n",
|
||||
" \"output_type\": \"display_data\"\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" \"source\": [\n",
|
||||
" \"# 6) Konversi hasil ke DataFrame\\n\",\n",
|
||||
" \"import pandas as pd\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"df_features = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())\\n\",\n",
|
||||
" \"display(df_features) # di Jupyter ini akan tampil tabel\\n\"\n",
|
||||
" ]\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"cell_type\": \"code\",\n",
|
||||
" \"execution_count\": 13,\n",
|
||||
" \"metadata\": {\n",
|
||||
" \"id\": \"ShevCTva2Fg9\"\n",
|
||||
" },\n",
|
||||
" \"outputs\": [],\n",
|
||||
" \"source\": [\n",
|
||||
" \"# Representasi Numerik (Matriks BoW)\\n\",\n",
|
||||
" \"bow_matrix = []\\n\",\n",
|
||||
" \"for doc in tokenized_docs:\\n\",\n",
|
||||
" \" vector = [doc.count(word) for word in vocabulary]\\n\",\n",
|
||||
" \" bow_matrix.append(vector)\"\n",
|
||||
" ]\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"cell_type\": \"code\",\n",
|
||||
" \"execution_count\": 14,\n",
|
||||
" \"metadata\": {},\n",
|
||||
" \"outputs\": [\n",
|
||||
" {\n",
|
||||
" \"data\": {\n",
|
||||
" \"text/html\": [\n",
|
||||
" \"<div>\\n\",\n",
|
||||
" \"<style scoped>\\n\",\n",
|
||||
" \" .dataframe tbody tr th:only-of-type {\\n\",\n",
|
||||
" \" vertical-align: middle;\\n\",\n",
|
||||
" \" }\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \" .dataframe tbody tr th {\\n\",\n",
|
||||
" \" vertical-align: top;\\n\",\n",
|
||||
" \" }\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \" .dataframe thead th {\\n\",\n",
|
||||
" \" text-align: right;\\n\",\n",
|
||||
" \" }\\n\",\n",
|
||||
" \"</style>\\n\",\n",
|
||||
" \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n",
|
||||
" \" <thead>\\n\",\n",
|
||||
" \" <tr style=\\\"text-align: right;\\\">\\n\",\n",
|
||||
" \" <th></th>\\n\",\n",
|
||||
" \" <th>angka</th>\\n\",\n",
|
||||
" \" <th>bag-of-words</th>\\n\",\n",
|
||||
" \" <th>bagaimana</th>\\n\",\n",
|
||||
" \" <th>bersama</th>\\n\",\n",
|
||||
" \" <th>dan</th>\\n\",\n",
|
||||
" \" <th>direpresentasikan</th>\\n\",\n",
|
||||
" \" <th>eksperimen</th>\\n\",\n",
|
||||
" \" <th>wilda</th>\\n\",\n",
|
||||
" \" <th>fitur</th>\\n\",\n",
|
||||
" \" <th>informatika</th>\\n\",\n",
|
||||
" \" <th>...</th>\\n\",\n",
|
||||
" \" <th>jannah,</th>\\n\",\n",
|
||||
" \" <th>jannah</th>\\n\",\n",
|
||||
" \" <th>saya</th>\\n\",\n",
|
||||
" \" <th>sebagai</th>\\n\",\n",
|
||||
" \" <th>sedang</th>\\n\",\n",
|
||||
" \" <th>sering</th>\\n\",\n",
|
||||
" \" <th>tahap</th>\\n\",\n",
|
||||
" \" <th>teks</th>\\n\",\n",
|
||||
" \" <th>untuk</th>\\n\",\n",
|
||||
" \" <th>yang</th>\\n\",\n",
|
||||
" \" </tr>\\n\",\n",
|
||||
" \" </thead>\\n\",\n",
|
||||
" \" <tbody>\\n\",\n",
|
||||
" \" <tr>\\n\",\n",
|
||||
" \" <th>0</th>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>...</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" </tr>\\n\",\n",
|
||||
" \" <tr>\\n\",\n",
|
||||
" \" <th>1</th>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>...</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" </tr>\\n\",\n",
|
||||
" \" <tr>\\n\",\n",
|
||||
" \" <th>2</th>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>...</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>0</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" <td>1</td>\\n\",\n",
|
||||
" \" </tr>\\n\",\n",
|
||||
" \" </tbody>\\n\",\n",
|
||||
" \"</table>\\n\",\n",
|
||||
" \"<p>3 rows × 35 columns</p>\\n\",\n",
|
||||
" \"</div>\"\n",
|
||||
" ],\n",
|
||||
" \"text/plain\": [\n",
|
||||
" \" angka bag-of-words bagaimana bersama dan direpresentasikan \\\\\\n\",\n",
|
||||
" \"0 0 0 0 0 1 0 \\n\",\n",
|
||||
" \"1 1 1 1 0 0 1 \\n\",\n",
|
||||
" \"2 0 0 0 1 0 0 \\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \" eksperimen fatah fitur informatika ... saya, jannah saya sebagai \\\\\\n\",\n",
|
||||
" \"0 0 1 0 1 ... 1 1 1 0 \\n\",\n",
|
||||
" \"1 1 1 1 0 ... 0 0 0 1 \\n\",\n",
|
||||
" \"2 0 1 0 0 ... 0 0 0 0 \\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \" sedang sering tahap teks untuk yang \\n\",\n",
|
||||
" \"0 1 0 0 0 0 1 \\n\",\n",
|
||||
" \"1 0 0 0 1 1 0 \\n\",\n",
|
||||
" \"2 0 1 1 0 1 1 \\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"[3 rows x 35 columns]\"\n",
|
||||
" ]\n",
|
||||
" },\n",
|
||||
" \"metadata\": {},\n",
|
||||
" \"output_type\": \"display_data\"\n",
|
||||
" },\n",
|
||||
" {\n",
|
||||
" \"name\": \"stdout\",\n",
|
||||
" \"output_type\": \"stream\",\n",
|
||||
" \"text\": [\n",
|
||||
" \"\\n\",\n",
|
||||
" \"Shape (dokumen x fitur): (3, 35)\\n\"\n",
|
||||
" ]\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" \"source\": [\n",
|
||||
" \"import pandas as pd\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"# Buat matrix manual berdasarkan vocabulary yang sudah kamu buat\\n\",\n",
|
||||
" \"bow_matrix = []\\n\",\n",
|
||||
" \"for tokens in tokenized_docs:\\n\",\n",
|
||||
" \" # hitung frekuensi tiap kata pada vocabulary pada dokumen ini\\n\",\n",
|
||||
" \" vector = [tokens.count(word) for word in vocabulary] # tokens adalah list kata\\n\",\n",
|
||||
" \" bow_matrix.append(vector)\\n\",\n",
|
||||
" \"\\n\",\n",
|
||||
" \"# Konversi ke DataFrame agar rapi saat ditampilkan\\n\",\n",
|
||||
" \"df_bow_manual = pd.DataFrame(bow_matrix, columns=vocabulary)\\n\",\n",
|
||||
" \"display(df_bow_manual) # di Jupyter ini akan tampil tabel\\n\",\n",
|
||||
" \"print(\\\"\\\\nShape (dokumen x fitur):\\\", df_bow_manual.shape)\\n\"\n",
|
||||
" ]\n",
|
||||
" }\n",
|
||||
" ],\n",
|
||||
" \"metadata\": {\n",
|
||||
" \"colab\": {\n",
|
||||
" \"provenance\": []\n",
|
||||
" },\n",
|
||||
" \"kernelspec\": {\n",
|
||||
" \"display_name\": \"Python 3 (ipykernel)\",\n",
|
||||
" \"language\": \"python\",\n",
|
||||
" \"name\": \"python3\"\n",
|
||||
" },\n",
|
||||
" \"language_info\": {\n",
|
||||
" \"codemirror_mode\": {\n",
|
||||
" \"name\": \"ipython\",\n",
|
||||
" \"version\": 3\n",
|
||||
" },\n",
|
||||
" \"file_extension\": \".py\",\n",
|
||||
" \"mimetype\": \"text/x-python\",\n",
|
||||
" \"name\": \"python\",\n",
|
||||
" \"nbconvert_exporter\": \"python\",\n",
|
||||
" \"pygments_lexer\": \"ipython3\",\n",
|
||||
" \"version\": \"3.13.5\"\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" \"nbformat\": 4,\n",
|
||||
" \"nbformat_minor\": 4\n",
|
||||
"}\n"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user