Upload files to "Fitur_Ekstraksi_BOW_SVM_NB"
This commit is contained in:
parent
57e0cc9e4e
commit
8149bce06e
351
Fitur_Ekstraksi_BOW_SVM_NB/Fitur_Ekstraksi_BOW_SVM_NB.ipynb
Normal file
351
Fitur_Ekstraksi_BOW_SVM_NB/Fitur_Ekstraksi_BOW_SVM_NB.ipynb
Normal file
@ -0,0 +1,351 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "qBYcPYAb059g",
|
||||
"outputId": "9f57b704-da1b-4495-d366-24c30586dc76"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdin",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Masukkan jumlah dokumen yang ingin dimasukkan: 3\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
# Read how many documents the user wants to enter.
import pandas as pd

prompt = "Masukkan jumlah dokumen yang ingin dimasukkan: "
n = int(input(prompt))
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "mo-yt5Ob1N8j",
|
||||
"outputId": "362ac3e0-d84b-4014-db96-cc3b10ecdb32"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Masukkan teks untuk dokumen ke-1: saya belajar nlp di kampus\n",
|
||||
"Masukkan teks untuk dokumen ke-2: saya suka belajar ai\n",
|
||||
"Masukkan teks untuk dokumen ke-3: mahasiswa belajar data science dan nlp\n",
|
||||
"\n",
|
||||
"=== Dokumen yang Dimasukkan ===\n",
|
||||
"Doc 1: saya belajar nlp di kampus\n",
|
||||
"Doc 2: saya suka belajar ai\n",
|
||||
"Doc 3: mahasiswa belajar data science dan nlp\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
# Collect the document texts one at a time from the user.
documents = [input(f"Masukkan teks untuk dokumen ke-{i+1}: ") for i in range(n)]

# Echo everything back so the user can verify the input.
print("\n=== Dokumen yang Dimasukkan ===")
for num, doc in enumerate(documents, start=1):
    print(f"Doc {num}: {doc}")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "FkmxRAFq1oDK",
|
||||
"outputId": "62c4508e-1725-4f30-fbdb-4de8072498b2"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"=== Hasil Tokenisasi ===\n",
|
||||
"Doc 1: ['saya', 'belajar', 'nlp', 'di', 'kampus']\n",
|
||||
"Doc 2: ['saya', 'suka', 'belajar', 'ai']\n",
|
||||
"Doc 3: ['mahasiswa', 'belajar', 'data', 'science', 'dan', 'nlp']\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
# Tokenisation step: lower-case each document and split on whitespace.
tokenized_docs = [doc.lower().split() for doc in documents]

print("\n=== Hasil Tokenisasi ===")
for num, tokens in enumerate(tokenized_docs, start=1):
    print(f"Doc {num}: {tokens}")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "ybC1Vo2C_c3q",
|
||||
"outputId": "fa31c57e-5364-4ded-fcd0-54d0db46c34b"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===\n",
|
||||
"['saya', 'belajar', 'nlp', 'di', 'kampus', 'saya', 'suka', 'belajar', 'ai', 'mahasiswa', 'belajar', 'data', 'science', 'dan', 'nlp']\n",
|
||||
"Jumlah total kata dalam seluruh dokumen: 15\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
# Build the corpus: every token from every document, duplicates kept.
corpus_all = []
for tokens in tokenized_docs:
    corpus_all.extend(tokens)

print("\n=== Corpus Keseluruhan (Semua Kata dari Semua Dokumen) ===")
print(corpus_all)
print(f"Jumlah total kata dalam seluruh dokumen: {len(corpus_all)}")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "s6S-Ma4R1xuq",
|
||||
"outputId": "98c3685b-1798-4038-d17e-6e45ca419b51"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"=== Vocabulary (Kata Unik) ===\n",
|
||||
"['ai', 'belajar', 'dan', 'data', 'di', 'kampus', 'mahasiswa', 'nlp', 'saya', 'science', 'suka']\n",
|
||||
"Jumlah kata unik (vocabulary size): 11\n",
|
||||
"\n",
|
||||
"=== Vocabulary (Kata Unik) ===\n",
|
||||
" 1. ai\n",
|
||||
" 2. belajar\n",
|
||||
" 3. dan\n",
|
||||
" 4. data\n",
|
||||
" 5. di\n",
|
||||
" 6. kampus\n",
|
||||
" 7. mahasiswa\n",
|
||||
" 8. nlp\n",
|
||||
" 9. saya\n",
|
||||
"10. science\n",
|
||||
"11. suka\n",
|
||||
"\n",
|
||||
"Jumlah kata unik (vocabulary size): 11\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
# Build the vocabulary: the unique words of the corpus, sorted alphabetically.
# FIX: the original cell computed `vocabulary = sorted(set(corpus_all))` twice
# and printed the "=== Vocabulary ===" section twice (copy-paste duplication);
# one computation and one listing produce the same information.
vocabulary = sorted(set(corpus_all))

print("\n=== Vocabulary (Kata Unik) ===")
print(vocabulary)
# Numbered listing, one word per line, right-aligned index for readability.
for idx, word in enumerate(vocabulary, start=1):
    print(f"{idx:>2}. {word}")
print(f"\nJumlah kata unik (vocabulary size): {len(vocabulary)}")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {
|
||||
"id": "ShevCTva2Fg9"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
# Numeric representation (BoW matrix): one count vector per document,
# where entry j is how often vocabulary[j] occurs in that document.
bow_matrix = [
    [tokens.count(word) for word in vocabulary]
    for tokens in tokenized_docs
]
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "-yB6D2pY2M0E",
|
||||
"outputId": "b6b2f4d3-da8b-4aee-e9ce-034def4d5cf7"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"=== Matriks Bag of Words ===\n",
|
||||
" ai belajar dan data di kampus mahasiswa nlp saya science suka\n",
|
||||
"D1 0 1 0 0 1 1 0 1 1 0 0\n",
|
||||
"D2 1 1 0 0 0 0 0 0 1 0 1\n",
|
||||
"D3 0 1 1 1 0 0 1 1 0 1 0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
# Wrap the BoW counts in a labelled DataFrame with rows D1..Dn.
row_labels = [f"D{i}" for i in range(1, len(documents) + 1)]
df_bow = pd.DataFrame(bow_matrix, columns=vocabulary, index=row_labels)

print("\n=== Matriks Bag of Words ===")
print(df_bow)
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "8ruf5vKL2rGD",
|
||||
"outputId": "65a4674e-1c01-4833-ec55-f66f77b8b6c2"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===\n",
|
||||
" Kata Frekuensi\n",
|
||||
"0 belajar 3\n",
|
||||
"1 nlp 2\n",
|
||||
"2 saya 2\n",
|
||||
"3 dan 1\n",
|
||||
"4 ai 1\n",
|
||||
"5 data 1\n",
|
||||
"6 di 1\n",
|
||||
"7 mahasiswa 1\n",
|
||||
"8 kampus 1\n",
|
||||
"9 science 1\n",
|
||||
"10 suka 1\n",
|
||||
"Frekuensi kata: 11\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
# Word-frequency table: total count of each word across all documents,
# sorted most-frequent first.
totals = df_bow.sum().sort_values(ascending=False)
word_frequencies = totals.reset_index()
word_frequencies.columns = ["Kata", "Frekuensi"]

print("\n=== Tabel Frekuensi Kata (Keseluruhan Dokumen) ===")
print(word_frequencies)
print(f"Frekuensi kata: {len(word_frequencies)}")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "NQjExannHuj0"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "3ffe1c09",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
# === SVM Classification ===
# NOTE(review): X_train_tfidf, X_test_tfidf, y_train and y_test are not
# defined anywhere in this notebook — there is no TF-IDF vectorisation,
# no labels, and no train/test split cell — so this cell crashed with a
# bare NameError on Restart & Run All. Fail fast with an actionable
# message until that prerequisite cell is added.
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

_required = ("X_train_tfidf", "X_test_tfidf", "y_train", "y_test")
_missing = [name for name in _required if name not in globals()]
if _missing:
    raise NameError(
        f"Prerequisite variables {_missing} are undefined. "
        "Add a TF-IDF / train-test-split cell (with labels) before this one."
    )

# Train a linear SVM on the TF-IDF training features.
svm_model = LinearSVC()
svm_model.fit(X_train_tfidf, y_train)

# Evaluate on the held-out test set.
svm_pred = svm_model.predict(X_test_tfidf)

print("=== SVM Accuracy ===", accuracy_score(y_test, svm_pred))
print(classification_report(y_test, svm_pred))
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "1e2f604d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
# === Naive Bayes Classification ===
# NOTE(review): X_train_tfidf, X_test_tfidf, y_train and y_test are not
# defined anywhere in this notebook (no TF-IDF / split / labels cell),
# so this cell crashed with a bare NameError on Restart & Run All.
# FIX: also import the metrics here — the original silently relied on
# the SVM cell having imported accuracy_score/classification_report.
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

_required = ("X_train_tfidf", "X_test_tfidf", "y_train", "y_test")
_missing = [name for name in _required if name not in globals()]
if _missing:
    raise NameError(
        f"Prerequisite variables {_missing} are undefined. "
        "Add a TF-IDF / train-test-split cell (with labels) before this one."
    )

# Multinomial NB suits non-negative count/TF-IDF features.
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

nb_pred = nb_model.predict(X_test_tfidf)

print("=== Naive Bayes Accuracy ===", accuracy_score(y_test, nb_pred))
print(classification_report(y_test, nb_pred))
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user