diff --git a/fitur_ekstraksi_BOW.ipynb b/fitur_ekstraksi_BOW.ipynb new file mode 100644 index 0000000..cbd759f --- /dev/null +++ b/fitur_ekstraksi_BOW.ipynb @@ -0,0 +1,668 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PKHmOczbUdK0" + }, + "outputs": [], + "source": [ + "{\n", + " \"cells\": [\n", + " {\n", + " \"cell_type\": \"code\",\n", + " \"execution_count\": 11,\n", + " \"metadata\": {\n", + " \"id\": \"qBYcPYAb059g\"\n", + " },\n", + " \"outputs\": [],\n", + " \"source\": [\n", + " \"# =========================\\n\",\n", + " \"# 1. IMPORT LIBRARY\\n\",\n", + " \"# =========================\\n\",\n", + " \"import re\\n\",\n", + " \"import pandas as pd\\n\",\n", + " \"\\n\",\n", + " \"from sklearn.feature_extraction.text import CountVectorizer\\n\",\n", + " \"from sklearn.naive_bayes import MultinomialNB\\n\",\n", + " \"from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\"\n", + " ]\n", + " },\n", + " {\n", + " \"cell_type\": \"code\",\n", + " \"execution_count\": 12,\n", + " \"metadata\": {\n", + " \"id\": \"mo-yt5Ob1N8j\"\n", + " },\n", + " \"outputs\": [],\n", + " \"source\": [\n", + " \"# =========================\\n\",\n", + " \"# 2. DATA TEKS MANUAL\\n\",\n", + " \"# =========================\\n\",\n", + " \"documents = [\\n\",\n", + " \" \\\"saya suka belajar data science\\\",\\n\",\n", + " \" \\\"machine learning sangat menarik\\\",\\n\",\n", + " \" \\\"saya tidak suka matematika\\\",\\n\",\n", + " \" \\\"belajar python itu menyenangkan\\\",\\n\",\n", + " \" \\\"data science membutuhkan matematika\\\",\\n\",\n", + " \" \\\"python sangat membantu data science\\\"\\n\",\n", + " \"]\\n\",\n", + " \"\\n\",\n", + " \"labels = [\\n\",\n", + " \" \\\"positif\\\",\\n\",\n", + " \" \\\"positif\\\",\\n\",\n", + " \" \\\"negatif\\\",\\n\",\n", + " \" \\\"positif\\\",\\n\",\n", + " \" \\\"netral\\\",\\n\",\n", + " \" \\\"positif\\\"\\n\",\n", + " \"]\"\n", + " ]\n", + " },\n", + " {\n", + " \"cell_type\": \"code\",\n", + " \"execution_count\": 13,\n", + " \"metadata\": {\n", + " \"id\": \"FkmxRAFq1oDK\"\n", + " },\n", + " \"outputs\": [],\n", + " \"source\": [\n", + " \"# =========================\\n\",\n", + " \"# 3. PREPROCESSING\\n\",\n", + " \"# =========================\\n\",\n", + " \"def clean_text(text):\\n\",\n", + " \" text = text.lower()\\n\",\n", + " \" text = re.sub(r\\\"[^a-z\\\\s]\\\", \\\"\\\", text)\\n\",\n", + " \" text = re.sub(r\\\"\\\\s+\\\", \\\" \\\", text).strip()\\n\",\n", + " \" return text\\n\",\n", + " \"\\n\",\n", + " \"documents = [clean_text(doc) for doc in documents]\"\n", + " ]\n", + " },\n", + " {\n", + " \"cell_type\": \"code\",\n", + " \"execution_count\": 14,\n", + " \"metadata\": {\n", + " \"colab\": {\n", + " \"base_uri\": \"https://localhost:8080/\"\n", + " },\n", + " \"id\": \"ybC1Vo2C_c3q\",\n", + " \"outputId\": \"1695b30d-059d-4bce-e224-7c100b1958ee\"\n", + " },\n", + " \"outputs\": [\n", + " {\n", + " \"output_type\": \"stream\",\n", + " \"name\": \"stdout\",\n", + " \"text\": [\n", + " \"=== Bag of Words ===\\n\",\n", + " \" belajar data itu learning machine matematika membantu membutuhkan \\\\\\n\",\n", + " \"0 1 1 0 0 0 0 0 0 \\n\",\n", + " \"1 0 0 0 1 1 0 0 0 \\n\",\n", + " \"2 0 0 0 0 0 1 0 0 \\n\",\n", + " \"3 1 0 1 0 0 0 0 0 \\n\",\n", + " \"4 0 1 0 0 0 1 0 1 \\n\",\n", + " \"5 0 1 0 0 0 0 1 0 \\n\",\n", + " \"\\n\",\n", + " \" menarik menyenangkan python sangat saya science suka tidak \\n\",\n", + " \"0 0 0 0 0 1 1 1 0 \\n\",\n", + " \"1 1 0 0 1 0 0 0 0 \\n\",\n", + " \"2 0 0 0 0 1 0 1 1 \\n\",\n", + " \"3 0 1 1 0 0 0 0 0 \\n\",\n", + " \"4 0 0 0 0 0 1 0 0 \\n\",\n", + " \"5 0 0 1 1 0 1 0 0 \\n\"\n", + " ]\n", + " }\n", + " ],\n", + " \"source\": [\n", + " \"# =========================\\n\",\n", + " \"# 4. FEATURE EXTRACTION (BOW)\\n\",\n", + " \"# =========================\\n\",\n", + " \"vectorizer = CountVectorizer()\\n\",\n", + " \"X = vectorizer.fit_transform(documents)\\n\",\n", + " \"\\n\",\n", + " \"df_bow = pd.DataFrame(\\n\",\n", + " \" X.toarray(),\\n\",\n", + " \" columns=vectorizer.get_feature_names_out()\\n\",\n", + " \")\\n\",\n", + " \"\\n\",\n", + " \"print(\\\"=== Bag of Words ===\\\")\\n\",\n", + " \"print(df_bow)\"\n", + " ]\n", + " },\n", + " {\n", + " \"cell_type\": \"code\",\n", + " \"execution_count\": 15,\n", + " \"metadata\": {\n", + " \"colab\": {\n", + " \"base_uri\": \"https://localhost:8080/\",\n", + " \"height\": 80\n", + " },\n", + " \"id\": \"s6S-Ma4R1xuq\",\n", + " \"outputId\": \"f65359e3-bb87-42b4-fb37-9c035f88e7ad\"\n", + " },\n", + " \"outputs\": [\n", + " {\n", + " \"output_type\": \"execute_result\",\n", + " \"data\": {\n", + " \"text/plain\": [\n", + " \"MultinomialNB()\"\n", + " ],\n", + " \"text/html\": [\n", + " \"
MultinomialNB()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
\"\n", + " ]\n", + " },\n", + " \"metadata\": {},\n", + " \"execution_count\": 15\n", + " }\n", + " ],\n", + " \"source\": [\n", + " \"# =========================\\n\",\n", + " \"# 5. MODEL KLASIFIKASI\\n\",\n", + " \"# =========================\\n\",\n", + " \"model = MultinomialNB()\\n\",\n", + " \"model.fit(X, labels)\"\n", + " ]\n", + " },\n", + " {\n", + " \"cell_type\": \"code\",\n", + " \"execution_count\": 16,\n", + " \"metadata\": {\n", + " \"id\": \"ShevCTva2Fg9\",\n", + " \"colab\": {\n", + " \"base_uri\": \"https://localhost:8080/\"\n", + " },\n", + " \"outputId\": \"228e8f03-bcfb-4ecc-c36f-d86402530a5b\"\n", + " },\n", + " \"outputs\": [\n", + " {\n", + " \"output_type\": \"stream\",\n", + " \"name\": \"stdout\",\n", + " \"text\": [\n", + " \"\\n\",\n", + " \"Kalimat uji : saya suka belajar python\\n\",\n", + " \"Hasil klasifikasi : positif\\n\"\n", + " ]\n", + " }\n", + " ],\n", + " \"source\": [\n", + " \"# =========================\\n\",\n", + " \"# 6. PREDIKSI DATA BARU\\n\",\n", + " \"# =========================\\n\",\n", + " \"test_sentence = [\\\"saya suka belajar python\\\"]\\n\",\n", + " \"test_sentence = [clean_text(test_sentence[0])]\\n\",\n", + " \"\\n\",\n", + " \"X_test = vectorizer.transform(test_sentence)\\n\",\n", + " \"prediction = model.predict(X_test)\\n\",\n", + " \"\\n\",\n", + " \"print(\\\"\\\\nKalimat uji :\\\", test_sentence[0])\\n\",\n", + " \"print(\\\"Hasil klasifikasi :\\\", prediction[0])\"\n", + " ]\n", + " },\n", + " {\n", + " \"cell_type\": \"code\",\n", + " \"execution_count\": 16,\n", + " \"metadata\": {\n", + " \"id\": \"NQjExannHuj0\"\n", + " },\n", + " \"outputs\": [],\n", + " \"source\": []\n", + " }\n", + " ],\n", + " \"metadata\": {\n", + " \"colab\": {\n", + " \"provenance\": []\n", + " },\n", + " \"kernelspec\": {\n", + " \"display_name\": \"Python 3 (ipykernel)\",\n", + " \"language\": \"python\",\n", + " \"name\": \"python3\"\n", + " },\n", + " \"language_info\": {\n", + " \"codemirror_mode\": {\n", + " \"name\": \"ipython\",\n", + " \"version\": 3\n", + " },\n", + " \"file_extension\": \".py\",\n", + " \"mimetype\": \"text/x-python\",\n", + " \"name\": \"python\",\n", + " \"nbconvert_exporter\": \"python\",\n", + " \"pygments_lexer\": \"ipython3\",\n", + " \"version\": \"3.12.2\"\n", + " }\n", + " },\n", + " \"nbformat\": 4,\n", + " \"nbformat_minor\": 0\n", + "}" + ] + } + ] +} \ No newline at end of file