diff --git a/fitur_ekstraksi_BOW.ipynb b/fitur_ekstraksi_BOW.ipynb new file mode 100644 index 0000000..cbd759f --- /dev/null +++ b/fitur_ekstraksi_BOW.ipynb @@ -0,0 +1,668 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PKHmOczbUdK0" + }, + "outputs": [], + "source": [ + "{\n", + " \"cells\": [\n", + " {\n", + " \"cell_type\": \"code\",\n", + " \"execution_count\": 11,\n", + " \"metadata\": {\n", + " \"id\": \"qBYcPYAb059g\"\n", + " },\n", + " \"outputs\": [],\n", + " \"source\": [\n", + " \"# =========================\\n\",\n", + " \"# 1. IMPORT LIBRARY\\n\",\n", + " \"# =========================\\n\",\n", + " \"import re\\n\",\n", + " \"import pandas as pd\\n\",\n", + " \"\\n\",\n", + " \"from sklearn.feature_extraction.text import CountVectorizer\\n\",\n", + " \"from sklearn.naive_bayes import MultinomialNB\\n\",\n", + " \"from sklearn.metrics import accuracy_score, confusion_matrix, classification_report\"\n", + " ]\n", + " },\n", + " {\n", + " \"cell_type\": \"code\",\n", + " \"execution_count\": 12,\n", + " \"metadata\": {\n", + " \"id\": \"mo-yt5Ob1N8j\"\n", + " },\n", + " \"outputs\": [],\n", + " \"source\": [\n", + " \"# =========================\\n\",\n", + " \"# 2. DATA TEKS MANUAL\\n\",\n", + " \"# =========================\\n\",\n", + " \"documents = [\\n\",\n", + " \" \\\"saya suka belajar data science\\\",\\n\",\n", + " \" \\\"machine learning sangat menarik\\\",\\n\",\n", + " \" \\\"saya tidak suka matematika\\\",\\n\",\n", + " \" \\\"belajar python itu menyenangkan\\\",\\n\",\n", + " \" \\\"data science membutuhkan matematika\\\",\\n\",\n", + " \" \\\"python sangat membantu data science\\\"\\n\",\n", + " \"]\\n\",\n", + " \"\\n\",\n", + " \"labels = [\\n\",\n", + " \" \\\"positif\\\",\\n\",\n", + " \" \\\"positif\\\",\\n\",\n", + " \" \\\"negatif\\\",\\n\",\n", + " \" \\\"positif\\\",\\n\",\n", + " \" \\\"netral\\\",\\n\",\n", + " \" \\\"positif\\\"\\n\",\n", + " \"]\"\n", + " ]\n", + " },\n", + " {\n", + " \"cell_type\": \"code\",\n", + " \"execution_count\": 13,\n", + " \"metadata\": {\n", + " \"id\": \"FkmxRAFq1oDK\"\n", + " },\n", + " \"outputs\": [],\n", + " \"source\": [\n", + " \"# =========================\\n\",\n", + " \"# 3. PREPROCESSING\\n\",\n", + " \"# =========================\\n\",\n", + " \"def clean_text(text):\\n\",\n", + " \" text = text.lower()\\n\",\n", + " \" text = re.sub(r\\\"[^a-z\\\\s]\\\", \\\"\\\", text)\\n\",\n", + " \" text = re.sub(r\\\"\\\\s+\\\", \\\" \\\", text).strip()\\n\",\n", + " \" return text\\n\",\n", + " \"\\n\",\n", + " \"documents = [clean_text(doc) for doc in documents]\"\n", + " ]\n", + " },\n", + " {\n", + " \"cell_type\": \"code\",\n", + " \"execution_count\": 14,\n", + " \"metadata\": {\n", + " \"colab\": {\n", + " \"base_uri\": \"https://localhost:8080/\"\n", + " },\n", + " \"id\": \"ybC1Vo2C_c3q\",\n", + " \"outputId\": \"1695b30d-059d-4bce-e224-7c100b1958ee\"\n", + " },\n", + " \"outputs\": [\n", + " {\n", + " \"output_type\": \"stream\",\n", + " \"name\": \"stdout\",\n", + " \"text\": [\n", + " \"=== Bag of Words ===\\n\",\n", + " \" belajar data itu learning machine matematika membantu membutuhkan \\\\\\n\",\n", + " \"0 1 1 0 0 0 0 0 0 \\n\",\n", + " \"1 0 0 0 1 1 0 0 0 \\n\",\n", + " \"2 0 0 0 0 0 1 0 0 \\n\",\n", + " \"3 1 0 1 0 0 0 0 0 \\n\",\n", + " \"4 0 1 0 0 0 1 0 1 \\n\",\n", + " \"5 0 1 0 0 0 0 1 0 \\n\",\n", + " \"\\n\",\n", + " \" menarik menyenangkan python sangat saya science suka tidak \\n\",\n", + " \"0 0 0 0 0 1 1 1 0 \\n\",\n", + " \"1 1 0 0 1 0 0 0 0 \\n\",\n", + " \"2 0 0 0 0 1 0 1 1 \\n\",\n", + " \"3 0 1 1 0 0 0 0 0 \\n\",\n", + " \"4 0 0 0 0 0 1 0 0 \\n\",\n", + " \"5 0 0 1 1 0 1 0 0 \\n\"\n", + " ]\n", + " }\n", + " ],\n", + " \"source\": [\n", + " \"# =========================\\n\",\n", + " \"# 4. FEATURE EXTRACTION (BOW)\\n\",\n", + " \"# =========================\\n\",\n", + " \"vectorizer = CountVectorizer()\\n\",\n", + " \"X = vectorizer.fit_transform(documents)\\n\",\n", + " \"\\n\",\n", + " \"df_bow = pd.DataFrame(\\n\",\n", + " \" X.toarray(),\\n\",\n", + " \" columns=vectorizer.get_feature_names_out()\\n\",\n", + " \")\\n\",\n", + " \"\\n\",\n", + " \"print(\\\"=== Bag of Words ===\\\")\\n\",\n", + " \"print(df_bow)\"\n", + " ]\n", + " },\n", + " {\n", + " \"cell_type\": \"code\",\n", + " \"execution_count\": 15,\n", + " \"metadata\": {\n", + " \"colab\": {\n", + " \"base_uri\": \"https://localhost:8080/\",\n", + " \"height\": 80\n", + " },\n", + " \"id\": \"s6S-Ma4R1xuq\",\n", + " \"outputId\": \"f65359e3-bb87-42b4-fb37-9c035f88e7ad\"\n", + " },\n", + " \"outputs\": [\n", + " {\n", + " \"output_type\": \"execute_result\",\n", + " \"data\": {\n", + " \"text/plain\": [\n", + " \"MultinomialNB()\"\n", + " ],\n", + " \"text/html\": [\n", + " \"
MultinomialNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
MultinomialNB()