{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "JVPdWpz3hhbj" }, "source": [ "\n" ] }, { "cell_type": "code", "source": [ "# =========================\n", "# 1. IMPORT LIBRARY\n", "# =========================\n", "import re\n", "import pandas as pd\n", "from sklearn.feature_extraction.text import CountVectorizer" ], "metadata": { "id": "e4-gyAeqOK31" }, "execution_count": 4, "outputs": [] }, { "cell_type": "code", "source": [ "# =========================\n", "# 2. DATA TEKS MANUAL\n", "# =========================\n", "texts = [\n", " \"saya suka belajar data science\",\n", " \"machine learning adalah bagian dari data science\",\n", " \"belajar python sangat menyenangkan\"\n", "]" ], "metadata": { "id": "TGtk0NFNOPXo" }, "execution_count": 5, "outputs": [] }, { "cell_type": "code", "source": [ "# =========================\n", "# 3. PREPROCESSING (MODIFIKASI)\n", "# =========================\n", "def clean_text(text):\n", " text = text.lower()\n", " text = re.sub(r\"[^a-z\\s]\", \"\", text)\n", " text = re.sub(r\"\\s+\", \" \", text).strip()\n", " return text\n", "\n", "texts_cleaned = [clean_text(t) for t in texts]" ], "metadata": { "id": "gtVIwpAaOTFq" }, "execution_count": 6, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "4Mvva3v65h1v" }, "source": [ "# **UNIGRAM**" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "1cub_VJnUJMl", "outputId": "86744608-6288-4962-da15-bc77f1186ac2" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "=== UNIGRAM ===\n", " adalah bagian belajar dari data learning machine menyenangkan \\\n", "0 0 0 1 0 1 0 0 0 \n", "1 1 1 0 1 1 1 1 0 \n", "2 0 0 1 0 0 0 0 1 \n", "\n", " python sangat saya science suka \n", "0 0 0 1 1 1 \n", "1 0 0 0 1 0 \n", "2 1 1 0 0 0 \n" ] } ], "source": [ "unigram_vectorizer = CountVectorizer(ngram_range=(1,1))\n", "X_uni = unigram_vectorizer.fit_transform(texts_cleaned)\n", "\n", "df_unigram = pd.DataFrame(\n", " X_uni.toarray(),\n", " columns=unigram_vectorizer.get_feature_names_out()\n", ")\n", "\n", "print(\"=== UNIGRAM ===\")\n", "print(df_unigram)" ] }, { "cell_type": "markdown", "metadata": { "id": "Vstwt996-FrS" }, "source": [ "# **BIGRAM**" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "XRIY4qgTVbjl", "outputId": "75895ab7-8b5e-4113-e9f8-a613858a109e" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "=== BIGRAM ===\n", " adalah bagian bagian dari belajar data belajar python dari data \\\n", "0 0 0 1 0 0 \n", "1 1 1 0 0 1 \n", "2 0 0 0 1 0 \n", "\n", " data science learning adalah machine learning python sangat \\\n", "0 1 0 0 0 \n", "1 1 1 1 0 \n", "2 0 0 0 1 \n", "\n", " sangat menyenangkan saya suka suka belajar \n", "0 0 1 1 \n", "1 0 0 0 \n", "2 1 0 0 \n" ] } ], "source": [ "bigram_vectorizer = CountVectorizer(ngram_range=(2,2))\n", "X_bi = bigram_vectorizer.fit_transform(texts_cleaned)\n", "\n", "df_bigram = pd.DataFrame(\n", " X_bi.toarray(),\n", " columns=bigram_vectorizer.get_feature_names_out()\n", ")\n", "\n", "print(\"\\n=== BIGRAM ===\")\n", "print(df_bigram)" ] }, { "cell_type": "markdown", "metadata": { "id": "E6n1IU8X-G9S" }, "source": [ "# **TRIGRAM**" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "BIRARsj2FHJg", "outputId": "141e0aeb-23a0-4996-84e1-36477888587f" }, "outputs": [ { "output_type": "stream", "name": 
"stdout", "text": [ "\n", "=== UNIGRAM + BIGRAM ===\n", " adalah adalah bagian bagian bagian dari belajar belajar data \\\n", "0 0 0 0 0 1 1 \n", "1 1 1 1 1 0 0 \n", "2 0 0 0 0 1 0 \n", "\n", " belajar python dari dari data data ... menyenangkan python \\\n", "0 0 0 0 1 ... 0 0 \n", "1 0 1 1 1 ... 0 0 \n", "2 1 0 0 0 ... 1 1 \n", "\n", " python sangat sangat sangat menyenangkan saya saya suka science suka \\\n", "0 0 0 0 1 1 1 1 \n", "1 0 0 0 0 0 1 0 \n", "2 1 1 1 0 0 0 0 \n", "\n", " suka belajar \n", "0 1 \n", "1 0 \n", "2 0 \n", "\n", "[3 rows x 25 columns]\n" ] } ], "source": [ "combined_vectorizer = CountVectorizer(ngram_range=(1,2))\n", "X_combined = combined_vectorizer.fit_transform(texts_cleaned)\n", "\n", "df_combined = pd.DataFrame(\n", " X_combined.toarray(),\n", " columns=combined_vectorizer.get_feature_names_out()\n", ")\n", "\n", "print(\"\\n=== UNIGRAM + BIGRAM ===\")\n", "print(df_combined)" ] } ], "metadata": { "colab": { "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" } }, "nbformat": 4, "nbformat_minor": 0 }