Upload files to "/"

2026-01-20 23:26:56 +07:00 · 2026-01-20 23:26:56 +07:00 · b5a069dd7f
commit b5a069dd7f
parent 5533fc2ceb
1 changed files with 260 additions and 0 deletions
--- a/N_Gram.ipynb
+++ b/N_Gram.ipynb
@ -0,0 +1,260 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "JVPdWpz3hhbj"
+      },
+      "source": [
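+        "# N-Gram Feature Extraction\n",
+        "\n",
+        "Builds unigram, bigram, and combined unigram + bigram count matrices from a small Indonesian text sample using scikit-learn's `CountVectorizer`."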
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# =========================\n",
+        "# 1. IMPORT LIBRARY\n",
+        "# =========================\n",
+        "import re\n",
+        "import pandas as pd\n",
+        "from sklearn.feature_extraction.text import CountVectorizer"
+      ],
+      "metadata": {
+        "id": "e4-gyAeqOK31"
+      },
+      "execution_count": 4,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# =========================\n",
+        "# 2. DATA TEKS MANUAL\n",
+        "# =========================\n",
+        "texts = [\n",
+        "    \"saya suka belajar data science\",\n",
+        "    \"machine learning adalah bagian dari data science\",\n",
+        "    \"belajar python sangat menyenangkan\"\n",
+        "]"
+      ],
+      "metadata": {
+        "id": "TGtk0NFNOPXo"
+      },
+      "execution_count": 5,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# =========================\n",
+        "# 3. PREPROCESSING (MODIFIKASI)\n",
+        "# =========================\n",
+        "def clean_text(text):\n",
+        "    text = text.lower()\n",
+        "    text = re.sub(r\"[^a-z\\s]\", \"\", text)\n",
+        "    text = re.sub(r\"\\s+\", \" \", text).strip()\n",
+        "    return text\n",
+        "\n",
+        "# Clean every document; `texts` itself is left unchanged.\n",
+        "texts_cleaned = list(map(clean_text, texts))"
+      ],
+      "metadata": {
+        "id": "gtVIwpAaOTFq"
+      },
+      "execution_count": 6,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "4Mvva3v65h1v"
+      },
+      "source": [
+        "# **UNIGRAM**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 7,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "1cub_VJnUJMl",
+        "outputId": "86744608-6288-4962-da15-bc77f1186ac2"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "=== UNIGRAM ===\n",
+            "   adalah  bagian  belajar  dari  data  learning  machine  menyenangkan  \\\n",
+            "0       0       0        1     0     1         0        0             0   \n",
+            "1       1       1        0     1     1         1        1             0   \n",
+            "2       0       0        1     0     0         0        0             1   \n",
+            "\n",
+            "   python  sangat  saya  science  suka  \n",
+            "0       0       0     1        1     1  \n",
+            "1       0       0     0        1     0  \n",
+            "2       1       1     0        0     0  \n"
+          ]
+        }
+      ],
+      "source": [
+        "# ngram_range=(1, 1): count single tokens (unigrams) only.\n",
+        "unigram_vectorizer = CountVectorizer(ngram_range=(1, 1))\n",
+        "X_uni = unigram_vectorizer.fit_transform(texts_cleaned)\n",
+        "\n",
+        "# One row per document, one column per learned unigram.\n",
+        "unigram_terms = unigram_vectorizer.get_feature_names_out()\n",
+        "df_unigram = pd.DataFrame(X_uni.toarray(), columns=unigram_terms)\n",
+        "\n",
+        "print(\"=== UNIGRAM ===\")\n",
+        "print(df_unigram)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Vstwt996-FrS"
+      },
+      "source": [
+        "# **BIGRAM**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 8,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "XRIY4qgTVbjl",
+        "outputId": "75895ab7-8b5e-4113-e9f8-a613858a109e"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "\n",
+            "=== BIGRAM ===\n",
+            "   adalah bagian  bagian dari  belajar data  belajar python  dari data  \\\n",
+            "0              0            0             1               0          0   \n",
+            "1              1            1             0               0          1   \n",
+            "2              0            0             0               1          0   \n",
+            "\n",
+            "   data science  learning adalah  machine learning  python sangat  \\\n",
+            "0             1                0                 0              0   \n",
+            "1             1                1                 1              0   \n",
+            "2             0                0                 0              1   \n",
+            "\n",
+            "   sangat menyenangkan  saya suka  suka belajar  \n",
+            "0                    0          1             1  \n",
+            "1                    0          0             0  \n",
+            "2                    1          0             0  \n"
+          ]
+        }
+      ],
+      "source": [
+        "# ngram_range=(2, 2): count pairs of adjacent tokens (bigrams) only.\n",
+        "bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))\n",
+        "X_bi = bigram_vectorizer.fit_transform(texts_cleaned)\n",
+        "\n",
+        "# One row per document, one column per learned bigram.\n",
+        "bigram_terms = bigram_vectorizer.get_feature_names_out()\n",
+        "df_bigram = pd.DataFrame(X_bi.toarray(), columns=bigram_terms)\n",
+        "\n",
+        "print(\"\\n=== BIGRAM ===\")\n",
+        "print(df_bigram)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "E6n1IU8X-G9S"
+      },
+      "source": [
+        "# **UNIGRAM + BIGRAM**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 9,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "BIRARsj2FHJg",
+        "outputId": "141e0aeb-23a0-4996-84e1-36477888587f"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "\n",
+            "=== UNIGRAM + BIGRAM ===\n",
+            "   adalah  adalah bagian  bagian  bagian dari  belajar  belajar data  \\\n",
+            "0       0              0       0            0        1             1   \n",
+            "1       1              1       1            1        0             0   \n",
+            "2       0              0       0            0        1             0   \n",
+            "\n",
+            "   belajar python  dari  dari data  data  ...  menyenangkan  python  \\\n",
+            "0               0     0          0     1  ...             0       0   \n",
+            "1               0     1          1     1  ...             0       0   \n",
+            "2               1     0          0     0  ...             1       1   \n",
+            "\n",
+            "   python sangat  sangat  sangat menyenangkan  saya  saya suka  science  suka  \\\n",
+            "0              0       0                    0     1          1        1     1   \n",
+            "1              0       0                    0     0          0        1     0   \n",
+            "2              1       1                    1     0          0        0     0   \n",
+            "\n",
+            "   suka belajar  \n",
+            "0             1  \n",
+            "1             0  \n",
+            "2             0  \n",
+            "\n",
+            "[3 rows x 25 columns]\n"
+          ]
+        }
+      ],
+      "source": [
+        "# ngram_range=(1, 2): unigrams and bigrams share one vocabulary.\n",
+        "# No trigrams are extracted here; that would need an upper bound of 3.\n",
+        "combined_vectorizer = CountVectorizer(ngram_range=(1, 2))\n",
+        "X_combined = combined_vectorizer.fit_transform(texts_cleaned)\n",
+        "\n",
+        "# One row per document, one column per learned unigram or bigram.\n",
+        "combined_terms = combined_vectorizer.get_feature_names_out()\n",
+        "df_combined = pd.DataFrame(X_combined.toarray(), columns=combined_terms)\n",
+        "\n",
+        "print(\"\\n=== UNIGRAM + BIGRAM ===\")\n",
+        "print(df_combined)"
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3 (ipykernel)",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.12.2"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}