diff --git a/N_Gram.ipynb b/N_Gram.ipynb new file mode 100644 index 0000000..9910e37 --- /dev/null +++ b/N_Gram.ipynb @@ -0,0 +1,260 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "JVPdWpz3hhbj" + }, + "source": [ + "\n" + ] + }, + { + "cell_type": "code", + "source": [ + "# =========================\n", + "# 1. IMPORT LIBRARY\n", + "# =========================\n", + "import re\n", + "import pandas as pd\n", + "from sklearn.feature_extraction.text import CountVectorizer" + ], + "metadata": { + "id": "e4-gyAeqOK31" + }, + "execution_count": 4, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# =========================\n", + "# 2. DATA TEKS MANUAL\n", + "# =========================\n", + "texts = [\n", + " \"saya suka belajar data science\",\n", + " \"machine learning adalah bagian dari data science\",\n", + " \"belajar python sangat menyenangkan\"\n", + "]" + ], + "metadata": { + "id": "TGtk0NFNOPXo" + }, + "execution_count": 5, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# =========================\n", + "# 3. 
PREPROCESSING (MODIFIKASI)\n", + "# =========================\n", + "def clean_text(text):\n", + " text = text.lower()\n", + " text = re.sub(r\"[^a-z\\s]\", \"\", text)\n", + " text = re.sub(r\"\\s+\", \" \", text).strip()\n", + " return text\n", + "\n", + "texts_cleaned = [clean_text(t) for t in texts]" + ], + "metadata": { + "id": "gtVIwpAaOTFq" + }, + "execution_count": 6, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4Mvva3v65h1v" + }, + "source": [ + "# **UNIGRAM**" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1cub_VJnUJMl", + "outputId": "86744608-6288-4962-da15-bc77f1186ac2" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "=== UNIGRAM ===\n", + " adalah bagian belajar dari data learning machine menyenangkan \\\n", + "0 0 0 1 0 1 0 0 0 \n", + "1 1 1 0 1 1 1 1 0 \n", + "2 0 0 1 0 0 0 0 1 \n", + "\n", + " python sangat saya science suka \n", + "0 0 0 1 1 1 \n", + "1 0 0 0 1 0 \n", + "2 1 1 0 0 0 \n" + ] + } + ], + "source": [ + "unigram_vectorizer = CountVectorizer(ngram_range=(1,1))\n", + "X_uni = unigram_vectorizer.fit_transform(texts_cleaned)\n", + "\n", + "df_unigram = pd.DataFrame(\n", + " X_uni.toarray(),\n", + " columns=unigram_vectorizer.get_feature_names_out()\n", + ")\n", + "\n", + "print(\"=== UNIGRAM ===\")\n", + "print(df_unigram)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Vstwt996-FrS" + }, + "source": [ + "# **BIGRAM**" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "XRIY4qgTVbjl", + "outputId": "75895ab7-8b5e-4113-e9f8-a613858a109e" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "=== BIGRAM ===\n", + " adalah bagian bagian dari belajar data belajar python dari data \\\n", + "0 0 0 1 0 0 \n", + "1 1 1 0 0 1 \n", + "2 0 0 0 
1 0 \n", + "\n", + " data science learning adalah machine learning python sangat \\\n", + "0 1 0 0 0 \n", + "1 1 1 1 0 \n", + "2 0 0 0 1 \n", + "\n", + " sangat menyenangkan saya suka suka belajar \n", + "0 0 1 1 \n", + "1 0 0 0 \n", + "2 1 0 0 \n" + ] + } + ], + "source": [ + "bigram_vectorizer = CountVectorizer(ngram_range=(2,2))\n", + "X_bi = bigram_vectorizer.fit_transform(texts_cleaned)\n", + "\n", + "df_bigram = pd.DataFrame(\n", + " X_bi.toarray(),\n", + " columns=bigram_vectorizer.get_feature_names_out()\n", + ")\n", + "\n", + "print(\"\\n=== BIGRAM ===\")\n", + "print(df_bigram)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "E6n1IU8X-G9S" + }, + "source": [ + "# **UNIGRAM + BIGRAM**" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "BIRARsj2FHJg", + "outputId": "141e0aeb-23a0-4996-84e1-36477888587f" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "=== UNIGRAM + BIGRAM ===\n", + " adalah adalah bagian bagian bagian dari belajar belajar data \\\n", + "0 0 0 0 0 1 1 \n", + "1 1 1 1 1 0 0 \n", + "2 0 0 0 0 1 0 \n", + "\n", + " belajar python dari dari data data ... menyenangkan python \\\n", + "0 0 0 0 1 ... 0 0 \n", + "1 0 1 1 1 ... 0 0 \n", + "2 1 0 0 0 ... 
1 1 \n", + "\n", + " python sangat sangat sangat menyenangkan saya saya suka science suka \\\n", + "0 0 0 0 1 1 1 1 \n", + "1 0 0 0 0 0 1 0 \n", + "2 1 1 1 0 0 0 0 \n", + "\n", + " suka belajar \n", + "0 1 \n", + "1 0 \n", + "2 0 \n", + "\n", + "[3 rows x 25 columns]\n" + ] + } + ], + "source": [ + "combined_vectorizer = CountVectorizer(ngram_range=(1,2))\n", + "X_combined = combined_vectorizer.fit_transform(texts_cleaned)\n", + "\n", + "df_combined = pd.DataFrame(\n", + " X_combined.toarray(),\n", + " columns=combined_vectorizer.get_feature_names_out()\n", + ")\n", + "\n", + "print(\"\\n=== UNIGRAM + BIGRAM ===\")\n", + "print(df_combined)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file