From 1a603fff4f5839f4b9b7494dd5d0335a61b6c786 Mon Sep 17 00:00:00 2001 From: 202210715213 MONA DEWINTHA AGUSTINE <202210715213@mhs.ubharajaya.ac.id> Date: Thu, 22 Jan 2026 10:03:41 +0700 Subject: [PATCH] Upload files to "/" --- data_preparation_&_preprocessing.ipynb | 581 +++++++++++++++++++++++++ 1 file changed, 581 insertions(+) create mode 100644 data_preparation_&_preprocessing.ipynb diff --git a/data_preparation_&_preprocessing.ipynb b/data_preparation_&_preprocessing.ipynb new file mode 100644 index 0000000..28d0c8b --- /dev/null +++ b/data_preparation_&_preprocessing.ipynb @@ -0,0 +1,581 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "0a533e2e-9fee-4f5a-a4a2-2e3faa18d85a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: requests in c:\\users\\rosyad\\lib\\site-packages (2.32.3)Note: you may need to restart the kernel to use updated packages.\n", + "\n", + "Requirement already satisfied: pandas in c:\\users\\rosyad\\lib\\site-packages (2.2.3)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\rosyad\\lib\\site-packages (from requests) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\rosyad\\lib\\site-packages (from requests) (3.7)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\rosyad\\lib\\site-packages (from requests) (2.3.0)\n", + "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\rosyad\\lib\\site-packages (from requests) (2025.4.26)\n", + "Requirement already satisfied: numpy>=1.26.0 in c:\\users\\rosyad\\lib\\site-packages (from pandas) (2.1.3)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\rosyad\\lib\\site-packages (from pandas) (2.9.0.post0)\n", + "Requirement already satisfied: pytz>=2020.1 in c:\\users\\rosyad\\lib\\site-packages (from pandas) (2024.1)\n", + "Requirement already satisfied: tzdata>=2022.7 in 
c:\\users\\rosyad\\lib\\site-packages (from pandas) (2025.2)\n", + "Requirement already satisfied: six>=1.5 in c:\\users\\rosyad\\lib\\site-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n" + ] + } + ], + "source": [ + "%pip install requests pandas" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "3cbceaf7-0932-4f21-9cac-6fb2ecb6e37c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Politik: 50\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "import requests\n", + "import pandas as pd\n", + "\n", + "# Read the GNews token from the environment; never commit credentials.\n", + "API_KEY = os.environ[\"GNEWS_API_KEY\"]\n", + "url = \"https://gnews.io/api/v4/search\"\n", + "\n", + "rows = []\n", + "\n", + "for page in range(1, 6):\n", + "    params = {\n", + "        \"q\": \"pemerintah OR presiden OR DPR OR kebijakan\",\n", + "        \"lang\": \"id\",\n", + "        \"max\": 20,\n", + "        \"page\": page,\n", + "        \"token\": API_KEY\n", + "    }\n", + "\n", + "    r = requests.get(url, params=params, timeout=30)\n", + "    r.raise_for_status()  # surface auth/quota errors instead of a KeyError below\n", + "    data = r.json()\n", + "\n", + "    for article in data[\"articles\"]:\n", + "        # A missing or null title/description would break str concatenation.\n", + "        text = (article.get(\"title\") or \"\") + \" \" + (article.get(\"description\") or \"\")\n", + "        rows.append({\"text\": text, \"label\": \"Politik\"})\n", + "\n", + "df = pd.DataFrame(rows).head(100)\n", + "df.to_csv(\"berita_politik.csv\", index=False)\n", + "print(\"Politik:\", len(df))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "ecad8de9-3f98-4a51-b17b-8c0ca960bf4f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Olahraga: 12\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "import requests\n", + "import pandas as pd\n", + "\n", + "# Read the GNews token from the environment; never commit credentials.\n", + "API_KEY = os.environ[\"GNEWS_API_KEY\"]\n", + "url = \"https://gnews.io/api/v4/search\"\n", + "\n", + "rows = []\n", + "\n", + "for page in range(1, 6):\n", + "    params = {\n", + "        \"q\": \"sepak bola OR timnas OR liga indonesia OR olahraga\",\n", + "        \"lang\": \"id\",\n", + "        \"max\": 20,\n", + "        \"page\": page,\n", + "        \"token\": API_KEY\n", + "    }\n", + "\n", 
+ "    r = requests.get(url, params=params, timeout=30)\n", + "    r.raise_for_status()  # surface auth/quota errors instead of a KeyError below\n", + "    data = r.json()\n", + "\n", + "    for article in data[\"articles\"]:\n", + "        # A missing or null title/description would break str concatenation.\n", + "        text = (article.get(\"title\") or \"\") + \" \" + (article.get(\"description\") or \"\")\n", + "        rows.append({\"text\": text, \"label\": \"Olahraga\"})\n", + "\n", + "df = pd.DataFrame(rows).head(100)\n", + "df.to_csv(\"berita_olahraga.csv\", index=False)\n", + "print(\"Olahraga:\", len(df))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "a3746155-cbcd-4df1-aed7-06db697a4e17", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Teknologi: 50\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "import requests\n", + "import pandas as pd\n", + "\n", + "# Read the GNews token from the environment; never commit credentials.\n", + "API_KEY = os.environ[\"GNEWS_API_KEY\"]\n", + "url = \"https://gnews.io/api/v4/search\"\n", + "\n", + "rows = []\n", + "\n", + "for page in range(1, 6):\n", + "    params = {\n", + "        \"q\": \"teknologi OR startup OR aplikasi OR AI OR gadget\",\n", + "        \"lang\": \"id\",\n", + "        \"max\": 20,\n", + "        \"page\": page,\n", + "        \"token\": API_KEY\n", + "    }\n", + "\n", + "    r = requests.get(url, params=params, timeout=30)\n", + "    r.raise_for_status()  # surface auth/quota errors instead of a KeyError below\n", + "    data = r.json()\n", + "\n", + "    for article in data[\"articles\"]:\n", + "        # A missing or null title/description would break str concatenation.\n", + "        text = (article.get(\"title\") or \"\") + \" \" + (article.get(\"description\") or \"\")\n", + "        rows.append({\"text\": text, \"label\": \"Teknologi\"})\n", + "\n", + "df = pd.DataFrame(rows).head(100)\n", + "df.to_csv(\"berita_teknologi.csv\", index=False)\n", + "print(\"Teknologi:\", len(df))" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "c9c6de7e-1dbd-490a-aff4-9e647e1e62b8", + "metadata": {}, + "outputs": [], + "source": [ + "df_olahraga = pd.read_csv(\"berita_olahraga.csv\", encoding=\"latin1\")\n", + "df_olahraga.to_csv(\"berita_olahraga_utf8.csv\", index=False, encoding=\"utf-8\")" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "73658334-e371-4975-8f5f-e98f7b3da55d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "label\n", + "Politik 50\n", + 
"Olahraga 50\n", + "Teknologi 50\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# ignore_index gives the combined frame unique row labels.\n", + "df = pd.concat([\n", + "    pd.read_csv(\"berita_politik.csv\"),\n", + "    pd.read_csv(\"berita_olahraga_utf8.csv\"),\n", + "    pd.read_csv(\"berita_teknologi.csv\")\n", + "], ignore_index=True)\n", + "\n", + "df[\"label\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "56cd79d3-256f-4e89-be56-2d440969c7cf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "label\n", + "Politik 50\n", + "Olahraga 50\n", + "Teknologi 50\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"label\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "85fa5276-a690-4796-ac9c-f49583dcff19", + "metadata": {}, + "outputs": [], + "source": [ + "df.to_csv(\"dataset_berita_final.csv\", index=False, encoding=\"utf-8\")" + ] + }, + { + "cell_type": "markdown", + "id": "38aa3a7d-b0d1-4ec4-bbbe-f00393119a6e", + "metadata": {}, + "source": [ + "PREPROCESSING" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "b52a028e-24cd-4cb3-9e54-d9479981fb2c", + "metadata": {}, + "outputs": [], + "source": [ + "#IMPORT LIBRARY\n", + "import pandas as pd\n", + "import re" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "fb25cc8c-bf70-4181-85f9-686c8a48ca9e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textlabel
0Dewan Perdamaian Gaza Resmi Dibentuk Trump, Ba...Politik
1Puasa 2026 Tanggal Berapa? Ini Jadwal Versi Pe...Politik
2Ini Dia Presiden yang Menang Piplres hingga 7 ...Politik
3Solidaritas Antar Provinsi: Sumsel Kirim Semba...Politik
4Tinjau Lokasi Asap Tambang Pongkor, Adian Mint...Politik
\n", + "
" + ], + "text/plain": [ + " text label\n", + "0 Dewan Perdamaian Gaza Resmi Dibentuk Trump, Ba... Politik\n", + "1 Puasa 2026 Tanggal Berapa? Ini Jadwal Versi Pe... Politik\n", + "2 Ini Dia Presiden yang Menang Piplres hingga 7 ... Politik\n", + "3 Solidaritas Antar Provinsi: Sumsel Kirim Semba... Politik\n", + "4 Tinjau Lokasi Asap Tambang Pongkor, Adian Mint... Politik" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#LOAD DATASET\n", + "df = pd.read_csv(\"dataset_berita_final.csv\")\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "2600ea09-49fa-45d7-8309-f9f159653d70", + "metadata": {}, + "outputs": [], + "source": [ + "# PREPROCESSING FUNCTION\n", + "def preprocess_text(text):\n", + "    \"\"\"Lowercase text, drop URLs and non-letter characters, collapse whitespace.\n", + "\n", + "    Non-string input (e.g. NaN read from an empty CSV cell) yields \"\".\n", + "    \"\"\"\n", + "    if not isinstance(text, str):\n", + "        return \"\"\n", + "    text = text.lower()  # lowercase\n", + "    text = re.sub(r\"http\\S+\", \"\", text)  # remove URLs\n", + "    text = re.sub(r\"[^a-zA-Z\\s]\", \" \", text)  # replace digits and symbols with spaces\n", + "    text = re.sub(r\"\\s+\", \" \", text).strip()  # collapse repeated whitespace\n", + "    return text" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "01164d05-2aaa-4fa6-9664-a86c35bcc6d5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
textclean_text
0Dewan Perdamaian Gaza Resmi Dibentuk Trump, Ba...dewan perdamaian gaza resmi dibentuk trump bak...
1Puasa 2026 Tanggal Berapa? Ini Jadwal Versi Pe...puasa tanggal berapa ini jadwal versi pemerint...
2Ini Dia Presiden yang Menang Piplres hingga 7 ...ini dia presiden yang menang piplres hingga ka...
3Solidaritas Antar Provinsi: Sumsel Kirim Semba...solidaritas antar provinsi sumsel kirim sembak...
4Tinjau Lokasi Asap Tambang Pongkor, Adian Mint...tinjau lokasi asap tambang pongkor adian minta...
\n", + "
" + ], + "text/plain": [ + " text \\\n", + "0 Dewan Perdamaian Gaza Resmi Dibentuk Trump, Ba... \n", + "1 Puasa 2026 Tanggal Berapa? Ini Jadwal Versi Pe... \n", + "2 Ini Dia Presiden yang Menang Piplres hingga 7 ... \n", + "3 Solidaritas Antar Provinsi: Sumsel Kirim Semba... \n", + "4 Tinjau Lokasi Asap Tambang Pongkor, Adian Mint... \n", + "\n", + " clean_text \n", + "0 dewan perdamaian gaza resmi dibentuk trump bak... \n", + "1 puasa tanggal berapa ini jadwal versi pemerint... \n", + "2 ini dia presiden yang menang piplres hingga ka... \n", + "3 solidaritas antar provinsi sumsel kirim sembak... \n", + "4 tinjau lokasi asap tambang pongkor adian minta... " + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# APPLY TO DATASET\n", + "df[\"clean_text\"] = df[\"text\"].apply(preprocess_text)\n", + "df[[\"text\", \"clean_text\"]].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "9c9b6282-2b86-4a80-b449-4e7d5c99338f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: Sastrawi in c:\\users\\rosyad\\lib\\site-packages (1.0.1)Note: you may need to restart the kernel to use updated packages.\n", + "\n" + ] + } + ], + "source": [ + "# INSTALL SASTRAWI\n", + "%pip install Sastrawi" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "ce7659a9-c512-470b-a3d5-8ccbd0cf985e", + "metadata": {}, + "outputs": [], + "source": [ + "#IMPORT & STOPWORD\n", + "from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory\n", + "\n", + "factory = StopWordRemoverFactory()\n", + "stopword_remover = factory.create_stop_word_remover()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "30ae25a0-8d86-4052-879c-a57fa466741f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 dewan perdamaian gaza resmi dibentuk trump bak...\n", + "1 puasa tanggal 
berapa jadwal versi pemerintah n...\n", + "2 dia presiden menang piplres hingga kali yoweri...\n", + "3 solidaritas antar provinsi sumsel kirim sembak...\n", + "4 tinjau lokasi asap tambang pongkor adian minta...\n", + "Name: clean_text, dtype: object" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# APPLY STOPWORD REMOVAL\n", + "df[\"clean_text\"] = df[\"clean_text\"].apply(stopword_remover.remove)\n", + "df[\"clean_text\"].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "c50305b0-5dce-4d86-ad36-76e44d002c79", + "metadata": {}, + "outputs": [], + "source": [ + "#IMPORT STEMMER\n", + "from Sastrawi.Stemmer.StemmerFactory import StemmerFactory\n", + "\n", + "stemmer = StemmerFactory().create_stemmer()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "fb5ddd48-6615-4cfa-9785-6a377d4325c6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 dewan damai gaza resmi bentuk trump bakal jadi...\n", + "1 puasa tanggal berapa jadwal versi perintah nu ...\n", + "2 dia presiden menang piplres hingga kali yoweri...\n", + "3 solidaritas antar provinsi sumsel kirim sembak...\n", + "4 tinjau lokasi asap tambang pongkor adi minta a...\n", + "Name: clean_text, dtype: object" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# APPLY STEMMING\n", + "df[\"clean_text\"] = df[\"clean_text\"].apply(stemmer.stem)\n", + "df[\"clean_text\"].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "dbb677b1-3126-4389-8a49-10df9cae83b5", + "metadata": {}, + "outputs": [], + "source": [ + "df.to_csv(\"dataset_berita_preprocessed.csv\", index=False, encoding=\"utf-8\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + 
"version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}