"""Rule-based information extraction for an Indonesian product-launch text.

The script cleans a short news snippet, pulls out dates, a location and an
organization with regular expressions, normalizes the dates to ISO 8601
(``YYYY-MM-DD``) and prints the result as a flat event template.
"""

# =========================
# NLP - Information Extraction (MODIFIED VERSION)
# =========================

import re
from datetime import datetime

try:
    # nltk is not used by any extraction step below; the name is kept for
    # later cells, but a missing install must not stop the extraction.
    import nltk  # noqa: F401
except ImportError:
    nltk = None

# =========================
# 1. SOURCE TEXT
# =========================
text = """
PT Maju Jaya resmi meluncurkan produk terbaru mereka pada 12 Agustus 2023
di Jakarta. Acara peluncuran tersebut dihadiri oleh CEO perusahaan dan
disaksikan oleh ratusan undangan.
"""

# =========================
# 2. PATTERNS AND LOOKUP TABLES
# =========================
# Every pattern is anchored with \b so that it cannot match in the middle of
# a longer token (e.g. "di" inside "terjadi", or "12" inside "2012").
date_patterns = [
    # Indonesian long form: "12 Agustus 2023"
    r"\b\d{1,2}\s(?:Januari|Februari|Maret|April|Mei|Juni|Juli|Agustus"
    r"|September|Oktober|November|Desember)\s\d{4}\b",
    # ISO form: "2023-08-12"
    r"\b\d{4}-\d{2}-\d{2}\b",
]

# Indonesian month name -> two-digit month number.
bulan_map = {
    "Januari": "01", "Februari": "02", "Maret": "03",
    "April": "04", "Mei": "05", "Juni": "06",
    "Juli": "07", "Agustus": "08", "September": "09",
    "Oktober": "10", "November": "11", "Desember": "12",
}

# The preposition "di" followed by one capitalized word.
location_pattern = r"\bdi\s([A-Z][a-zA-Z]+)"

# "PT" followed by exactly two capitalized words.
org_pattern = r"\bPT\s[A-Z][a-zA-Z]+\s[A-Z][a-zA-Z]+"

_ISO_DATE_RE = re.compile(r"\d{4}-\d{2}-\d{2}")


# =========================
# 3. PREPROCESSING
# =========================
def clean_text(text):
    """Return *text* stripped, with every whitespace run collapsed to one space."""
    text = text.strip()
    text = re.sub(r"\s+", " ", text)
    return text


# =========================
# 4. DATE EXTRACTION (MORE THAN ONE FORMAT)
# =========================
def extract_dates(text):
    """Return all date strings found in *text*, in order of appearance.

    Every pattern in ``date_patterns`` is tried; the matches are then sorted
    by their position so that the first element is the first date mentioned
    in the text, regardless of which pattern found it.
    """
    found = []
    for pattern in date_patterns:
        found.extend((m.start(), m.group(0)) for m in re.finditer(pattern, text))
    found.sort(key=lambda item: item[0])
    return [value for _, value in found]


# =========================
# 5. DATE NORMALIZATION
# =========================
def normalize_date(date_str):
    """Convert a date string to ISO 8601 (``YYYY-MM-DD``).

    Accepts the Indonesian long form (``"12 Agustus 2023"``) and strings that
    are already in ISO form (returned unchanged once validated).

    Returns:
        The ISO date string, or ``None`` when the input is not a string, has
        an unknown shape or month name, or is not a real calendar date
        (e.g. ``"31 Februari 2023"``).
    """
    if not isinstance(date_str, str):
        return None

    candidate = date_str.strip()
    if _ISO_DATE_RE.fullmatch(candidate):
        iso = candidate
    else:
        parts = candidate.split()
        if len(parts) != 3:
            return None
        day, month_name, year = parts
        month = bulan_map.get(month_name)
        if month is None or not day.isdigit() or not year.isdigit():
            return None
        iso = f"{year}-{month}-{day.zfill(2)}"

    # Reject well-formed but impossible dates instead of passing them on.
    try:
        datetime.strptime(iso, "%Y-%m-%d")
    except ValueError:
        return None
    return iso


# =========================
# 6. LOCATION EXTRACTION (RULE-BASED)
# =========================
def extract_location(text):
    """Return the first capitalized word following the word "di", or ``None``."""
    match = re.search(location_pattern, text)
    return match.group(1) if match else None


# =========================
# 7. ORGANIZATION EXTRACTION
# =========================
def extract_organizations(text):
    """Return every ``PT <Word> <Word>`` company name found in *text*."""
    return re.findall(org_pattern, text)


# =========================
# 8. EVENT TEMPLATE
# =========================
def build_event_template(raw_text):
    """Extract a product-launch event template from *raw_text*.

    The text is cleaned first, so raw and already-cleaned input both work.
    Fields that cannot be extracted are set to ``None``.
    """
    source = clean_text(raw_text)
    found_dates = extract_dates(source)
    iso_dates = [normalize_date(d) for d in found_dates]
    organizations = extract_organizations(source)
    return {
        "EventType": "Product Launch",
        "Organization": organizations[0] if organizations else None,
        "LaunchDateOriginal": found_dates[0] if found_dates else None,
        "LaunchDateISO": iso_dates[0] if iso_dates else None,
        "Location": extract_location(source),
        "SourceText": source,
    }


# =========================
# 9. RUN THE PIPELINE
# =========================
print("Teks sumber:")
print(text)

# The intermediate results stay available as module-level names.
text = clean_text(text)
dates = extract_dates(text)
normalized_dates = [normalize_date(d) for d in dates]
location = extract_location(text)
organization = extract_organizations(text)
event_template = build_event_template(text)

# =========================
# 10. OUTPUT
# =========================
print("\nHASIL INFORMATION EXTRACTION:")
for key, value in event_template.items():
    print(f"{key}: {value}")