# =========================
# NLP - Information Extraction (rule-based)
# =========================
# Extracts a "product launch" event (organization, date, location) from a short
# Indonesian news snippet using regular expressions only.
#
# NOTE(review): this code was committed as the raw JSON of a notebook pasted
# into the single code cell of another notebook, so the cell held JSON text
# rather than Python. The statements below are the runnable cell content.

import re
from datetime import datetime
from typing import List, Optional

try:
    # Nothing in this script uses nltk. The import is kept for compatibility
    # but made optional so a fresh kernel without nltk can still run the cell.
    import nltk  # noqa: F401
except ImportError:
    nltk = None

# =========================
# 1. SOURCE TEXT
# =========================
raw_text = """
PT Abadi resmi meluncurkan produk terbaru mereka pada 12 Agustus 2026
di Jakarta. Acara peluncuran tersebut dihadiri oleh CEO perusahaan dan
disaksikan oleh ratusan undangan.
"""

print("Teks sumber:")
print(raw_text)


# =========================
# 2. PREPROCESSING
# =========================
def clean_text(text: str) -> str:
    """Trim *text* and collapse every run of whitespace into one space.

    Args:
        text: Arbitrary input string (may contain newlines and tabs).

    Returns:
        The text on a single line, with no leading or trailing whitespace.
    """
    return re.sub(r"\s+", " ", text.strip())


# `text` is the cleaned, single-line version used by every extraction step.
text = clean_text(raw_text)

# =========================
# 3. DATE EXTRACTION (MORE THAN ONE FORMAT)
# =========================
date_patterns = [
    # "12 Agustus 2026" - day, Indonesian month name, four-digit year.
    r"\d{1,2}\s(?:Januari|Februari|Maret|April|Mei|Juni|Juli|Agustus|September|Oktober|November|Desember)\s\d{4}",
    # "2026-08-12" - ISO 8601 calendar date.
    r"\d{4}-\d{2}-\d{2}",
]


def extract_dates(text: str, patterns: Optional[List[str]] = None) -> List[str]:
    """Return every date-like substring of *text* in order of appearance.

    Matches from all patterns are merged and sorted by their start offset, so
    the first element is the first date mentioned in the text regardless of
    which pattern recognised it.

    Args:
        text: Text to scan.
        patterns: Regular expressions to try; defaults to ``date_patterns``.

    Returns:
        Matched substrings, earliest first. Empty list when nothing matches.
    """
    if patterns is None:
        patterns = date_patterns
    hits = []
    for pattern in patterns:
        hits.extend((m.start(), m.group()) for m in re.finditer(pattern, text))
    hits.sort(key=lambda hit: hit[0])
    return [matched for _, matched in hits]


dates = extract_dates(text)

# =========================
# 4. DATE NORMALISATION
# =========================
bulan_map = {
    "Januari": "01", "Februari": "02", "Maret": "03",
    "April": "04", "Mei": "05", "Juni": "06",
    "Juli": "07", "Agustus": "08", "September": "09",
    "Oktober": "10", "November": "11", "Desember": "12",
}


def normalize_date(date_str: str) -> Optional[str]:
    """Convert an extracted date string to ``YYYY-MM-DD``.

    Accepts both formats produced by ``date_patterns``: ``"12 Agustus 2026"``
    (Indonesian month name) and an already-ISO ``"2026-08-12"``.

    Args:
        date_str: Date string to normalise.

    Returns:
        The normalised date, or ``None`` when the input is not a string, is
        not in a recognised format, or is not a real calendar date
        (e.g. ``"31 Februari 2026"``).
    """
    if not isinstance(date_str, str):
        return None

    candidate = date_str.strip()
    if re.fullmatch(r"\d{4}-\d{2}-\d{2}", candidate):
        year, month, day = candidate.split("-")
    else:
        parts = candidate.split()
        if len(parts) < 3 or parts[1] not in bulan_map:
            return None
        day, month, year = parts[0].zfill(2), bulan_map[parts[1]], parts[2]

    try:
        # Constructing a datetime rejects impossible dates and non-numeric parts.
        datetime(int(year), int(month), int(day))
    except ValueError:
        return None
    return f"{year}-{month}-{day}"


normalized_dates = [normalize_date(d) for d in dates]

# =========================
# 5. LOCATION EXTRACTION (RULE-BASED)
# =========================
# The leading \b ensures "di" is the standalone preposition and not the tail of
# a word such as "Abadi" (which would turn "Abadi Jaya" into Location "Jaya").
location_pattern = r"\bdi\s([A-Z][a-zA-Z]+)"
location_match = re.search(location_pattern, text)
location = location_match.group(1) if location_match else None

# =========================
# 6. ORGANISATION EXTRACTION
# =========================
# "PT" followed by one or more capitalised words, so both "PT Abadi" and
# "PT Maju Jaya" are recognised.
org_pattern = r"\bPT\s[A-Z][a-zA-Z]+(?:\s[A-Z][a-zA-Z]+)*"
organization = re.findall(org_pattern, text)

# =========================
# 7. EVENT TEMPLATE
# =========================
event_template = {
    "EventType": "Product Launch",
    "Organization": organization[0] if organization else None,
    "LaunchDateOriginal": dates[0] if dates else None,
    "LaunchDateISO": normalized_dates[0] if normalized_dates else None,
    "Location": location,
    "SourceText": text,
}

# =========================
# 8. OUTPUT
# =========================
print("\nHASIL INFORMATION EXTRACTION:")
for key, value in event_template.items():
    print(f"{key}: {value}")