"""Rule-based information extraction for an Indonesian product-launch text.

The script pulls four facts out of a short news-style paragraph using
regular expressions only:

* the launching organization (a name introduced by ``PT``),
* the launch date, both as written and normalized to ISO ``YYYY-MM-DD``,
* the location (a capitalised word following the preposition ``di``),
* the cleaned source text itself.

The results are collected in ``event_template`` and printed.
"""

import re
from datetime import datetime

# =========================
# 1. SOURCE TEXT
# =========================
text = """
PT Tempo Grub resmi meluncurkan produk terbaru mereka pada 28 maret 2026
di Bandung. Acara peluncuran tersebut dihadiri oleh CEO perusahaan dan
disaksikan oleh ratusan undangan.
"""

# =========================
# 2. PATTERNS AND LOOKUP TABLES
# =========================
# Indonesian month name -> two-digit month number.
bulan_map = {
    "Januari": "01", "Februari": "02", "Maret": "03",
    "April": "04", "Mei": "05", "Juni": "06",
    "Juli": "07", "Agustus": "08", "September": "09",
    "Oktober": "10", "November": "11", "Desember": "12",
}

# Built from bulan_map so the regex and the lookup table cannot drift apart.
_MONTH_ALTERNATION = "|".join(bulan_map)

# Tried in order; every pattern is applied case-insensitively because
# running text frequently writes month names in lower case ("28 maret 2026").
date_patterns = [
    # "<day> <Indonesian month name> <year>": 1-2 digit day, 4 digit year.
    rf"\b\d{{1,2}}\s+(?:{_MONTH_ALTERNATION})\s+\d{{4}}\b",
    # ISO "YYYY-MM-DD".
    r"\b\d{4}-\d{2}-\d{2}\b",
]

# The word boundary keeps "di" from matching inside words such as "menjadi".
location_pattern = r"\bdi\s+([A-Z][a-zA-Z]+)"

# "PT" followed by one or more capitalised words; the boundary keeps the
# pattern from firing inside abbreviations that merely end in "PT".
org_pattern = r"\bPT(?:\s+[A-Z][a-zA-Z]+)+"

_ISO_DATE = re.compile(r"(\d{4})-(\d{2})-(\d{2})")
_INDONESIAN_DATE = re.compile(r"(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})")


# =========================
# 3. HELPERS
# =========================
def clean_text(text):
    """Return *text* stripped, with every whitespace run collapsed to one space."""
    text = text.strip()
    text = re.sub(r"\s+", " ", text)
    return text


def normalize_date(date_str):
    """Convert an extracted date string to ISO ``YYYY-MM-DD``.

    Accepts ``"<day> <Indonesian month> <year>"`` (month name in any letter
    case) as well as strings that are already ISO formatted.

    Returns:
        The ISO date string, or ``None`` when *date_str* is not a string,
        has an unrecognised shape or month name, or is not a real calendar
        date (for example ``"31 Februari 2026"``).
    """
    if not isinstance(date_str, str):
        return None
    candidate = date_str.strip()

    iso_match = _ISO_DATE.fullmatch(candidate)
    if iso_match:
        year, month, day = (int(group) for group in iso_match.groups())
    else:
        local_match = _INDONESIAN_DATE.fullmatch(candidate)
        if local_match is None:
            return None
        day_text, month_name, year_text = local_match.groups()
        # bulan_map keys are capitalised, so normalise the case before lookup.
        month_code = bulan_map.get(month_name.capitalize())
        if month_code is None:
            return None
        year, month, day = int(year_text), int(month_code), int(day_text)

    try:
        # datetime() rejects impossible dates (day 31 in a 30-day month, ...).
        return datetime(year, month, day).date().isoformat()
    except ValueError:
        return None


def extract_dates(text):
    """Return every date found in *text*, grouped by ``date_patterns`` order."""
    found = []
    for pattern in date_patterns:
        found.extend(re.findall(pattern, text, flags=re.IGNORECASE))
    return found


def extract_location(text):
    """Return the first capitalised word following ``di``, or ``None``."""
    match = re.search(location_pattern, text)
    return match.group(1) if match else None


def extract_organizations(text):
    """Return all ``PT <Name ...>`` organization names found in *text*."""
    return re.findall(org_pattern, text)


# =========================
# 4. PIPELINE
# =========================
print("Teks sumber:")
print(text)

text = clean_text(text)

dates = extract_dates(text)
# Kept index-aligned with ``dates``; entries that cannot be normalized are None.
normalized_dates = [normalize_date(d) for d in dates]
location = extract_location(text)
organization = extract_organizations(text)

event_template = {
    "EventType": "Product Launch",
    "Organization": organization[0] if organization else None,
    "LaunchDateOriginal": dates[0] if dates else None,
    "LaunchDateISO": normalized_dates[0] if normalized_dates else None,
    "Location": location,
    "SourceText": text,
}

# =========================
# 5. OUTPUT
# =========================
print("\nHASIL INFORMATION EXTRACTION:")
for key, value in event_template.items():
    print(f"{key}: {value}")