Repositori-NLP/information_extraxtion.ipynb

195 lines
9.0 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "8kSq7ukiTzaw"
},
"outputs": [],
"source": [
"{\n",
" \"cells\": [\n",
" {\n",
" \"cell_type\": \"code\",\n",
" \"execution_count\": 6,\n",
" \"id\": \"7c7601d6-3c91-453e-8c29-706528237596\",\n",
" \"metadata\": {\n",
" \"scrolled\": true,\n",
" \"colab\": {\n",
" \"base_uri\": \"https://localhost:8080/\"\n",
" },\n",
" \"id\": \"7c7601d6-3c91-453e-8c29-706528237596\",\n",
" \"outputId\": \"df473be6-c537-431b-8bc9-66b1ca1d64b1\"\n",
" },\n",
" \"outputs\": [\n",
" {\n",
" \"output_type\": \"stream\",\n",
" \"name\": \"stdout\",\n",
" \"text\": [\n",
" \"Teks sumber:\\n\",\n",
" \"\\n\",\n",
" \"PT Abadi resmi meluncurkan produk terbaru mereka pada 12 Agustus 2026\\n\",\n",
" \"di Jakarta. Acara peluncuran tersebut dihadiri oleh CEO perusahaan dan\\n\",\n",
" \"disaksikan oleh ratusan undangan.\\n\",\n",
" \"\\n\",\n",
" \"\\n\",\n",
" \"HASIL INFORMATION EXTRACTION:\\n\",\n",
" \"EventType: Product Launch\\n\",\n",
" \"Organization: PT Abadi\\n\",\n",
" \"LaunchDateOriginal: 12 Agustus 2026\\n\",\n",
" \"LaunchDateISO: 2026-08-12\\n\",\n",
" \"Location: Jakarta\\n\",\n",
" \"SourceText: PT Abadi resmi meluncurkan produk terbaru mereka pada 12 Agustus 2026 di Jakarta. Acara peluncuran tersebut dihadiri oleh CEO perusahaan dan disaksikan oleh ratusan undangan.\\n\"\n",
" ]\n",
" }\n",
" ],\n",
" \"source\": [\n",
" \"# =========================\\n\",\n",
" \"# NLP Information Extraction (MODIFIED VERSION)\\n\",\n",
" \"# =========================\\n\",\n",
" \"\\n\",\n",
" \"import re\\n\",\n",
" \"import nltk\\n\",\n",
" \"from datetime import datetime\\n\",\n",
" \"\\n\",\n",
" \"# =========================\\n\",\n",
" \"# 1. DATA TEKS (DIMODIFIKASI)\\n\",\n",
" \"# =========================\\n\",\n",
" \"text = \\\"\\\"\\\"\\n\",\n",
" \"PT Abadi resmi meluncurkan produk terbaru mereka pada 12 Agustus 2026\\n\",\n",
" \"di Jakarta. Acara peluncuran tersebut dihadiri oleh CEO perusahaan dan\\n\",\n",
" \"disaksikan oleh ratusan undangan.\\n\",\n",
" \"\\\"\\\"\\\"\\n\",\n",
" \"\\n\",\n",
" \"print(\\\"Teks sumber:\\\")\\n\",\n",
" \"print(text)\\n\",\n",
" \"\\n\",\n",
" \"# =========================\\n\",\n",
" \"# 2. PREPROCESSING TAMBAHAN\\n\",\n",
" \"# =========================\\n\",\n",
" \"def clean_text(text):\\n\",\n",
" \" text = text.strip()\\n\",\n",
" \" text = re.sub(r\\\"\\\\s+\\\", \\\" \\\", text)\\n\",\n",
" \" return text\\n\",\n",
" \"\\n\",\n",
" \"text = clean_text(text)\\n\",\n",
" \"\\n\",\n",
" \"# =========================\\n\",\n",
" \"# 3. EKSTRAKSI TANGGAL (LEBIH DARI 1 FORMAT)\\n\",\n",
" \"# =========================\\n\",\n",
" \"date_patterns = [\\n\",\n",
" \" r\\\"\\\\d{1,2}\\\\s(?:Januari|Februari|Maret|April|Mei|Juni|Juli|Agustus|September|Oktober|November|Desember)\\\\s\\\\d{4}\\\",\\n\",\n",
" \" r\\\"\\\\d{4}-\\\\d{2}-\\\\d{2}\\\"\\n\",\n",
" \"]\\n\",\n",
" \"\\n\",\n",
" \"dates = []\\n\",\n",
" \"for pattern in date_patterns:\\n\",\n",
" \" dates.extend(re.findall(pattern, text))\\n\",\n",
" \"\\n\",\n",
" \"# =========================\\n\",\n",
" \"# 4. NORMALISASI TANGGAL\\n\",\n",
" \"# =========================\\n\",\n",
" \"bulan_map = {\\n\",\n",
" \" \\\"Januari\\\": \\\"01\\\", \\\"Februari\\\": \\\"02\\\", \\\"Maret\\\": \\\"03\\\",\\n\",\n",
" \" \\\"April\\\": \\\"04\\\", \\\"Mei\\\": \\\"05\\\", \\\"Juni\\\": \\\"06\\\",\\n\",\n",
" \" \\\"Juli\\\": \\\"07\\\", \\\"Agustus\\\": \\\"08\\\", \\\"September\\\": \\\"09\\\",\\n\",\n",
" \" \\\"Oktober\\\": \\\"10\\\", \\\"November\\\": \\\"11\\\", \\\"Desember\\\": \\\"12\\\"\\n\",\n",
" \"}\\n\",\n",
" \"\\n\",\n",
" \"def normalize_date(date_str):\\n\",\n",
" \" try:\\n\",\n",
" \" parts = date_str.split()\\n\",\n",
" \" return f\\\"{parts[2]}-{bulan_map[parts[1]]}-{parts[0].zfill(2)}\\\"\\n\",\n",
" \" except:\\n\",\n",
" \" return None\\n\",\n",
" \"\\n\",\n",
" \"normalized_dates = [normalize_date(d) for d in dates]\\n\",\n",
" \"\\n\",\n",
" \"# =========================\\n\",\n",
" \"# 5. EKSTRAKSI LOKASI (RULE-BASED)\\n\",\n",
" \"# =========================\\n\",\n",
" \"location_pattern = r\\\"di\\\\s([A-Z][a-zA-Z]+)\\\"\\n\",\n",
" \"location_match = re.search(location_pattern, text)\\n\",\n",
" \"location = location_match.group(1) if location_match else None\\n\",\n",
" \"\\n\",\n",
" \"# =========================\\n\",\n",
" \"# 6. EKSTRAKSI ORGANISASI (MODIFIKASI)\\n\",\n",
" \"# =========================\\n\",\n",
" \"org_pattern = r\\\"PT\\\\s[A-Z][a-zA-Z]+\\\\s[A-Z][a-zA-Z]+\\\"\\n\",\n",
" \"organization = re.findall(org_pattern, text)\\n\",\n",
" \"\\n\",\n",
" \"# =========================\\n\",\n",
" \"# 7. EVENT TEMPLATE (LEBIH LENGKAP)\\n\",\n",
" \"# =========================\\n\",\n",
" \"event_template = {\\n\",\n",
" \" \\\"EventType\\\": \\\"Product Launch\\\",\\n\",\n",
" \" \\\"Organization\\\": organization[0] if organization else None,\\n\",\n",
" \" \\\"LaunchDateOriginal\\\": dates[0] if dates else None,\\n\",\n",
" \" \\\"LaunchDateISO\\\": normalized_dates[0] if normalized_dates else None,\\n\",\n",
" \" \\\"Location\\\": location,\\n\",\n",
" \" \\\"SourceText\\\": text\\n\",\n",
" \"}\\n\",\n",
" \"\\n\",\n",
" \"# =========================\\n\",\n",
" \"# 8. OUTPUT\\n\",\n",
" \"# =========================\\n\",\n",
" \"print(\\\"\\\\nHASIL INFORMATION EXTRACTION:\\\")\\n\",\n",
" \"for key, value in event_template.items():\\n\",\n",
" \" print(f\\\"{key}: {value}\\\")\"\n",
" ]\n",
" },\n",
" {\n",
" \"cell_type\": \"code\",\n",
" \"execution_count\": 6,\n",
" \"id\": \"1e128113-af1e-45a1-8586-48c4acf578b4\",\n",
" \"metadata\": {\n",
" \"id\": \"1e128113-af1e-45a1-8586-48c4acf578b4\"\n",
" },\n",
" \"outputs\": [],\n",
" \"source\": []\n",
" }\n",
" ],\n",
" \"metadata\": {\n",
" \"kernelspec\": {\n",
" \"display_name\": \"Python 3 (ipykernel)\",\n",
" \"language\": \"python\",\n",
" \"name\": \"python3\"\n",
" },\n",
" \"language_info\": {\n",
" \"codemirror_mode\": {\n",
" \"name\": \"ipython\",\n",
" \"version\": 3\n",
" },\n",
" \"file_extension\": \".py\",\n",
" \"mimetype\": \"text/x-python\",\n",
" \"name\": \"python\",\n",
" \"nbconvert_exporter\": \"python\",\n",
" \"pygments_lexer\": \"ipython3\",\n",
" \"version\": \"3.12.2\"\n",
" },\n",
" \"colab\": {\n",
" \"provenance\": []\n",
" }\n",
" },\n",
" \"nbformat\": 4,\n",
" \"nbformat_minor\": 5\n",
"}"
]
}
]
}