In [6]:
# =========================
# NLP – Information Extraction (MODIFIED VERSION)
# =========================

import re
import nltk
from datetime import datetime

# =========================
# 1. TEXT DATA (MODIFIED)
# =========================
# Sample Indonesian news snippet that the extraction rules below run on.
text = """
PT Maju Jaya resmi meluncurkan produk terbaru mereka pada 12 Agustus 2023
di Jakarta. Acara peluncuran tersebut dihadiri oleh CEO perusahaan dan
disaksikan oleh ratusan undangan.
"""

# Echo the raw source text (heading on its own line, then the text).
print("Teks sumber:", text, sep="\n")

# =========================
# 2. ADDITIONAL PREPROCESSING
# =========================
def clean_text(text):
    """Return *text* trimmed, with every whitespace run collapsed to one space.

    Leading and trailing whitespace is removed first; any remaining run of
    whitespace characters (spaces, tabs, newlines) becomes a single space.
    """
    return re.sub(r"\s+", " ", text.strip())

# Flatten the text onto one line so the regex rules below only ever see
# single spaces between tokens.
text = clean_text(text)

# =========================
# 3. DATE EXTRACTION (MORE THAN ONE FORMAT)
# =========================
# Supported formats: "<day> <Indonesian month name> <year>" and ISO
# "YYYY-MM-DD".
date_patterns = [
    r"\d{1,2}\s(?:Januari|Februari|Maret|April|Mei|Juni|Juli|Agustus|September|Oktober|November|Desember)\s\d{4}",
    r"\d{4}-\d{2}-\d{2}"
]

# Matches are grouped by pattern (in the order listed above), not by their
# position in the text.
dates = [
    found
    for date_regex in date_patterns
    for found in re.findall(date_regex, text)
]

# =========================
# 4. DATE NORMALIZATION
# =========================
# Indonesian month name -> two-digit month number.
bulan_map = {
    "Januari": "01", "Februari": "02", "Maret": "03",
    "April": "04", "Mei": "05", "Juni": "06",
    "Juli": "07", "Agustus": "08", "September": "09",
    "Oktober": "10", "November": "11", "Desember": "12"
}

def normalize_date(date_str):
    """Convert an extracted date string to ``YYYY-MM-DD``.

    Two input shapes are understood:

    * ``"<day> <Indonesian month name> <year>"`` (e.g. ``"12 Agustus 2023"``),
      which is rewritten using ``bulan_map`` with the day zero-padded to two
      digits. Tokens after the first three are ignored.
    * ``"YYYY-MM-DD"``, which is already in the target form and is returned
      unchanged.

    Args:
        date_str: The date string to normalize.

    Returns:
        The ``YYYY-MM-DD`` string, or ``None`` when *date_str* is not a
        string, has an unrecognized shape, or names an unknown month.
    """
    if not isinstance(date_str, str):
        return None

    parts = date_str.split()

    # Dates matched by the ISO pattern are a single token and are already
    # normalized; previously these fell through to None.
    if len(parts) == 1 and re.fullmatch(r"\d{4}-\d{2}-\d{2}", parts[0]):
        return parts[0]

    if len(parts) < 3:
        return None

    day, month_name, year = parts[:3]
    month = bulan_map.get(month_name)
    if month is None:
        return None
    return f"{year}-{month}-{day.zfill(2)}"

# One normalized entry per extracted date, in the same order as `dates`
# (entries that could not be normalized are None).
normalized_dates = list(map(normalize_date, dates))

# =========================
# 5. LOCATION EXTRACTION (RULE-BASED)
# =========================
# Take the first capitalized word that follows the preposition "di".
# The leading \b stops the rule from firing on words that merely end in
# "di" (e.g. "menjadi", "terjadi") when a capitalized word follows them.
location_pattern = r"\bdi\s([A-Z][a-zA-Z]+)"
location_match = re.search(location_pattern, text)
# None when no "di <Capitalized>" occurrence exists in the text.
location = location_match.group(1) if location_match else None

# =========================
# 6. ORGANIZATION EXTRACTION (MODIFIED)
# =========================
# Match the token "PT" followed by exactly two capitalized words.
# The leading \b keeps "PT" from matching as the tail of a longer token
# (e.g. "SPT Tahunan Pajak" must not yield "PT Tahunan Pajak").
org_pattern = r"\bPT\s[A-Z][a-zA-Z]+\s[A-Z][a-zA-Z]+"
# All matches in text order; an empty list when none are found.
organization = re.findall(org_pattern, text)

# =========================
# 7. EVENT TEMPLATE (MORE COMPLETE)
# =========================
# Each list-valued extraction contributes its first element, or None when
# the list is empty.
event_template = dict(
    EventType="Product Launch",
    Organization=next(iter(organization), None),
    LaunchDateOriginal=next(iter(dates), None),
    LaunchDateISO=next(iter(normalized_dates), None),
    Location=location,
    SourceText=text,
)

# =========================
# 8. OUTPUT
# =========================
# Print a heading (preceded by a blank line), then one "key: value" line
# per template field in insertion order.
print("\nHASIL INFORMATION EXTRACTION:")
for field in event_template:
    print(field, event_template[field], sep=": ")

Teks sumber:

PT Maju Jaya resmi meluncurkan produk terbaru mereka pada 12 Agustus 2023
di Jakarta. Acara peluncuran tersebut dihadiri oleh CEO perusahaan dan
disaksikan oleh ratusan undangan.


HASIL INFORMATION EXTRACTION:
EventType: Product Launch
Organization: PT Maju Jaya
LaunchDateOriginal: 12 Agustus 2023
LaunchDateISO: 2023-08-12
Location: Jakarta
SourceText: PT Maju Jaya resmi meluncurkan produk terbaru mereka pada 12 Agustus 2023 di Jakarta. Acara peluncuran tersebut dihadiri oleh CEO perusahaan dan disaksikan oleh ratusan undangan.
