materi-praktikum/.virtual_documents/NLP/Information Extraction.ipynb
2025-12-02 02:51:49 +00:00

135 lines
3.5 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# =========================
# NLP Information Extraction (IE)
# Praktikum
# =========================
# 1. Check interpreter / library versions (basic installation check)
import sys
import nltk
print("Python:", sys.version)
print("NLTK :", nltk.__version__)
# If the environment does not have a recent NLTK yet, enable the line below
# (notebook shell-escape syntax):
# !{sys.executable} -m pip install --upgrade nltk
# 2. Import pustaka
import re
from nltk.tokenize import wordpunct_tokenize
from nltk import pos_tag
# 3. Download the NLTK resources used below.
# pos_tag() needs the averaged-perceptron tagger model. Recent NLTK releases
# look it up under the "_eng" resource name, older ones under the plain name,
# so both are requested (a failed or redundant download is harmless here).
for resource in ("averaged_perceptron_tagger", "averaged_perceptron_tagger_eng"):
    nltk.download(resource, quiet=True)

# nltk.download() reports most failures through its boolean return value
# rather than by raising, so the result is checked explicitly. The except
# clause keeps this step best-effort if the downloader does raise.
try:
    punkt_ok = nltk.download('punkt', quiet=True)
except Exception:
    punkt_ok = False

if punkt_ok:
    print("Resource punkt siap dipakai")
else:
    # Tokenisation below uses wordpunct_tokenize, which needs no punkt model.
    print("Gagal download punkt, akan tetap pakai wordpunct_tokenize")
# 4. Sample text (replace freely with your own input)
text = """
CEO Apple, Tim Cook, meluncurkan iPhone 15 pada 12 September 2023 di Cupertino, California.
Pada hari berikutnya, ia menghadiri pertemuan investor di San Francisco.
"""
# Header and body are emitted on separate lines, as two print() calls would do.
print("TEKS CONTOH:", text, sep="\n")
# 5. Tokenisation and POS tagging
# wordpunct_tokenize splits on word/punctuation boundaries and needs no
# downloaded model.
tokens = wordpunct_tokenize(text)
print("\nTOKENS:", tokens, sep="\n")

# NOTE(review): pos_tag is called with its default settings; confirm the
# default tagger model is meaningful for Indonesian text before relying on it.
pos_tags = pos_tag(tokens)
print("\nPOS TAGS (30 pertama):", pos_tags[:30], sep="\n")
# 6. Simple "entity" extraction (capitalization-based)
def simple_capital_ner(tokens):
    """Group consecutive capitalized words into candidate entity strings.

    A token counts as a capitalized word when it is purely alphabetic and
    title-cased (``str.isalpha()`` and ``str.istitle()``). All-caps tokens
    such as "CEO", mixed-case tokens such as "iPhone", numbers and
    punctuation therefore end the current run.

    Args:
        tokens: Iterable of token strings, in text order.

    Returns:
        A list with one space-joined string per run of adjacent capitalized
        words, in order of appearance. Empty input yields an empty list.
    """
    from itertools import groupby

    def is_capitalized_word(tok):
        return tok.istitle() and tok.isalpha()

    # groupby yields maximal runs of adjacent tokens sharing the same key;
    # only the runs of capitalized words become entities.
    return [
        " ".join(run)
        for capitalized, run in groupby(tokens, key=is_capitalized_word)
        if capitalized
    ]
# Run the capitalization heuristic on the tokenised sample and list the result.
candidate_entities = simple_capital_ner(tokens)
print("\nCANDIDATE ENTITIES (sederhana):")
for entity in candidate_entities:
    print("-", entity)
# 7. Date extraction (temporal expressions) with a regular expression
# Alternation of Indonesian month names; the parentheses make it a capturing
# group, so every pattern that embeds it gains one extra group.
bulan_id = r"(Januari|Februari|Maret|April|Mei|Juni|Juli|Agustus|September|Oktober|November|Desember)"
# "<1-2 digit day> <month name> <4 digit year>"; the doubled braces are
# literal regex quantifier braces inside the f-string.
date_pattern = rf"\b(\d{{1,2}})\s+{bulan_id}\s+(\d{{4}})\b"
# With three capturing groups, findall returns (day, month, year) tuples.
dates = re.findall(date_pattern, text)
print("\nTANGGAL YANG DITEMUKAN:")
for d in dates:
    print("-", " ".join(d))
# 8. Normalise dates to ISO 8601 format (YYYY-MM-DD)
# Month-name -> two-digit month number, derived from calendar order.
bulan_map = {
    nama: f"{nomor:02d}"
    for nomor, nama in enumerate(
        (
            "Januari",
            "Februari",
            "Maret",
            "April",
            "Mei",
            "Juni",
            "Juli",
            "Agustus",
            "September",
            "Oktober",
            "November",
            "Desember",
        ),
        start=1,
    )
}
# Turn each (day, month-name, year) tuple into a "YYYY-MM-DD" string; the day
# is zero-padded to two digits and the month name is looked up in bulan_map.
normalized_dates = [
    f"{tahun}-{bulan_map[bulan]}-{hari.zfill(2)}"
    for hari, bulan, tahun in dates
]
print("\nTANGGAL (FORMAT ISO 8601):")
for iso in normalized_dates:
    print("-", iso)
# 9. Product-launch event extraction (pattern-based)
# bulan_id brings its own capturing group, which sits inside the date group
# and shifts every positional group after it (the month becomes group 4 and
# the location group 5). Named groups keep each field unambiguous.
pattern = (
    r"(?P<company>.+?) meluncurkan (?P<product>.+?) pada "
    r"(?P<date>\d{1,2} " + bulan_id + r" \d{4}) di (?P<location>.+?)\."
)
match = re.search(pattern, text)
if match:
    # Strip separators left over from the lazy ".+?" captures.
    company = match.group("company").strip(" ,")
    product = match.group("product").strip(" ,")
    tgl_str = match.group("date")
    location = match.group("location").strip(" ,")
    print("\nEVENT PELUNCURAN PRODUK:")
    print(" Perusahaan :", company)
    print(" Produk :", product)
    print(" Tanggal :", tgl_str)
    print(" Lokasi :", location)
else:
    # Keep the names defined so later cells can still build the template.
    company = product = tgl_str = location = None
    print("\nPola event peluncuran produk tidak ditemukan.")
# 10. IE template (template extraction)
# Derive the ISO date from the matched event's own date string so that
# LaunchDateISO always corresponds to LaunchDateOriginal (and is None when no
# event was matched), instead of taking whichever date appears first in the text.
if tgl_str is None:
    launch_date_iso = None
else:
    hari_evt, bulan_evt, tahun_evt = tgl_str.split()
    launch_date_iso = f"{tahun_evt}-{bulan_map[bulan_evt]}-{hari_evt.zfill(2)}"

event_template = {
    "Company": company,
    "Product": product,
    "LaunchDateOriginal": tgl_str,
    "LaunchDateISO": launch_date_iso,
    "Location": location,
}
print("\nTEMPLATE EVENT (hasil akhir IE):")
print(event_template)