135 lines
3.5 KiB
Plaintext
135 lines
3.5 KiB
Plaintext
# =========================
|
||
# NLP – Information Extraction (IE)
|
||
# Prakt
|
||
# =========================
|
||
|
||
# 1. Check versions & base installation
import sys

import nltk

# Report the interpreter version and the installed NLTK version.
for label, version in (("Python:", sys.version), ("NLTK :", nltk.__version__)):
    print(label, version)

# If the environment does not have an up-to-date NLTK yet, enable the line below:
# !{sys.executable} -m pip install --upgrade nltk
|
||
|
||
# 2. Import pustaka
|
||
import re
|
||
from nltk.tokenize import wordpunct_tokenize
|
||
from nltk import pos_tag
|
||
|
||
# 3. Download the NLTK resources used by this script
# Newer NLTK releases load the English POS tagger from the "_eng" package,
# older ones from the legacy name, so both are requested.
for tagger_resource in ("averaged_perceptron_tagger", "averaged_perceptron_tagger_eng"):
    nltk.download(tagger_resource, quiet=True)

# nltk.download() signals failure through its boolean return value (it only
# raises when raise_on_error=True), so the result must be checked explicitly;
# the except clause is a safety net for unexpected errors.
try:
    punkt_ready = nltk.download('punkt', quiet=True)
except Exception:
    punkt_ready = False

if punkt_ready:
    print("Resource punkt siap dipakai")
else:
    # Tokenization below uses wordpunct_tokenize, which does not need punkt.
    print("Gagal download punkt, akan tetap pakai wordpunct_tokenize")
|
||
|
||
# 4. Sample text (can be replaced with any other text)
text = """
CEO Apple, Tim Cook, meluncurkan iPhone 15 pada 12 September 2023 di Cupertino, California.
Pada hari berikutnya, ia menghadiri pertemuan investor di San Francisco.
"""
# Heading and sample text, each on its own line.
print("TEKS CONTOH:", text, sep="\n")
|
||
|
||
# 5. Tokenization and POS tagging
tokens = wordpunct_tokenize(text)
print("\nTOKENS:", tokens, sep="\n")

pos_tags = pos_tag(tokens)
# Only the first 30 tagged entries are displayed.
print("\nPOS TAGS (30 pertama):", pos_tags[:30], sep="\n")
|
||
|
||
# 6. Simple "entity" extraction (capitalization-based)
def simple_capital_ner(tokens):
    """Group consecutive capitalized words into candidate entity strings.

    A token qualifies when it is title-cased (``str.istitle``) and purely
    alphabetic (``str.isalpha``). Every maximal run of adjacent qualifying
    tokens is joined with single spaces; any other token (punctuation,
    numbers, lowercase or all-caps words) ends the current run.

    Args:
        tokens: Iterable of token strings, in text order.

    Returns:
        List of candidate entity strings in order of appearance; empty when
        no token qualifies.
    """
    # Imported locally so this block does not depend on a file-level import.
    from itertools import groupby

    def is_capitalized_word(tok):
        return tok.istitle() and tok.isalpha()

    # groupby splits the stream into runs of equal key; keep the True runs.
    return [
        " ".join(run)
        for qualifies, run in groupby(tokens, key=is_capitalized_word)
        if qualifies
    ]
|
||
|
||
candidate_entities = simple_capital_ner(tokens)
# One "- <entity>" line per candidate, printed below the heading.
entity_lines = [f"- {entity}" for entity in candidate_entities]
print("\nCANDIDATE ENTITIES (sederhana):", *entity_lines, sep="\n")
|
||
|
||
# 7. Date extraction (temporal expression extraction) with RegEx
# Alternation of the Indonesian month names, wrapped in one capturing group.
bulan_id = (
    r"(Januari|Februari|Maret|April|Mei|Juni"
    r"|Juli|Agustus|September|Oktober|November|Desember)"
)
# <1-2 digit day> <month name> <4 digit year>, each part captured.
date_pattern = r"\b(\d{1,2})\s+" + bulan_id + r"\s+(\d{4})\b"

# With three capturing groups, findall yields (day, month, year) tuples.
dates = re.compile(date_pattern).findall(text)
print("\nTANGGAL YANG DITEMUKAN:")
for day, month, year in dates:
    print("-", day, month, year)
|
||
|
||
# 8. Normalize dates to ISO 8601 format (YYYY-MM-DD)
# Month name -> two-digit month number ("Januari" -> "01", ..., "Desember" -> "12").
bulan_map = {
    month_name: f"{month_index:02d}"
    for month_index, month_name in enumerate(
        (
            "Januari",
            "Februari",
            "Maret",
            "April",
            "Mei",
            "Juni",
            "Juli",
            "Agustus",
            "September",
            "Oktober",
            "November",
            "Desember",
        ),
        start=1,
    )
}

# Each entry of `dates` is a (day, month name, year) tuple of strings;
# the day is zero-padded to two digits.
normalized_dates = [
    f"{tahun}-{bulan_map[bulan]}-{hari.zfill(2)}"
    for hari, bulan, tahun in dates
]

print("\nTANGGAL (FORMAT ISO 8601):")
for iso_date in normalized_dates:
    print("-", iso_date)
|
||
|
||
# 9. Product-launch event extraction (pattern-based)
# Named groups are used on purpose: `bulan_id` contributes its own capturing
# group, so by position the month name is group 4 and the location is
# group 5. Reading group 4 as the location returned the month name instead.
pattern = (
    r"(?P<company>.+?) meluncurkan (?P<product>.+?) pada "
    r"(?P<date>\d{1,2} " + bulan_id + r" \d{4}) di (?P<location>.+?)\."
)
match = re.search(pattern, text)

if match:
    # Trim the spaces/commas left over around the lazily matched fields.
    company = match.group("company").strip(" ,")
    product = match.group("product").strip(" ,")
    tgl_str = match.group("date")
    location = match.group("location").strip(" ,")

    print("\nEVENT PELUNCURAN PRODUK:")
    print(" Perusahaan :", company)
    print(" Produk :", product)
    print(" Tanggal :", tgl_str)
    print(" Lokasi :", location)
else:
    # Keep the names defined so the template step can still run.
    company = product = tgl_str = location = None
    print("\nPola event peluncuran produk tidak ditemukan.")
|
||
|
||
# 10. IE template (template extraction)
# Derive the ISO date from the matched launch date itself. Taking the first
# date found anywhere in the text could pair the event with an unrelated
# date, or report a date when no launch event was matched at all.
if tgl_str is None:
    launch_date_iso = None
else:
    # tgl_str has the form "<day> <month name> <year>", separated by single spaces.
    launch_day, launch_month, launch_year = tgl_str.split()
    launch_date_iso = f"{launch_year}-{bulan_map[launch_month]}-{launch_day.zfill(2)}"

event_template = {
    "Company": company,
    "Product": product,
    "LaunchDateOriginal": tgl_str,
    "LaunchDateISO": launch_date_iso,
    "Location": location,
}

print("\nTEMPLATE EVENT (hasil akhir IE):")
print(event_template)
|
||
|
||
|
||
|
||
|