# =========================
# NLP – Information Extraction (Demo S1)
# =========================
# Rule-based information-extraction walkthrough on a short Indonesian text:
# tokenization, POS tagging, capitalization-based entity candidates, regex
# date extraction, ISO 8601 normalization and a product-launch event template.

# 1-2. Imports and version check
import re
import sys

import nltk
from nltk import pos_tag
from nltk.tokenize import wordpunct_tokenize

print("Python:", sys.version)
print("NLTK :", nltk.__version__)

# If the environment does not have a recent NLTK yet, enable the line below
# (notebook shell syntax):
# !{sys.executable} -m pip install --upgrade nltk

# 3. Download the NLTK resources used below.
# pos_tag() loads "averaged_perceptron_tagger" on older NLTK releases and
# "averaged_perceptron_tagger_eng" on newer ones (3.9+), so request both;
# a resource the installed version does not need is simply left unused.
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)

# nltk.download() reports failure through its boolean return value rather
# than by raising, so the result must be checked explicitly. punkt is
# optional here (tokenization uses wordpunct_tokenize), hence the
# deliberately broad best-effort handler.
try:
    punkt_ready = nltk.download('punkt', quiet=True)
except Exception:
    punkt_ready = False

if punkt_ready:
    print("Resource punkt siap dipakai")
else:
    print("Gagal download punkt, akan tetap pakai wordpunct_tokenize")

# 4. Sample text (can be replaced)
text = """
CEO Apple, Tim Cook, meluncurkan iPhone 15 pada 12 September 2023 di Cupertino, California.
Pada hari berikutnya, ia menghadiri pertemuan investor di San Francisco.
"""

print("TEKS CONTOH:")
print(text)

# 5. Tokenization and POS tagging
tokens = wordpunct_tokenize(text)
print("\nTOKENS:")
print(tokens)

# NOTE(review): pos_tag() defaults to NLTK's English tagger, so the tags
# produced for this Indonesian text are illustrative only.
pos_tags = pos_tag(tokens)
print("\nPOS TAGS (30 pertama):")
print(pos_tags[:30])


# 6. Simple "entity" extraction (capitalization-based)
def simple_capital_ner(tokens):
    """Return candidate entities built from runs of title-case tokens.

    Consecutive tokens that are purely alphabetic and title-cased
    (``str.istitle()``) are joined with single spaces into one candidate.
    Any other token (punctuation, numbers, lowercase words, all-caps words
    such as "CEO") ends the current run.

    Args:
        tokens: Iterable of token strings, in text order.

    Returns:
        List of candidate entity strings, in order of appearance.
    """
    entities = []
    current = []
    for tok in tokens:
        if tok.istitle() and tok.isalpha():
            current.append(tok)
        elif current:
            entities.append(" ".join(current))
            current = []
    # Flush a run that reaches the end of the token stream.
    if current:
        entities.append(" ".join(current))
    return entities


candidate_entities = simple_capital_ner(tokens)
print("\nCANDIDATE ENTITIES (sederhana):")
for e in candidate_entities:
    print("-", e)

# 7. Date extraction (temporal expression extraction) with a regex
# NOTE: bulan_id contains its own capturing group; any pattern embedding it
# gains one extra group, which shifts positional group numbers.
bulan_id = r"(Januari|Februari|Maret|April|Mei|Juni|Juli|Agustus|September|Oktober|November|Desember)"
date_pattern = rf"\b(\d{{1,2}})\s+{bulan_id}\s+(\d{{4}})\b"

# With three groups in the pattern, findall() yields (day, month, year) tuples.
dates = re.findall(date_pattern, text)
print("\nTANGGAL YANG DITEMUKAN:")
for d in dates:
    print("-", " ".join(d))

# 8. Normalize dates to ISO 8601 format (YYYY-MM-DD)
bulan_map = {
    "Januari": "01",
    "Februari": "02",
    "Maret": "03",
    "April": "04",
    "Mei": "05",
    "Juni": "06",
    "Juli": "07",
    "Agustus": "08",
    "September": "09",
    "Oktober": "10",
    "November": "11",
    "Desember": "12",
}


def _to_iso_date(hari, bulan, tahun):
    """Format a (day, Indonesian month name, year) triple as YYYY-MM-DD.

    All three arguments are strings as captured by the regexes above; the
    day is zero-padded to two digits. Raises KeyError if the month name is
    not a key of ``bulan_map``.
    """
    return f"{tahun}-{bulan_map[bulan]}-{hari.zfill(2)}"


normalized_dates = [_to_iso_date(hari, bulan, tahun) for hari, bulan, tahun in dates]

print("\nTANGGAL (FORMAT ISO 8601):")
for iso in normalized_dates:
    print("-", iso)

# 9. Product-launch event extraction (pattern-based)
# Named groups are used because bulan_id adds a nested capturing group:
# addressing the location positionally as group 4 would return the month.
pattern = (
    r"(?P<company>.+?) meluncurkan (?P<product>.+?) pada "
    r"(?P<date>(?P<day>\d{1,2}) (?P<month>" + bulan_id + r") (?P<year>\d{4}))"
    r" di (?P<location>.+?)\."
)

match = re.search(pattern, text)
if match:
    company = match.group("company").strip(" ,")
    product = match.group("product").strip(" ,")
    tgl_str = match.group("date")
    location = match.group("location").strip(" ,")
    # Normalize the date of this event, not merely the first date in the text.
    launch_date_iso = _to_iso_date(
        match.group("day"), match.group("month"), match.group("year")
    )

    print("\nEVENT PELUNCURAN PRODUK:")
    print(" Perusahaan :", company)
    print(" Produk :", product)
    print(" Tanggal :", tgl_str)
    print(" Lokasi :", location)
else:
    company = product = tgl_str = location = launch_date_iso = None
    print("\nPola event peluncuran produk tidak ditemukan.")

# 10. IE template (template extraction)
# Every field is None when no launch event was matched.
event_template = {
    "Company": company,
    "Product": product,
    "LaunchDateOriginal": tgl_str,
    "LaunchDateISO": launch_date_iso,
    "Location": location,
}

print("\nTEMPLATE EVENT (hasil akhir IE):")
print(event_template)