# =========================
# NLP – Information Extraction (IE)
# Praktikum Fatah
#
# Nama  : Fatah Sabila Rosyad
# NIM   : 202210715288
# Kelas : F7B2
# =========================

In [2]:
import nltk

# Download the resources required for POS tagging (latest version).
# Both the legacy tagger package name and the newer '_eng' name are fetched,
# followed by the 'punkt' package.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Fatah Sabila
[nltk_data]     Rosyad\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Fatah Sabila
[nltk_data]     Rosyad\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package punkt to C:\Users\Fatah Sabila
[nltk_data]     Rosyad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# =========================
# NLP – Information Extraction (IE)
# Praktikum
# =========================

# 1. Check versions & basic installation
import sys
import nltk
print("Python:", sys.version)
print("NLTK :", nltk.__version__)

# If the environment does not have a recent NLTK yet, enable the line below:
# !{sys.executable} -m pip install --upgrade nltk

# 2. Import libraries
import re
from nltk.tokenize import wordpunct_tokenize
from nltk import pos_tag

# 3. Download the NLTK resources used by this cell.
# The legacy tagger package name is kept for older NLTK installs; the '_eng'
# package is fetched as well so that pos_tag() below does not depend on an
# earlier cell having downloaded it.
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)

# nltk.download() signals a failed download through its return value, so the
# result is checked instead of assuming success when no exception was raised.
# `except Exception` (not a bare `except`) keeps Ctrl-C / kernel interrupts
# working while the download is in progress.
try:
    punkt_ok = nltk.download('punkt', quiet=True)
except Exception:
    punkt_ok = False

if punkt_ok:
    print("Resource punkt siap dipakai")
else:
    print("Gagal download punkt, akan tetap pakai wordpunct_tokenize")

# 4. Sample text (can be replaced with any other passage)
text = """
Mahasiswa informatika bernama Fatah Sabila Rosyad melakukan praktikum NLP
tentang Information Extraction pada 22 November 2025 di Universitas Bhayangkara Jakarta Raya.
Pada kegiatan tersebut, Fatah mempelajari ekstraksi entitas, tanggal, dan event berbasis teks.
"""
# Header and payload go out in a single print call, one per line.
print("TEKS CONTOH:", text, sep="\n")

# 5. Tokenization and POS tagging
tokens = wordpunct_tokenize(text)
print("\nTOKENS:", tokens, sep="\n")

# NOTE(review): the tagger resource downloaded for this notebook is the
# English one, while the sample text is Indonesian -- the tags are presumably
# unreliable for this text; confirm before relying on them.
pos_tags = pos_tag(tokens)
# Only the first 30 (token, tag) pairs are shown to keep the output short.
print("\nPOS TAGS (30 pertama):", pos_tags[:30], sep="\n")
# 6. Simple "entity" extraction (capitalization-based)
def simple_capital_ner(tokens):
    """Group consecutive title-cased alphabetic tokens into candidate entities.

    A token qualifies when it is title-cased (``str.istitle``) and purely
    alphabetic (``str.isalpha``). Each maximal run of qualifying tokens is
    joined with single spaces into one candidate string.

    Args:
        tokens: Iterable of string tokens, in text order.

    Returns:
        List of candidate entity strings, in order of appearance.
    """
    found = []
    run = []
    # The trailing empty string can never qualify, so it flushes a run that
    # reaches the end of the input without a separate post-loop check.
    for word in [*tokens, ""]:
        if word.istitle() and word.isalpha():
            run.append(word)
            continue
        if run:
            found.append(" ".join(run))
            run = []
    return found

# Run the capitalization heuristic over the token list and print one
# candidate per line as a dash-prefixed bullet.
candidate_entities = simple_capital_ner(tokens)
print("\nCANDIDATE ENTITIES (sederhana):")
for e in candidate_entities:
    print(f"- {e}")

# 7. Date extraction (temporal extraction) with a regular expression
# Indonesian month names as one capturing alternation group.
bulan_id = r"(Januari|Februari|Maret|April|Mei|Juni|Juli|Agustus|September|Oktober|November|Desember)"
# <day: 1-2 digits> <month name> <year: 4 digits>, separated by whitespace.
date_pattern = r"\b(\d{1,2})\s+" + bulan_id + r"\s+(\d{4})\b"

# With three capturing groups, findall yields (day, month, year) tuples.
dates = re.compile(date_pattern).findall(text)
print("\nTANGGAL YANG DITEMUKAN:")
for d in dates:
    # Unpacking the tuple lets print insert the separating spaces.
    print("-", *d)

# 8. Normalize dates to ISO 8601 format (YYYY-MM-DD)
# Month name -> zero-padded month number, built by pairing the names (in
# calendar order) with "01".."12".
bulan_map = dict(zip(
    (
        "Januari", "Februari", "Maret", "April", "Mei", "Juni",
        "Juli", "Agustus", "September", "Oktober", "November", "Desember",
    ),
    (f"{number:02d}" for number in range(1, 13)),
))

normalized_dates = []
for hari, bulan, tahun in dates:
    # Look up the month number and left-pad the day to two digits.
    bulan_num, hari_num = bulan_map[bulan], hari.zfill(2)
    iso = "-".join((tahun, bulan_num, hari_num))
    normalized_dates.append(iso)

print("\nTANGGAL (FORMAT ISO 8601):")
for iso in normalized_dates:
    print("-", iso)

# 9. Product-launch event extraction (pattern-based)
# bulan_id contains its own capturing group, so positional group numbers after
# the date are shifted by one: group 4 is the month name and the location is
# group 5. Named groups keep every field addressable regardless of the groups
# nested inside bulan_id.
# NOTE(review): the sample text in this notebook uses "melakukan", not
# "meluncurkan", so with that text the else branch is expected to run --
# confirm whether the pattern or the text should change.
pattern = (
    r"(?P<company>.+?) meluncurkan (?P<product>.+?) pada "
    r"(?P<date>\d{1,2} " + bulan_id + r" \d{4}) di (?P<location>.+?)\."
)
match = re.search(pattern, text)

if match:
    # Trim stray spaces/commas left at the edges by the lazy wildcards.
    company = match.group("company").strip(" ,")
    product = match.group("product").strip(" ,")
    tgl_str = match.group("date")
    location = match.group("location").strip(" ,")

    print("\nEVENT PELUNCURAN PRODUK:")
    print("  Perusahaan :", company)
    print("  Produk     :", product)
    print("  Tanggal    :", tgl_str)
    print("  Lokasi     :", location)
else:
    # No launch event found: leave every field empty for the template below.
    company = product = tgl_str = location = None
    print("\nPola event peluncuran produk tidak ditemukan.")

# 10. IE template (template extraction)
# Derive the ISO date from the matched event's own date string so that
# LaunchDateISO and LaunchDateOriginal always describe the same date, even
# when the text contains other dates before the event.
if tgl_str:
    # tgl_str has the shape "<day> <month name> <year>" (single spaces).
    _day, _month, _year = tgl_str.split()
    launch_date_iso = f"{_year}-{bulan_map[_month]}-{_day.zfill(2)}"
else:
    # Backward-compatible fallback: with no matched event, keep reporting the
    # first date found in the text (or None when there is none).
    launch_date_iso = normalized_dates[0] if normalized_dates else None

event_template = {
    "Company": company,
    "Product": product,
    "LaunchDateOriginal": tgl_str,
    "LaunchDateISO": launch_date_iso,
    "Location": location,
}

print("\nTEMPLATE EVENT (hasil akhir IE):")
print(event_template)


Python: 3.13.5 | packaged by Anaconda, Inc. | (main, Jun 12 2025, 16:37:03) [MSC v.1929 64 bit (AMD64)]
NLTK : 3.9.1
Resource punkt siap dipakai
TEKS CONTOH:

Mahasiswa informatika bernama Fatah Sabila Rosyad melakukan praktikum NLP
tentang Information Extraction pada 22 November 2025 di Universitas Bhayangkara Jakarta Raya.
Pada kegiatan tersebut, Fatah mempelajari ekstraksi entitas, tanggal, dan event berbasis teks.


TOKENS:
['Mahasiswa', 'informatika', 'bernama', 'Fatah', 'Sabila', 'Rosyad', 'melakukan', 'praktikum', 'NLP', 'tentang', 'Information', 'Extraction', 'pada', '22', 'November', '2025', 'di', 'Universitas', 'Bhayangkara', 'Jakarta', 'Raya', '.', 'Pada', 'kegiatan', 'tersebut', ',', 'Fatah', 'mempelajari', 'ekstraksi', 'entitas', ',', 'tanggal', ',', 'dan', 'event', 'berbasis', 'teks', '.']

POS TAGS (30 pertama):
[('Mahasiswa', 'NNP'), ('informatika', 'JJ'), ('bernama', 'NN'), ('Fatah', 'NNP'), ('Sabila', 'NNP'), ('Rosyad', 'NNP'), ('melakukan', 'NN'), ('praktikum', 'NN')

In [5]:
# Print a completion message (preceded by a blank line).
print("\nProses Information Extraction selesai dijalankan oleh Fatah.")


Proses Information Extraction selesai dijalankan oleh Fatah.
