script information extraction
This commit is contained in:
parent
789cae7ff1
commit
c5013b53ea
@ -1,83 +1,132 @@
|
|||||||
|
# =========================
|
||||||
|
# NLP – Information Extraction (Demo S1)
|
||||||
|
# =========================
|
||||||
|
|
||||||
|
# 1. Cek versi & instalasi dasar
|
||||||
|
import sys
|
||||||
|
import nltk
|
||||||
|
print("Python:", sys.version)
|
||||||
|
print("NLTK :", nltk.__version__)
|
||||||
|
|
||||||
|
# Jika di environment belum ada NLTK terbaru, aktifkan baris di bawah:
|
||||||
|
# !{sys.executable} -m pip install --upgrade nltk
|
||||||
|
|
||||||
# ---------------------------------------------------------
|
# 2. Import pustaka
|
||||||
# Klasifikasi Teks dengan TF-IDF + Feedforward Neural Network
|
import re
|
||||||
# ---------------------------------------------------------
|
from nltk.tokenize import wordpunct_tokenize
|
||||||
|
from nltk import pos_tag
|
||||||
|
|
||||||
import pandas as pd
|
# 3. Download resource NLTK yang stabil
|
||||||
from sklearn.model_selection import train_test_split
|
nltk.download('averaged_perceptron_tagger', quiet=True)
|
||||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
try:
|
||||||
from sklearn.neural_network import MLPClassifier
|
nltk.download('punkt', quiet=True)
|
||||||
from sklearn.metrics import classification_report, confusion_matrix
|
print("Resource punkt siap dipakai")
|
||||||
|
except:
|
||||||
|
print("Gagal download punkt, akan tetap pakai wordpunct_tokenize")
|
||||||
|
|
||||||
# -----------------------------------------
|
# 4. Contoh teks (bisa diganti)
|
||||||
# 1. Contoh Dataset
|
text = """
|
||||||
# -----------------------------------------
|
CEO Apple, Tim Cook, meluncurkan iPhone 15 pada 12 September 2023 di Cupertino, California.
|
||||||
# Anda bisa mengganti dataset ini dengan dataset lain (CSV, JSON, dll)
|
Pada hari berikutnya, ia menghadiri pertemuan investor di San Francisco.
|
||||||
|
"""
|
||||||
|
print("TEKS CONTOH:")
|
||||||
|
print(text)
|
||||||
|
|
||||||
data = {
|
# 5. Tokenisasi dan POS tagging
|
||||||
"text": [
|
tokens = wordpunct_tokenize(text)
|
||||||
"Saya suka produk ini, luar biasa",
|
print("\nTOKENS:")
|
||||||
"Layanannya buruk, sangat kecewa",
|
print(tokens)
|
||||||
"Pembelian terbaik yang pernah saya lakukan",
|
|
||||||
"Saya benci produk ini, buang-buang uang",
|
pos_tags = pos_tag(tokens)
|
||||||
"Kualitasnya sangat bagus, direkomendasikan",
|
print("\nPOS TAGS (30 pertama):")
|
||||||
"Pengalaman buruk, tidak akan membeli lagi"
|
print(pos_tags[:30])
|
||||||
],
|
|
||||||
"label": ["positive", "negative", "positive", "negative", "positive", "negative"]
|
# 6. Ekstraksi “entitas” sederhana (berbasis huruf kapital)
|
||||||
|
def simple_capital_ner(tokens):
    """Group consecutive capitalised word tokens into candidate entity names.

    A token belongs to an entity when it is purely alphabetic and title-cased
    (``str.isalpha()`` and ``str.istitle()``). Adjacent qualifying tokens are
    joined with a single space. Any other token (punctuation, digits,
    lower-case words, or all-caps / mixed-case words such as "CEO" or
    "iPhone") closes the current run.

    This is a purely orthographic heuristic: sentence-initial words and
    month names are reported as candidates too.

    Args:
        tokens: Iterable of token strings in document order.

    Returns:
        list[str]: Candidate entities in order of appearance, duplicates kept.
    """
    entities = []
    current = []
    for tok in tokens:
        if tok.istitle() and tok.isalpha():
            current.append(tok)
            continue
        # A non-qualifying token ends the run collected so far.
        if current:
            entities.append(" ".join(current))
            current = []
    # Flush a run that reaches the end of the token stream.
    if current:
        entities.append(" ".join(current))
    return entities
|
||||||
|
|
||||||
|
candidate_entities = simple_capital_ner(tokens)
|
||||||
|
print("\nCANDIDATE ENTITIES (sederhana):")
|
||||||
|
for e in candidate_entities:
|
||||||
|
print("-", e)
|
||||||
|
|
||||||
|
# 7. Ekstraksi tanggal (Ekstraksi Waktu) dengan regex
|
||||||
|
bulan_id = r"(Januari|Februari|Maret|April|Mei|Juni|Juli|Agustus|September|Oktober|November|Desember)"
|
||||||
|
date_pattern = rf"\b(\d{{1,2}})\s+{bulan_id}\s+(\d{{4}})\b"
|
||||||
|
|
||||||
|
dates = re.findall(date_pattern, text)
|
||||||
|
print("\nTANGGAL YANG DITEMUKAN:")
|
||||||
|
for d in dates:
|
||||||
|
print("-", " ".join(d))
|
||||||
|
|
||||||
|
# 8. Normalisasi tanggal ke format ISO 8601 (YYYY-MM-DD)
|
||||||
|
bulan_map = {
|
||||||
|
"Januari": "01",
|
||||||
|
"Februari": "02",
|
||||||
|
"Maret": "03",
|
||||||
|
"April": "04",
|
||||||
|
"Mei": "05",
|
||||||
|
"Juni": "06",
|
||||||
|
"Juli": "07",
|
||||||
|
"Agustus": "08",
|
||||||
|
"September": "09",
|
||||||
|
"Oktober": "10",
|
||||||
|
"November": "11",
|
||||||
|
"Desember": "12",
|
||||||
}
|
}
|
||||||
|
|
||||||
df = pd.DataFrame(data)
|
normalized_dates = []
|
||||||
|
for hari, bulan, tahun in dates:
|
||||||
|
bulan_num = bulan_map[bulan]
|
||||||
|
hari_num = hari.zfill(2)
|
||||||
|
iso = f"{tahun}-{bulan_num}-{hari_num}"
|
||||||
|
normalized_dates.append(iso)
|
||||||
|
|
||||||
# -----------------------------------------
|
print("\nTANGGAL (FORMAT ISO 8601):")
|
||||||
# 2. Split Train & Test
|
for iso in normalized_dates:
|
||||||
# -----------------------------------------
|
print("-", iso)
|
||||||
X_train, X_test, y_train, y_test = train_test_split(
|
|
||||||
df["text"], df["label"], test_size=0.3, random_state=42
|
|
||||||
)
|
|
||||||
|
|
||||||
# -----------------------------------------
|
# 9. Ekstraksi event peluncuran produk (pattern-based)
|
||||||
# 3. TF-IDF Vectorization
|
pattern = r"(.+?) meluncurkan (.+?) pada (\d{1,2} " + bulan_id + r" \d{4}) di (.+?)\."
|
||||||
# -----------------------------------------
|
match = re.search(pattern, text)
|
||||||
tfidf = TfidfVectorizer(max_features=5000)
|
|
||||||
X_train_tfidf = tfidf.fit_transform(X_train)
|
|
||||||
X_test_tfidf = tfidf.transform(X_test)
|
|
||||||
|
|
||||||
# -----------------------------------------
|
if match:
|
||||||
# 4. Feedforward ANN (MLPClassifier)
|
company = match.group(1).strip(" ,")
|
||||||
# -----------------------------------------
|
product = match.group(2).strip(" ,")
|
||||||
model = MLPClassifier(
|
tgl_str = match.group(3)
|
||||||
hidden_layer_sizes=(256, 64),
|
# bulan_id brings its own capturing group, so group 4 is the month name
# and the location captured after " di " is group 5.
location = match.group(5).strip(" ,")
|
||||||
activation='relu',
|
|
||||||
solver='adam',
|
|
||||||
max_iter=500,
|
|
||||||
random_state=42
|
|
||||||
)
|
|
||||||
|
|
||||||
model.fit(X_train_tfidf, y_train)
|
print("\nEVENT PELUNCURAN PRODUK:")
|
||||||
|
print(" Perusahaan :", company)
|
||||||
|
print(" Produk :", product)
|
||||||
|
print(" Tanggal :", tgl_str)
|
||||||
|
print(" Lokasi :", location)
|
||||||
|
else:
|
||||||
|
company = product = tgl_str = location = None
|
||||||
|
print("\nPola event peluncuran produk tidak ditemukan.")
|
||||||
|
|
||||||
# -----------------------------------------
|
# 10. Template IE (Template Extraction)
|
||||||
# 5. Evaluasi Model
|
event_template = {
|
||||||
# -----------------------------------------
|
"Company": company,
|
||||||
y_pred = model.predict(X_test_tfidf)
|
"Product": product,
|
||||||
|
"LaunchDateOriginal": tgl_str,
|
||||||
|
"LaunchDateISO": normalized_dates[0] if normalized_dates else None,
|
||||||
|
"Location": location,
|
||||||
|
}
|
||||||
|
|
||||||
print("=== Classification Report ===")
|
print("\nTEMPLATE EVENT (hasil akhir IE):")
|
||||||
print(classification_report(y_test, y_pred))
|
print(event_template)
|
||||||
|
|
||||||
print("=== Confusion Matrix ===")
|
|
||||||
print(confusion_matrix(y_test, y_pred))
|
|
||||||
|
|
||||||
# -----------------------------------------
|
|
||||||
# 6. Prediksi Teks Baru
|
|
||||||
# -----------------------------------------
|
|
||||||
sample_text = ["barang bagus luar biasa"]
|
|
||||||
sample_text = ["barang buruk, saya kecewa"]
|
|
||||||
sample_vec = tfidf.transform(sample_text)
|
|
||||||
prediction = model.predict(sample_vec)
|
|
||||||
|
|
||||||
print("\nPrediksi untuk:", sample_text[0])
|
|
||||||
print("Hasil:", prediction[0])
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -0,0 +1,6 @@
|
|||||||
|
{
|
||||||
|
"cells": [],
|
||||||
|
"metadata": {},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
218
NLP/Information Extraction.ipynb
Normal file
218
NLP/Information Extraction.ipynb
Normal file
@ -0,0 +1,218 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "7c7601d6-3c91-453e-8c29-706528237596",
|
||||||
|
"metadata": {
|
||||||
|
"scrolled": true
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Python: 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:50:58) [GCC 12.3.0]\n",
|
||||||
|
"NLTK : 3.9.2\n",
|
||||||
|
"Resource punkt siap dipakai\n",
|
||||||
|
"TEKS CONTOH:\n",
|
||||||
|
"\n",
|
||||||
|
"CEO Apple, Tim Cook, meluncurkan iPhone 15 pada 12 September 2023 di Cupertino, California.\n",
|
||||||
|
"Pada hari berikutnya, ia menghadiri pertemuan investor di San Francisco.\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"TOKENS:\n",
|
||||||
|
"['CEO', 'Apple', ',', 'Tim', 'Cook', ',', 'meluncurkan', 'iPhone', '15', 'pada', '12', 'September', '2023', 'di', 'Cupertino', ',', 'California', '.', 'Pada', 'hari', 'berikutnya', ',', 'ia', 'menghadiri', 'pertemuan', 'investor', 'di', 'San', 'Francisco', '.']\n",
|
||||||
|
"\n",
|
||||||
|
"POS TAGS (30 pertama):\n",
|
||||||
|
"[('CEO', 'NNP'), ('Apple', 'NNP'), (',', ','), ('Tim', 'NNP'), ('Cook', 'NNP'), (',', ','), ('meluncurkan', 'VBD'), ('iPhone', 'NN'), ('15', 'CD'), ('pada', 'NN'), ('12', 'CD'), ('September', 'NNP'), ('2023', 'CD'), ('di', 'NN'), ('Cupertino', 'NNP'), (',', ','), ('California', 'NNP'), ('.', '.'), ('Pada', 'NNP'), ('hari', 'NN'), ('berikutnya', 'NN'), (',', ','), ('ia', 'JJ'), ('menghadiri', 'NN'), ('pertemuan', 'JJ'), ('investor', 'NN'), ('di', 'JJ'), ('San', 'NNP'), ('Francisco', 'NNP'), ('.', '.')]\n",
|
||||||
|
"\n",
|
||||||
|
"CANDIDATE ENTITIES (sederhana):\n",
|
||||||
|
"- Apple\n",
|
||||||
|
"- Tim Cook\n",
|
||||||
|
"- September\n",
|
||||||
|
"- Cupertino\n",
|
||||||
|
"- California\n",
|
||||||
|
"- Pada\n",
|
||||||
|
"- San Francisco\n",
|
||||||
|
"\n",
|
||||||
|
"TANGGAL YANG DITEMUKAN:\n",
|
||||||
|
"- 12 September 2023\n",
|
||||||
|
"\n",
|
||||||
|
"TANGGAL (FORMAT ISO 8601):\n",
|
||||||
|
"- 2023-09-12\n",
|
||||||
|
"\n",
|
||||||
|
"EVENT PELUNCURAN PRODUK:\n",
|
||||||
|
" Perusahaan : CEO Apple, Tim Cook\n",
|
||||||
|
" Produk : iPhone 15\n",
|
||||||
|
" Tanggal : 12 September 2023\n",
|
||||||
|
" Lokasi : September\n",
|
||||||
|
"\n",
|
||||||
|
"TEMPLATE EVENT (hasil akhir IE):\n",
|
||||||
|
"{'Company': 'CEO Apple, Tim Cook', 'Product': 'iPhone 15', 'LaunchDateOriginal': '12 September 2023', 'LaunchDateISO': '2023-09-12', 'Location': 'September'}\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# =========================\n",
|
||||||
|
"# NLP – Information Extraction (IE)\n",
|
||||||
|
"# =========================\n",
|
||||||
|
"\n",
|
||||||
|
"# 1. Cek versi & instalasi dasar\n",
|
||||||
|
"import sys\n",
|
||||||
|
"import nltk\n",
|
||||||
|
"print(\"Python:\", sys.version)\n",
|
||||||
|
"print(\"NLTK :\", nltk.__version__)\n",
|
||||||
|
"\n",
|
||||||
|
"# Jika di environment belum ada NLTK terbaru, aktifkan baris di bawah:\n",
|
||||||
|
"# !{sys.executable} -m pip install --upgrade nltk\n",
|
||||||
|
"\n",
|
||||||
|
"# 2. Import pustaka\n",
|
||||||
|
"import re\n",
|
||||||
|
"from nltk.tokenize import wordpunct_tokenize\n",
|
||||||
|
"from nltk import pos_tag\n",
|
||||||
|
"\n",
|
||||||
|
"# 3. Download resource NLTK yang stabil\n",
|
||||||
|
"nltk.download('averaged_perceptron_tagger', quiet=True)\n",
|
||||||
|
"try:\n",
|
||||||
|
" nltk.download('punkt', quiet=True)\n",
|
||||||
|
" print(\"Resource punkt siap dipakai\")\n",
|
||||||
|
"except:\n",
|
||||||
|
" print(\"Gagal download punkt, akan tetap pakai wordpunct_tokenize\")\n",
|
||||||
|
"\n",
|
||||||
|
"# 4. Contoh teks (bisa diganti)\n",
|
||||||
|
"text = \"\"\"\n",
|
||||||
|
"CEO Apple, Tim Cook, meluncurkan iPhone 15 pada 12 September 2023 di Cupertino, California.\n",
|
||||||
|
"Pada hari berikutnya, ia menghadiri pertemuan investor di San Francisco.\n",
|
||||||
|
"\"\"\"\n",
|
||||||
|
"print(\"TEKS CONTOH:\")\n",
|
||||||
|
"print(text)\n",
|
||||||
|
"\n",
|
||||||
|
"# 5. Tokenisasi dan POS tagging\n",
|
||||||
|
"tokens = wordpunct_tokenize(text)\n",
|
||||||
|
"print(\"\\nTOKENS:\")\n",
|
||||||
|
"print(tokens)\n",
|
||||||
|
"\n",
|
||||||
|
"pos_tags = pos_tag(tokens)\n",
|
||||||
|
"print(\"\\nPOS TAGS (30 pertama):\")\n",
|
||||||
|
"print(pos_tags[:30])\n",
|
||||||
|
"\n",
|
||||||
|
"# 6. Ekstraksi “entitas” sederhana (berbasis huruf kapital)\n",
|
||||||
|
"def simple_capital_ner(tokens):\n",
|
||||||
|
" entities = []\n",
|
||||||
|
" current = []\n",
|
||||||
|
" for tok in tokens:\n",
|
||||||
|
" if tok.istitle() and tok.isalpha():\n",
|
||||||
|
" current.append(tok)\n",
|
||||||
|
" else:\n",
|
||||||
|
" if current:\n",
|
||||||
|
" entities.append(\" \".join(current))\n",
|
||||||
|
" current = []\n",
|
||||||
|
" if current:\n",
|
||||||
|
" entities.append(\" \".join(current))\n",
|
||||||
|
" return entities\n",
|
||||||
|
"\n",
|
||||||
|
"candidate_entities = simple_capital_ner(tokens)\n",
|
||||||
|
"print(\"\\nCANDIDATE ENTITIES (sederhana):\")\n",
|
||||||
|
"for e in candidate_entities:\n",
|
||||||
|
" print(\"-\", e)\n",
|
||||||
|
"\n",
|
||||||
|
"# 7. Ekstraksi tanggal (Ekstraksi Waktu) dengan RegEx\n",
|
||||||
|
"bulan_id = r\"(Januari|Februari|Maret|April|Mei|Juni|Juli|Agustus|September|Oktober|November|Desember)\"\n",
|
||||||
|
"date_pattern = rf\"\\b(\\d{{1,2}})\\s+{bulan_id}\\s+(\\d{{4}})\\b\"\n",
|
||||||
|
"\n",
|
||||||
|
"dates = re.findall(date_pattern, text)\n",
|
||||||
|
"print(\"\\nTANGGAL YANG DITEMUKAN:\")\n",
|
||||||
|
"for d in dates:\n",
|
||||||
|
" print(\"-\", \" \".join(d))\n",
|
||||||
|
"\n",
|
||||||
|
"# 8. Normalisasi tanggal ke format ISO 8601 (YYYY-MM-DD)\n",
|
||||||
|
"bulan_map = {\n",
|
||||||
|
" \"Januari\": \"01\",\n",
|
||||||
|
" \"Februari\": \"02\",\n",
|
||||||
|
" \"Maret\": \"03\",\n",
|
||||||
|
" \"April\": \"04\",\n",
|
||||||
|
" \"Mei\": \"05\",\n",
|
||||||
|
" \"Juni\": \"06\",\n",
|
||||||
|
" \"Juli\": \"07\",\n",
|
||||||
|
" \"Agustus\": \"08\",\n",
|
||||||
|
" \"September\": \"09\",\n",
|
||||||
|
" \"Oktober\": \"10\",\n",
|
||||||
|
" \"November\": \"11\",\n",
|
||||||
|
" \"Desember\": \"12\",\n",
|
||||||
|
"}\n",
|
||||||
|
"\n",
|
||||||
|
"normalized_dates = []\n",
|
||||||
|
"for hari, bulan, tahun in dates:\n",
|
||||||
|
" bulan_num = bulan_map[bulan]\n",
|
||||||
|
" hari_num = hari.zfill(2)\n",
|
||||||
|
" iso = f\"{tahun}-{bulan_num}-{hari_num}\"\n",
|
||||||
|
" normalized_dates.append(iso)\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"\\nTANGGAL (FORMAT ISO 8601):\")\n",
|
||||||
|
"for iso in normalized_dates:\n",
|
||||||
|
" print(\"-\", iso)\n",
|
||||||
|
"\n",
|
||||||
|
"# 9. Ekstraksi event peluncuran produk (pattern-based)\n",
|
||||||
|
"pattern = r\"(.+?) meluncurkan (.+?) pada (\\d{1,2} \" + bulan_id + r\" \\d{4}) di (.+?)\\.\"\n",
|
||||||
|
"match = re.search(pattern, text)\n",
|
||||||
|
"\n",
|
||||||
|
"if match:\n",
|
||||||
|
" company = match.group(1).strip(\" ,\")\n",
|
||||||
|
" product = match.group(2).strip(\" ,\")\n",
|
||||||
|
" tgl_str = match.group(3)\n",
|
||||||
|
" location = match.group(4).strip(\" ,\")\n",
|
||||||
|
"\n",
|
||||||
|
" print(\"\\nEVENT PELUNCURAN PRODUK:\")\n",
|
||||||
|
" print(\" Perusahaan :\", company)\n",
|
||||||
|
" print(\" Produk :\", product)\n",
|
||||||
|
" print(\" Tanggal :\", tgl_str)\n",
|
||||||
|
" print(\" Lokasi :\", location)\n",
|
||||||
|
"else:\n",
|
||||||
|
" company = product = tgl_str = location = None\n",
|
||||||
|
" print(\"\\nPola event peluncuran produk tidak ditemukan.\")\n",
|
||||||
|
"\n",
|
||||||
|
"# 10. Template IE (Template Extraction)\n",
|
||||||
|
"event_template = {\n",
|
||||||
|
" \"Company\": company,\n",
|
||||||
|
" \"Product\": product,\n",
|
||||||
|
" \"LaunchDateOriginal\": tgl_str,\n",
|
||||||
|
" \"LaunchDateISO\": normalized_dates[0] if normalized_dates else None,\n",
|
||||||
|
" \"Location\": location,\n",
|
||||||
|
"}\n",
|
||||||
|
"\n",
|
||||||
|
"print(\"\\nTEMPLATE EVENT (hasil akhir IE):\")\n",
|
||||||
|
"print(event_template)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "1e128113-af1e-45a1-8586-48c4acf578b4",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.12.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
Loading…
x
Reference in New Issue
Block a user