diff --git a/.virtual_documents/NLP/Untitled.ipynb b/.virtual_documents/NLP/Untitled.ipynb index 1ebdb71..1d837fd 100644 --- a/.virtual_documents/NLP/Untitled.ipynb +++ b/.virtual_documents/NLP/Untitled.ipynb @@ -1,83 +1,132 @@ +# ========================= +# NLP – Information Extraction (Demo S1) +# ========================= +# 1. Cek versi & instalasi dasar +import sys +import nltk +print("Python:", sys.version) +print("NLTK :", nltk.__version__) +# Jika di environment belum ada NLTK terbaru, aktifkan baris di bawah: +# !{sys.executable} -m pip install --upgrade nltk -# --------------------------------------------------------- -# Klasifikasi Teks dengan TF-IDF + Feedforward Neural Network -# --------------------------------------------------------- +# 2. Import pustaka +import re +from nltk.tokenize import wordpunct_tokenize +from nltk import pos_tag -import pandas as pd -from sklearn.model_selection import train_test_split -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.neural_network import MLPClassifier -from sklearn.metrics import classification_report, confusion_matrix +# 3. Download resource NLTK yang stabil +nltk.download('averaged_perceptron_tagger', quiet=True) +try: + nltk.download('punkt', quiet=True) + print("Resource punkt siap dipakai") +except: + print("Gagal download punkt, akan tetap pakai wordpunct_tokenize") -# ----------------------------------------- -# 1. Contoh Dataset -# ----------------------------------------- -# Anda bisa mengganti dataset ini dengan dataset lain (CSV, JSON, dll) +# 4. Contoh teks (bisa diganti) +text = """ +CEO Apple, Tim Cook, meluncurkan iPhone 15 pada 12 September 2023 di Cupertino, California. +Pada hari berikutnya, ia menghadiri pertemuan investor di San Francisco. +""" +print("TEKS CONTOH:") +print(text) -data = { - "text": [ - "Saya suka produk ini, luar biasa", - "Layanannya buruk, sangat kecewa", - "Pembelian terbaik yang pernah saya lakukan", - "Saya benci produk ini, buang-buang uang", - "Kualitasnya sangat bagus, direkomendasikan", - "Pengalaman buruk, tidak akan membeli lagi" - ], - "label": ["positive", "negative", "positive", "negative", "positive", "negative"] +# 5. Tokenisasi dan POS tagging +tokens = wordpunct_tokenize(text) +print("\nTOKENS:") +print(tokens) + +pos_tags = pos_tag(tokens) +print("\nPOS TAGS (30 pertama):") +print(pos_tags[:30]) + +# 6. Ekstraksi “entitas” sederhana (berbasis huruf kapital) +def simple_capital_ner(tokens): + entities = [] + current = [] + for tok in tokens: + if tok.istitle() and tok.isalpha(): + current.append(tok) + else: + if current: + entities.append(" ".join(current)) + current = [] + if current: + entities.append(" ".join(current)) + return entities + +candidate_entities = simple_capital_ner(tokens) +print("\nCANDIDATE ENTITIES (sederhana):") +for e in candidate_entities: + print("-", e) + +# 7. Ekstraksi tanggal (Ekstraksi Waktu) dengan regex +bulan_id = r"(Januari|Februari|Maret|April|Mei|Juni|Juli|Agustus|September|Oktober|November|Desember)" +date_pattern = rf"\b(\d{{1,2}})\s+{bulan_id}\s+(\d{{4}})\b" + +dates = re.findall(date_pattern, text) +print("\nTANGGAL YANG DITEMUKAN:") +for d in dates: + print("-", " ".join(d)) + +# 8. Normalisasi tanggal ke format ISO 8601 (YYYY-MM-DD) +bulan_map = { + "Januari": "01", + "Februari": "02", + "Maret": "03", + "April": "04", + "Mei": "05", + "Juni": "06", + "Juli": "07", + "Agustus": "08", + "September": "09", + "Oktober": "10", + "November": "11", + "Desember": "12", } -df = pd.DataFrame(data) +normalized_dates = [] +for hari, bulan, tahun in dates: + bulan_num = bulan_map[bulan] + hari_num = hari.zfill(2) + iso = f"{tahun}-{bulan_num}-{hari_num}" + normalized_dates.append(iso) -# ----------------------------------------- -# 2. Split Train & Test -# ----------------------------------------- -X_train, X_test, y_train, y_test = train_test_split( - df["text"], df["label"], test_size=0.3, random_state=42 -) +print("\nTANGGAL (FORMAT ISO 8601):") +for iso in normalized_dates: + print("-", iso) -# ----------------------------------------- -# 3. TF-IDF Vectorization -# ----------------------------------------- -tfidf = TfidfVectorizer(max_features=5000) -X_train_tfidf = tfidf.fit_transform(X_train) -X_test_tfidf = tfidf.transform(X_test) +# 9. Ekstraksi event peluncuran produk (pattern-based) +pattern = r"(.+?) meluncurkan (.+?) pada (\d{1,2} " + bulan_id + r" \d{4}) di (.+?)\." +match = re.search(pattern, text) -# ----------------------------------------- -# 4. Feedforward ANN (MLPClassifier) -# ----------------------------------------- -model = MLPClassifier( - hidden_layer_sizes=(256, 64), - activation='relu', - solver='adam', - max_iter=500, - random_state=42 -) +if match: + company = match.group(1).strip(" ,") + product = match.group(2).strip(" ,") + tgl_str = match.group(3) + location = match.group(4).strip(" ,") -model.fit(X_train_tfidf, y_train) + print("\nEVENT PELUNCURAN PRODUK:") + print(" Perusahaan :", company) + print(" Produk :", product) + print(" Tanggal :", tgl_str) + print(" Lokasi :", location) +else: + company = product = tgl_str = location = None + print("\nPola event peluncuran produk tidak ditemukan.") -# ----------------------------------------- -# 5. Evaluasi Model -# ----------------------------------------- -y_pred = model.predict(X_test_tfidf) +# 10. Template IE (Template Extraction) +event_template = { + "Company": company, + "Product": product, + "LaunchDateOriginal": tgl_str, + "LaunchDateISO": normalized_dates[0] if normalized_dates else None, + "Location": location, +} -print("=== Classification Report ===") -print(classification_report(y_test, y_pred)) - -print("=== Confusion Matrix ===") -print(confusion_matrix(y_test, y_pred)) - -# ----------------------------------------- -# 6. Prediksi Teks Baru -# ----------------------------------------- -sample_text = ["barang bagus luar biasa"] -sample_text = ["barang buruk, saya kecewa"] -sample_vec = tfidf.transform(sample_text) -prediction = model.predict(sample_vec) - -print("\nPrediksi untuk:", sample_text[0]) -print("Hasil:", prediction[0]) +print("\nTEMPLATE EVENT (hasil akhir IE):") +print(event_template) diff --git a/NLP/.ipynb_checkpoints/Information Extraction-checkpoint.ipynb b/NLP/.ipynb_checkpoints/Information Extraction-checkpoint.ipynb new file mode 100644 index 0000000..363fcab --- /dev/null +++ b/NLP/.ipynb_checkpoints/Information Extraction-checkpoint.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/NLP/Information Extraction.ipynb b/NLP/Information Extraction.ipynb new file mode 100644 index 0000000..92b4632 --- /dev/null +++ b/NLP/Information Extraction.ipynb @@ -0,0 +1,218 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "id": "7c7601d6-3c91-453e-8c29-706528237596", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Python: 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:50:58) [GCC 12.3.0]\n", + "NLTK : 3.9.2\n", + "Resource punkt siap dipakai\n", + "TEKS CONTOH:\n", + "\n", + "CEO Apple, Tim Cook, meluncurkan iPhone 15 pada 12 September 2023 di Cupertino, California.\n", + "Pada hari berikutnya, ia menghadiri pertemuan investor di San Francisco.\n", + "\n", + "\n", + "TOKENS:\n", + "['CEO', 'Apple', ',', 'Tim', 'Cook', ',', 'meluncurkan', 'iPhone', '15', 'pada', '12', 'September', '2023', 'di', 'Cupertino', ',', 'California', '.', 'Pada', 'hari', 'berikutnya', ',', 'ia', 'menghadiri', 'pertemuan', 'investor', 'di', 'San', 'Francisco', '.']\n", + "\n", + "POS TAGS (30 pertama):\n", + "[('CEO', 'NNP'), ('Apple', 'NNP'), (',', ','), ('Tim', 'NNP'), ('Cook', 'NNP'), (',', ','), ('meluncurkan', 'VBD'), ('iPhone', 'NN'), ('15', 'CD'), ('pada', 'NN'), ('12', 'CD'), ('September', 'NNP'), ('2023', 'CD'), ('di', 'NN'), ('Cupertino', 'NNP'), (',', ','), ('California', 'NNP'), ('.', '.'), ('Pada', 'NNP'), ('hari', 'NN'), ('berikutnya', 'NN'), (',', ','), ('ia', 'JJ'), ('menghadiri', 'NN'), ('pertemuan', 'JJ'), ('investor', 'NN'), ('di', 'JJ'), ('San', 'NNP'), ('Francisco', 'NNP'), ('.', '.')]\n", + "\n", + "CANDIDATE ENTITIES (sederhana):\n", + "- Apple\n", + "- Tim Cook\n", + "- September\n", + "- Cupertino\n", + "- California\n", + "- Pada\n", + "- San Francisco\n", + "\n", + "TANGGAL YANG DITEMUKAN:\n", + "- 12 September 2023\n", + "\n", + "TANGGAL (FORMAT ISO 8601):\n", + "- 2023-09-12\n", + "\n", + "EVENT PELUNCURAN PRODUK:\n", + " Perusahaan : CEO Apple, Tim Cook\n", + " Produk : iPhone 15\n", + " Tanggal : 12 September 2023\n", + " Lokasi : September\n", + "\n", + "TEMPLATE EVENT (hasil akhir IE):\n", + "{'Company': 'CEO Apple, Tim Cook', 'Product': 'iPhone 15', 'LaunchDateOriginal': '12 September 2023', 'LaunchDateISO': '2023-09-12', 'Location': 'September'}\n" + ] + } + ], + "source": [ + "# =========================\n", + "# NLP – Information Extraction (IE)\n", + "# =========================\n", + "\n", + "# 1. Cek versi & instalasi dasar\n", + "import sys\n", + "import nltk\n", + "print(\"Python:\", sys.version)\n", + "print(\"NLTK :\", nltk.__version__)\n", + "\n", + "# Jika di environment belum ada NLTK terbaru, aktifkan baris di bawah:\n", + "# !{sys.executable} -m pip install --upgrade nltk\n", + "\n", + "# 2. Import pustaka\n", + "import re\n", + "from nltk.tokenize import wordpunct_tokenize\n", + "from nltk import pos_tag\n", + "\n", + "# 3. Download resource NLTK yang stabil\n", + "nltk.download('averaged_perceptron_tagger', quiet=True)\n", + "try:\n", + " nltk.download('punkt', quiet=True)\n", + " print(\"Resource punkt siap dipakai\")\n", + "except:\n", + " print(\"Gagal download punkt, akan tetap pakai wordpunct_tokenize\")\n", + "\n", + "# 4. Contoh teks (bisa diganti)\n", + "text = \"\"\"\n", + "CEO Apple, Tim Cook, meluncurkan iPhone 15 pada 12 September 2023 di Cupertino, California.\n", + "Pada hari berikutnya, ia menghadiri pertemuan investor di San Francisco.\n", + "\"\"\"\n", + "print(\"TEKS CONTOH:\")\n", + "print(text)\n", + "\n", + "# 5. Tokenisasi dan POS tagging\n", + "tokens = wordpunct_tokenize(text)\n", + "print(\"\\nTOKENS:\")\n", + "print(tokens)\n", + "\n", + "pos_tags = pos_tag(tokens)\n", + "print(\"\\nPOS TAGS (30 pertama):\")\n", + "print(pos_tags[:30])\n", + "\n", + "# 6. Ekstraksi “entitas” sederhana (berbasis huruf kapital)\n", + "def simple_capital_ner(tokens):\n", + " entities = []\n", + " current = []\n", + " for tok in tokens:\n", + " if tok.istitle() and tok.isalpha():\n", + " current.append(tok)\n", + " else:\n", + " if current:\n", + " entities.append(\" \".join(current))\n", + " current = []\n", + " if current:\n", + " entities.append(\" \".join(current))\n", + " return entities\n", + "\n", + "candidate_entities = simple_capital_ner(tokens)\n", + "print(\"\\nCANDIDATE ENTITIES (sederhana):\")\n", + "for e in candidate_entities:\n", + " print(\"-\", e)\n", + "\n", + "# 7. Ekstraksi tanggal (Ekstraksi Waktu) dengan RegEx\n", + "bulan_id = r\"(Januari|Februari|Maret|April|Mei|Juni|Juli|Agustus|September|Oktober|November|Desember)\"\n", + "date_pattern = rf\"\\b(\\d{{1,2}})\\s+{bulan_id}\\s+(\\d{{4}})\\b\"\n", + "\n", + "dates = re.findall(date_pattern, text)\n", + "print(\"\\nTANGGAL YANG DITEMUKAN:\")\n", + "for d in dates:\n", + " print(\"-\", \" \".join(d))\n", + "\n", + "# 8. Normalisasi tanggal ke format ISO 8601 (YYYY-MM-DD)\n", + "bulan_map = {\n", + " \"Januari\": \"01\",\n", + " \"Februari\": \"02\",\n", + " \"Maret\": \"03\",\n", + " \"April\": \"04\",\n", + " \"Mei\": \"05\",\n", + " \"Juni\": \"06\",\n", + " \"Juli\": \"07\",\n", + " \"Agustus\": \"08\",\n", + " \"September\": \"09\",\n", + " \"Oktober\": \"10\",\n", + " \"November\": \"11\",\n", + " \"Desember\": \"12\",\n", + "}\n", + "\n", + "normalized_dates = []\n", + "for hari, bulan, tahun in dates:\n", + " bulan_num = bulan_map[bulan]\n", + " hari_num = hari.zfill(2)\n", + " iso = f\"{tahun}-{bulan_num}-{hari_num}\"\n", + " normalized_dates.append(iso)\n", + "\n", + "print(\"\\nTANGGAL (FORMAT ISO 8601):\")\n", + "for iso in normalized_dates:\n", + " print(\"-\", iso)\n", + "\n", + "# 9. Ekstraksi event peluncuran produk (pattern-based)\n", + "pattern = r\"(.+?) meluncurkan (.+?) pada (\\d{1,2} \" + bulan_id + r\" \\d{4}) di (.+?)\\.\"\n", + "match = re.search(pattern, text)\n", + "\n", + "if match:\n", + " company = match.group(1).strip(\" ,\")\n", + " product = match.group(2).strip(\" ,\")\n", + " tgl_str = match.group(3)\n", + " location = match.group(4).strip(\" ,\")\n", + "\n", + " print(\"\\nEVENT PELUNCURAN PRODUK:\")\n", + " print(\" Perusahaan :\", company)\n", + " print(\" Produk :\", product)\n", + " print(\" Tanggal :\", tgl_str)\n", + " print(\" Lokasi :\", location)\n", + "else:\n", + " company = product = tgl_str = location = None\n", + " print(\"\\nPola event peluncuran produk tidak ditemukan.\")\n", + "\n", + "# 10. Template IE (Template Extraction)\n", + "event_template = {\n", + " \"Company\": company,\n", + " \"Product\": product,\n", + " \"LaunchDateOriginal\": tgl_str,\n", + " \"LaunchDateISO\": normalized_dates[0] if normalized_dates else None,\n", + " \"Location\": location,\n", + "}\n", + "\n", + "print(\"\\nTEMPLATE EVENT (hasil akhir IE):\")\n", + "print(event_template)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1e128113-af1e-45a1-8586-48c4acf578b4", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}