# K-Means Clustering: Jumlah Pendidik SMA 2024
# Analisis Pengelompokan Wilayah Berdasarkan Data Pendidik

# ==========================================
# 1. IMPORT LIBRARY
# ==========================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score
import warnings
from google.colab import files # Tetap diperlukan untuk fungsionalitas Colab

# Mengabaikan warning dan set style
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# ==========================================
# 2. LOAD DATA
# ==========================================
# Data contoh (Hapus jika Anda menggunakan file upload)
data_contoh = {
    'wilayah': ['DKI Jakarta', 'Jawa Barat', 'Jawa Tengah', 'Jawa Timur',
                'Sumatera Utara', 'Banten', 'Sulawesi Selatan', 'Kalimantan Timur',
                'Papua', 'Maluku', 'Bali', 'NTB', 'Lampung', 'Riau',
                'Sulawesi Utara', 'Kalimantan Selatan', 'Jambi', 'Bengkulu',
                'Aceh', 'Sumatera Barat', 'NTT', 'Papua Barat', 'Gorontalo', 'Maluku Utara'],
    'jumlah_pendidik': [15000, 25000, 20000, 23000, 12000, 10000, 8000, 5000,
                        3000, 2500, 6000, 4500, 7000, 6500, 4000, 5500, 4200, 3500,
                        8500, 7500, 3800, 2000, 1800, 2200],
    'jumlah_sekolah': [450, 850, 750, 800, 400, 350, 300, 200,
                       150, 120, 250, 180, 280, 260, 170, 220, 190, 160,
                       320, 300, 190, 100, 90, 110]
}
df = pd.DataFrame(data_contoh)

# Hitung rasio guru per sekolah
df['rasio_guru_per_sekolah'] = df['jumlah_pendidik'] / df['jumlah_sekolah']

# ==========================================
# 3. EKSPLORASI DATA RINGKAS
# ==========================================
print("="*60)
print("EKSPLORASI DATA AWAL")
print("="*60)
print(f"\nJumlah data: {len(df)} wilayah. Tidak ada missing value.\n")
print(df.head().to_markdown(index=False, numalign="left", stralign="left"))
print("\nStatistik Deskriptif:\n", df.describe().round(1).to_markdown(numalign="left", stralign="left"))

# ==========================================
# 4. VISUALISASI DATA AWAL (Disederhanakan)
# ==========================================
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
fig.suptitle('Eksplorasi Data Pendidik SMA 2024', fontsize=16, fontweight='bold')

# Plot 1: Scatter plot
axes[0].scatter(df['jumlah_sekolah'], df['jumlah_pendidik'], alpha=0.7, s=100)
axes[0].set_xlabel('Jumlah Sekolah')
axes[0].set_ylabel('Jumlah Pendidik')
axes[0].set_title('Distribusi Pendidik vs Sekolah')

# Plot 2: Distribusi rasio guru per sekolah
sns.histplot(df['rasio_guru_per_sekolah'], bins=15, ax=axes[1], kde=True)
axes[1].set_xlabel('Rasio Guru per Sekolah')
axes[1].set_title('Distribusi Rasio Guru per Sekolah')

# Plot 3: Box Plot Jumlah Pendidik dan Sekolah (Menggunakan Seaborn untuk kemudahan)
sns.boxplot(data=df[['jumlah_pendidik', 'jumlah_sekolah']], ax=axes[2],
            orient='v', palette=['#FFA07A', '#98D8C8'])
axes[2].set_title('Box Plot Outlier')

plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

# ==========================================
# 5. PERSIAPAN DATA UNTUK CLUSTERING
# ==========================================
print("\n" + "="*60)
print("PERSIAPAN DATA")
print("="*60)

# Pilih fitur, Normalisasi, dan Persiapan K-Means
features = ['jumlah_pendidik', 'jumlah_sekolah']
X = df[features].values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(f"\nFitur yang digunakan: {features}")
print(f"Shape data: {X.shape}")
print("Data telah dinormalisasi menggunakan StandardScaler.")

# ==========================================
# 6. MENENTUKAN JUMLAH CLUSTER OPTIMAL
# ==========================================
print("\n" + "="*60)
print("MENENTUKAN JUMLAH CLUSTER OPTIMAL")
print("="*60)

inertias = []
silhouette_scores = []
K_range = range(2, 11)

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
    kmeans.fit(X_scaled)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_scaled, kmeans.labels_))

# Visualisasi Elbow Method dan Silhouette Score
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Elbow Method
axes[0].plot(K_range, inertias, 'bo-')
axes[0].set_xlabel('Jumlah Cluster (K)')
axes[0].set_ylabel('Inertia')
axes[0].set_title('Elbow Method')
axes[0].axvline(x=3, color='r', linestyle='--', label='K=3 (Rekomendasi)')
axes[0].legend()

# Silhouette Score
axes[1].plot(K_range, silhouette_scores, 'go-')
axes[1].set_xlabel('Jumlah Cluster (K)')
axes[1].set_ylabel('Silhouette Score')
axes[1].set_title('Silhouette Score')
axes[1].axvline(x=3, color='r', linestyle='--', label='K=3 (Rekomendasi)')
axes[1].legend()

plt.show()

# Tampilkan tabel evaluasi
eval_df = pd.DataFrame({'K': list(K_range), 'Inertia': inertias, 'Silhouette Score': silhouette_scores})
print("\nEvaluasi untuk berbagai K:\n", eval_df.to_markdown(index=False))

# ==========================================
# 7. IMPLEMENTASI K-MEANS CLUSTERING
# ==========================================
print("\n" + "="*60)
print("IMPLEMENTASI K-MEANS CLUSTERING (K=3)")
print("="*60)

optimal_k = 3
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init='auto')
df['cluster'] = kmeans.fit_predict(X_scaled)

# Metrik Evaluasi Akhir
s_score = silhouette_score(X_scaled, df['cluster'])
db_score = davies_bouldin_score(X_scaled, df['cluster'])
print(f"\nClustering selesai! Iterasi: {kmeans.n_iter_}")
print(f"Inertia: {kmeans.inertia_:.2f} | Silhouette Score: {s_score:.3f} | Davies-Bouldin Index: {db_score:.3f}")

# ==========================================
# 8. ANALISIS HASIL CLUSTERING RINGKAS
# ==========================================
print("\n" + "="*60)
print("ANALISIS HASIL CLUSTERING")
print("="*60)

# Buat label cluster
cluster_labels = {}
# Menghitung rata-rata pendidik untuk penentuan label
avg_pendidik = df.groupby('cluster')['jumlah_pendidik'].mean()

for i in range(optimal_k):
    if avg_pendidik[i] > 15000:
        cluster_labels[i] = 'Kepadatan Tinggi'
    elif avg_pendidik[i] > 7000:
        cluster_labels[i] = 'Kepadatan Sedang'
    else:
        cluster_labels[i] = 'Kepadatan Rendah'

df['label_cluster'] = df['cluster'].map(cluster_labels)

# Ringkasan per cluster
summary = df.groupby('cluster').agg(
    wilayah=('wilayah', 'count'),
    pendidik_avg=('jumlah_pendidik', 'mean'),
    sekolah_avg=('jumlah_sekolah', 'mean'),
    rasio_avg=('rasio_guru_per_sekolah', 'mean')
).round(2).rename(index=cluster_labels)

print("\nRingkasan Statistik per Cluster:\n", summary.to_markdown(numalign="left", stralign="left"))

# Tampilkan wilayah per cluster
print("\nWilayah dalam setiap Cluster:")
for i in range(optimal_k):
    wilayah_list = df[df['cluster'] == i]['wilayah'].values
    print(f"  • Cluster {i} ({cluster_labels[i]}): {', '.join(wilayah_list)}")


# ==========================================
# 9. VISUALISASI HASIL CLUSTERING (Disederhanakan)
# ==========================================
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
fig.suptitle('Hasil K-Means Clustering: Pendidik SMA 2024', fontsize=16, fontweight='bold')

# Plot 1: Scatter plot dengan cluster
centroids = scaler.inverse_transform(kmeans.cluster_centers_)

sns.scatterplot(data=df, x='jumlah_sekolah', y='jumlah_pendidik', hue='label_cluster',
                style='label_cluster', s=150, ax=axes[0], palette='viridis', legend='full',
                edgecolor='black')
axes[0].scatter(centroids[:, 1], centroids[:, 0], marker='X', s=400, c='red',
                  edgecolors='black', linewidth=2, label='Centroid', zorder=5)
axes[0].set_xlabel('Jumlah Sekolah')
axes[0].set_ylabel('Jumlah Pendidik')
axes[0].set_title('Scatter Plot: Hasil Clustering (Pendidik vs Sekolah)')
axes[0].legend(title='Cluster')


# Plot 2: Box plot rasio guru per sekolah per cluster
sns.boxplot(data=df, x='label_cluster', y='rasio_guru_per_sekolah', ax=axes[1],
            palette='viridis')
axes[1].set_xlabel('Cluster')
axes[1].set_ylabel('Rasio Guru per Sekolah')
axes[1].set_title('Perbandingan Rasio Guru per Sekolah per Cluster')

plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()

# ==========================================
# 10. INTERPRETASI DAN REKOMENDASI RINGKAS
# ==========================================
print("\n" + "="*60)
print("INTERPRETASI & REKOMENDASI")
print("="*60)

for cluster, label in cluster_labels.items():
    avg_rasio = summary.loc[label, 'rasio_avg']
    print(f"\nCluster: {cluster} - {label} (Rasio rata-rata: {avg_rasio:.2f})")

    if avg_rasio < 25:
        print("   REKOMENDASI: **PENAMBAHAN guru** (Rasio di bawah standar ideal 25-30)")
    elif avg_rasio > 35:
        print("   REKOMENDASI: **SURPLUS guru** (Pertimbangkan redistribusi)")
    else:
        print("   ✓ STATUS: Rasio guru sudah dalam **batas ideal**.")

# ==========================================
# 11. EXPORT HASIL
# ==========================================
print("\n" + "="*60)
print("EXPORT HASIL")
print("="*60)

hasil_df = df[['wilayah', 'jumlah_pendidik', 'jumlah_sekolah',
               'rasio_guru_per_sekolah', 'cluster', 'label_cluster']].sort_values('cluster')
hasil_df.to_csv('hasil_clustering_pendidik_sma_2024.csv', index=False)
print("Hasil clustering telah disimpan ke: hasil_clustering_pendidik_sma_2024.csv")

try:
    files.download('hasil_clustering_pendidik_sma_2024.csv')
except Exception as e:
    print("Gagal auto-download. Pastikan Anda menjalankan kode di Google Colab.")

print("\n" + "="*60)
print("ANALISIS SELESAI!")
print("="*60)


OUTPUT

============================================================
EKSPLORASI DATA AWAL
============================================================

Jumlah data: 24 wilayah. Tidak ada missing value.

| wilayah        | jumlah_pendidik   | jumlah_sekolah   | rasio_guru_per_sekolah   |
|:---------------|:------------------|:-----------------|:-------------------------|
| DKI Jakarta    | 15000             | 450              | 33.3333                  |
| Jawa Barat     | 25000             | 850              | 29.4118                  |
| Jawa Tengah    | 20000             | 750              | 26.6667                  |
| Jawa Timur     | 23000             | 800              | 28.75                    |
| Sumatera Utara | 12000             | 400              | 30                       |

Statistik Deskriptif:
 |       | jumlah_pendidik   | jumlah_sekolah   | rasio_guru_per_sekolah   |
|:------|:------------------|:-----------------|:-------------------------|
| count | 24                | 24               | 24                       |
| mean  | 7937.5            | 299.6            | 24.7                     |
| std   | 6573.4            | 214.8            | 3.7                      |
| min   | 1800              | 90               | 20                       |
| 25%   | 3725              | 167.5            | 21.6                     |
| 50%   | 5750              | 235              | 25                       |
| 75%   | 8875              | 327.5            | 26.7                     |
| max   | 25000             | 850              | 33.3                     |


============================================================
PERSIAPAN DATA
============================================================

Fitur yang digunakan: ['jumlah_pendidik', 'jumlah_sekolah']
Shape data: (24, 2)
Data telah dinormalisasi menggunakan StandardScaler.

============================================================
MENENTUKAN JUMLAH CLUSTER OPTIMAL
============================================================


Evaluasi untuk berbagai K:
 |   K |   Inertia |   Silhouette Score |
|----:|----------:|-------------------:|
|   2 | 10.4674   |           0.731571 |
|   3 |  3.67937  |           0.594546 |
|   4 |  1.82605  |           0.585692 |
|   5 |  1.17329  |           0.557479 |
|   6 |  0.830915 |           0.501176 |
|   7 |  0.532124 |           0.472502 |
|   8 |  0.399343 |           0.420903 |
|   9 |  0.322782 |           0.386624 |
|  10 |  0.249896 |           0.353665 |

============================================================
IMPLEMENTASI K-MEANS CLUSTERING (K=3)
============================================================

Clustering selesai! Iterasi: 6
Inertia: 3.68 | Silhouette Score: 0.595 | Davies-Bouldin Index: 0.464

============================================================
ANALISIS HASIL CLUSTERING
============================================================

Ringkasan Statistik per Cluster:
 | cluster          | wilayah   | pendidik_avg   | sekolah_avg   | rasio_avg   |
|:-----------------|:----------|:---------------|:--------------|:------------|
| Kepadatan Rendah | 14        | 3892.86        | 170.71        | 22.31       |
| Kepadatan Tinggi | 3         | 22666.7        | 800           | 28.28       |
| Kepadatan Sedang | 7         | 9714.29        | 342.86        | 27.88       |

Wilayah dalam setiap Cluster:
  • Cluster 0 (Kepadatan Rendah): Kalimantan Timur, Papua, Maluku, Bali, NTB, Riau, Sulawesi Utara, Kalimantan Selatan, Jambi, Bengkulu, NTT, Papua Barat, Gorontalo, Maluku Utara
  • Cluster 1 (Kepadatan Tinggi): Jawa Barat, Jawa Tengah, Jawa Timur
  • Cluster 2 (Kepadatan Sedang): DKI Jakarta, Sumatera Utara, Banten, Sulawesi Selatan, Lampung, Aceh, Sumatera Barat


============================================================
INTERPRETASI & REKOMENDASI
============================================================

Cluster: 0 - Kepadatan Rendah (Rasio rata-rata: 22.31)
   REKOMENDASI: **PENAMBAHAN guru** (Rasio di bawah standar ideal 25-30)

Cluster: 1 - Kepadatan Tinggi (Rasio rata-rata: 28.28)
   ✓ STATUS: Rasio guru sudah dalam **batas ideal**.

Cluster: 2 - Kepadatan Sedang (Rasio rata-rata: 27.88)
   ✓ STATUS: Rasio guru sudah dalam **batas ideal**.

============================================================
EXPORT HASIL
============================================================
Hasil clustering telah disimpan ke: hasil_clustering_pendidik_sma_2024.csv

============================================================
ANALISIS SELESAI!
============================================================