import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import classification_report, f1_score

# Manual Jaccard score computation (for older scikit-learn versions)
def jaccard_manual(y_true, y_pred):
    """Return the Jaccard index of the positive class (label ``1``).

    The score is ``|true == 1 AND pred == 1| / |true == 1 OR pred == 1|``.

    Args:
        y_true: Array-like of ground-truth labels.
        y_pred: Array-like of predicted labels, same length as ``y_true``.

    Returns:
        The intersection-over-union of the positions labelled ``1``, or
        ``0.0`` when neither input contains the label ``1``.
    """
    # Coerce to arrays so the comparison is element-wise even for plain
    # Python lists (``[1, 0] == 1`` would otherwise be a single ``False``).
    true_positive_mask = np.asarray(y_true) == 1
    pred_positive_mask = np.asarray(y_pred) == 1

    union = np.logical_or(true_positive_mask, pred_positive_mask).sum()
    if union == 0:
        # No positive label on either side: the ratio is undefined, report 0.
        return 0.0
    intersection = np.logical_and(true_positive_mask, pred_positive_mask).sum()
    return intersection / union

# ================================
# 1. Load Data
# ================================
cell_df = pd.read_csv("cell_samples.csv")

# ================================
# 2. Clean the BareNuc column
# ================================
# Keep only the rows whose BareNuc value can be parsed as a number
# (unparseable entries become NaN under errors='coerce' and are dropped).
valid_bare_nuc = pd.to_numeric(cell_df['BareNuc'], errors='coerce').notnull()
# Take an explicit copy of the filtered frame so the column assignment below
# modifies an independent DataFrame instead of a slice of the original
# (avoids pandas' SettingWithCopyWarning).
cell_df = cell_df[valid_bare_nuc].copy()
cell_df['BareNuc'] = cell_df['BareNuc'].astype('int')

# ================================
# 3. Build features and labels
# ================================
# Columns used as model inputs, in the order they appear in the matrix X.
_feature_columns = [
    'Clump',
    'UnifSize',
    'UnifShape',
    'MargAdh',
    'SingEpiSize',
    'BareNuc',
    'BlandChrom',
    'NormNucl',
    'Mit',
]
feature_df = cell_df[_feature_columns].astype(float)
X = np.asarray(feature_df)

# Binary target: Class == 2 maps to 0 (benign), every other value to 1
# (malignant).
is_benign = cell_df['Class'] == 2
y = np.where(is_benign, 0, 1)

# ================================
# 4. Train/test split
# ================================
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)

# ================================
# 5. SVM model with a LINEAR kernel
# ================================
# fit() returns the fitted estimator itself, so construction and training
# can be chained.
model = svm.SVC(kernel='linear').fit(X_train, y_train)

# ================================
# 6. Prediction
# ================================
y_pred = model.predict(X_test)

# ================================
# 7. Evaluation
# ================================
# Weighted-average F1 across the classes.
weighted_f1 = f1_score(y_test, y_pred, average='weighted')
print("Avg F1-score:", weighted_f1)

# Jaccard index of the positive class (label 1).
positive_jaccard = jaccard_manual(y_test, y_pred)
print("Jaccard score:", positive_jaccard)

print("\nClassification Report:\n")
report = classification_report(y_test, y_pred)
print(report)