In [4]:
# =========================
# 1. IMPORT LIBRARY
# =========================
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
# =========================
# 2. DATA TEKS MANUAL
# =========================
texts = [
    "saya suka belajar data science",
    "machine learning adalah bagian dari data science",
    "belajar python sangat menyenangkan"
]

In [6]:
# =========================
# 3. PREPROCESSING (MODIFIKASI)
# =========================
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

texts_cleaned = [clean_text(t) for t in texts]

# **UNIGRAM**

In [7]:
unigram_vectorizer = CountVectorizer(ngram_range=(1,1))
X_uni = unigram_vectorizer.fit_transform(texts_cleaned)

df_unigram = pd.DataFrame(
    X_uni.toarray(),
    columns=unigram_vectorizer.get_feature_names_out()
)

print("=== UNIGRAM ===")
print(df_unigram)

=== UNIGRAM ===
   adalah  bagian  belajar  dari  data  learning  machine  menyenangkan  \
0       0       0        1     0     1         0        0             0   
1       1       1        0     1     1         1        1             0   
2       0       0        1     0     0         0        0             1   

   python  sangat  saya  science  suka  
0       0       0     1        1     1  
1       0       0     0        1     0  
2       1       1     0        0     0  


# **BIGRAM**

In [8]:
bigram_vectorizer = CountVectorizer(ngram_range=(2,2))
X_bi = bigram_vectorizer.fit_transform(texts_cleaned)

df_bigram = pd.DataFrame(
    X_bi.toarray(),
    columns=bigram_vectorizer.get_feature_names_out()
)

print("\n=== BIGRAM ===")
print(df_bigram)


=== BIGRAM ===
   adalah bagian  bagian dari  belajar data  belajar python  dari data  \
0              0            0             1               0          0   
1              1            1             0               0          1   
2              0            0             0               1          0   

   data science  learning adalah  machine learning  python sangat  \
0             1                0                 0              0   
1             1                1                 1              0   
2             0                0                 0              1   

   sangat menyenangkan  saya suka  suka belajar  
0                    0          1             1  
1                    0          0             0  
2                    1          0             0  


# **TRIGRAM**

In [9]:
combined_vectorizer = CountVectorizer(ngram_range=(1,2))
X_combined = combined_vectorizer.fit_transform(texts_cleaned)

df_combined = pd.DataFrame(
    X_combined.toarray(),
    columns=combined_vectorizer.get_feature_names_out()
)

print("\n=== UNIGRAM + BIGRAM ===")
print(df_combined)


=== UNIGRAM + BIGRAM ===
   adalah  adalah bagian  bagian  bagian dari  belajar  belajar data  \
0       0              0       0            0        1             1   
1       1              1       1            1        0             0   
2       0              0       0            0        1             0   

   belajar python  dari  dari data  data  ...  menyenangkan  python  \
0               0     0          0     1  ...             0       0   
1               0     1          1     1  ...             0       0   
2               1     0          0     0  ...             1       1   

   python sangat  sangat  sangat menyenangkan  saya  saya suka  science  suka  \
0              0       0                    0     1          1        1     1   
1              0       0                    0     0          0        1     0   
2              1       1                    1     0          0        0     0   

   suka belajar  
0             1  
1             0  
2             0  

[3