1. One-Hot Encoding
import pandas as pd
# One-hot encoding demo: every distinct animal becomes its own 0/1 column.
animal_names = ['Cat', 'Dog', 'Fish']
df = pd.DataFrame({'Animal': animal_names})

# Only the 'Animal' column is expanded; dtype=int yields 0/1 integers
# rather than booleans in the indicator columns.
df_one_hot = pd.get_dummies(data=df, columns=['Animal'], dtype=int)

print(df_one_hot)
Output:
Animal_Cat Animal_Dog Animal_Fish
0 1 0 0
1 0 1 0
2 0 0 1
2. Label Encoding
from sklearn.preprocessing import LabelEncoder
# Label encoding demo: each distinct animal name is replaced by an integer code.
df = pd.DataFrame({'Animal': ['Cat', 'Dog', 'Fish']})

label_encoder = LabelEncoder()
# Learn the set of classes and encode the column in a single pass.
animal_codes = label_encoder.fit_transform(df['Animal'])
# The original text column is kept; the codes go into a new column beside it.
df['Animal_Label'] = animal_codes

print(df)
Output:
Animal Animal_Label
0 Cat 0
1 Dog 1
2 Fish 2
3. Binary Encoding
!pip install category_encoders
import pandas as pd
from category_encoders import BinaryEncoder
# Binary encoding demo: the 'Color' column is replaced by a small set of
# 0/1 columns produced by category_encoders' BinaryEncoder.
color_names = ['Red', 'Blue', 'Green']
df = pd.DataFrame({'Color': color_names})

# Restrict the encoder to the 'Color' column explicitly.
encoder = BinaryEncoder(cols=['Color'])
df_binary = encoder.fit_transform(df)

print(df_binary)
Output:
Color_0 Color_1
0 0 1
1 1 0
2 1 1
4. Frequency Encoding
# Frequency encoding demo: each fruit is replaced by how many times it
# occurs in the column.
fruit_names = ['Apple', 'Banana', 'Apple', 'Orange', 'Banana']
df = pd.DataFrame({'Fruit': fruit_names})

# Tally the occurrences, then keep them as a plain {category: count} dict.
fruit_counts = df['Fruit'].value_counts()
frequency_map = fruit_counts.to_dict()

# Look every row's fruit up in the tally.
df['Fruit_Frequency'] = df['Fruit'].map(frequency_map)

print(df)
Output:
Fruit Fruit_Frequency
0 Apple 2
1 Banana 2
2 Apple 2
3 Orange 1
4 Banana 2
5. Target Encoding
# Target encoding demo: each city is replaced by the mean of the target
# column ('Sales') over the rows belonging to that city.
df = pd.DataFrame(
    {
        'City': ['New York', 'London', 'New York', 'Tokyo'],
        'Sales': [100, 200, 150, 300],
    }
)

# Per-city mean of 'Sales', indexed by city name.
# NOTE: the means are computed from the same rows they are mapped back onto.
sales_by_city = df.groupby('City')['Sales']
target_mean = sales_by_city.mean()

# Substitute each row's city with that city's mean.
df['City_Target_Encoding'] = df['City'].map(target_mean)

print(df)
Output:
City Sales City_Target_Encoding
0 New York 100 125.0
1 London 200 200.0
2 New York 150 125.0
3 Tokyo 300 300.0
6. Ordinal Encoding
from sklearn.preprocessing import OrdinalEncoder
# Ordinal encoding demo: sizes are mapped to numbers that follow an
# explicitly supplied ranking.
df = pd.DataFrame({'Size': ['Small', 'Medium', 'Large']})

# One inner list per encoded column, giving that column's category order.
size_order = [['Small', 'Medium', 'Large']]
ordinal_encoder = OrdinalEncoder(categories=size_order)

# The encoder is given a one-column frame (double brackets), not a Series.
size_codes = ordinal_encoder.fit_transform(df[['Size']])
df['Size_Ordinal'] = size_codes

print(df)
Output:
Size Size_Ordinal
0 Small 0.0
1 Medium 1.0
2 Large 2.0
7. Hash Encoding
!pip install category_encoders
import pandas as pd
from category_encoders import HashingEncoder
# Hash encoding demo: fruit names are hashed into a fixed number of
# output columns using category_encoders' HashingEncoder.
fruit_names = ['Apple', 'Banana', 'Orange']
df = pd.DataFrame({'Fruit': fruit_names})

# n_components=2 fixes the output width at two columns, whatever the
# number of distinct fruits.
hash_encoder = HashingEncoder(cols=['Fruit'], n_components=2)
df_hashed = hash_encoder.fit_transform(df)

print(df_hashed)
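Output (example — HashingEncoder emits integer counts per hash bucket, not fractions; which column is set for each row depends on the hash of the value):
col_0 col_1
0 1 0
1 0 1
2 1 0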