import boto3
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import tempfile
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# Location of the cleaned employee dataset in S3.
bucket_name = 'employee-data'
file_key = 'inputfiles/employee_cleaned_data.csv'

# Fetch the CSV object and hand its response body straight to pandas.
s3_client = boto3.client('s3')
obj = s3_client.get_object(Bucket=bucket_name, Key=file_key)
df = pd.read_csv(obj['Body'])

# The employee identifier is removed so it is not fed to the model as a feature.
df = df.drop(columns=['employee_id'])

# Reduce each region value to its first run of digits, stored as an integer.
# (Presumably the raw values look like "region_7" -- confirm against the data.)
# - The pattern is a raw string: '\d' inside a plain literal is an invalid
#   escape sequence and triggers a warning on modern Python versions.
# - expand=False yields a Series for the single capture group instead of a
#   one-column DataFrame, which is the natural right-hand side for a column
#   assignment.
# NOTE: astype(int) raises if any region value contains no digits (NaN).
df['region'] = df['region'].str.extract(r'(\d+)', expand=False).astype(int)

# Features: every remaining column except the target. Target: 'turnover'.
X = df.drop(columns=['turnover'])
y = df['turnover']
# Hold out the test rows BEFORE fitting any transformer, so that the one-hot
# categories, scaling statistics and feature-selection scores are learned from
# the training rows only. Fitting them on the full dataset would leak
# information about the test set (including its labels, via f_classif) into
# the model and inflate the reported metrics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Column-wise preprocessing:
#   - 'age' and 'salary' are standardized,
#   - 'department', 'gender' and 'education' are one-hot encoded,
#   - every other column (e.g. 'region') is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['age', 'salary']),
        # A category that only occurs in the held-out rows is encoded as all
        # zeros instead of raising at transform time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['department', 'gender', 'education']),
    ],
    remainder='passthrough',
    # Always return a dense array: the StandardScaler further down centers the
    # data, which is not supported on sparse matrices.
    sparse_threshold=0,
)
X_train_transformed = preprocessor.fit_transform(X_train_raw)
X_test_transformed = preprocessor.transform(X_test_raw)

# Keep the 5 columns with the highest ANOVA F-score against the target,
# scored on the training rows only.
selector = SelectKBest(score_func=f_classif, k=5)
X_train = selector.fit_transform(X_train_transformed, y_train)
X_test = selector.transform(X_test_transformed)

# Report which transformed columns survived the selection.
selected_features = selector.get_support(indices=True)
feature_names = preprocessor.get_feature_names_out()
selected_feature_names = feature_names[selected_features]
print(f'Selected features: {list(selected_feature_names)}')

# Standardize the selected columns (this also rescales the one-hot and
# passthrough columns); statistics come from the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the classifier on the scaled training data.
model = LogisticRegression(random_state=0)
model.fit(X_train_scaled, y_train)
# Persist the trained model: serialize it into an anonymous temporary file,
# then stream that file to S3 under 'ml-output/model.pkl'.
with tempfile.TemporaryFile() as model_buffer:
    joblib.dump(model, model_buffer)
    # Rewind so the upload starts reading from the first byte.
    model_buffer.seek(0)
    s3_client.upload_fileobj(
        Fileobj=model_buffer,
        Bucket=bucket_name,
        Key='ml-output/model.pkl',
    )
print('Successfully pushed data to S3: model.pkl')
# Round-trip check: pull the stored model back from S3 into a temporary file
# and deserialize it.
# NOTE: joblib.load unpickles its input, so it must only be pointed at objects
# from a trusted bucket.
with tempfile.TemporaryFile() as downloaded_model_file:
    s3_client.download_fileobj(
        Bucket=bucket_name,
        Key='ml-output/model.pkl',
        Fileobj=downloaded_model_file,
    )
    # Rewind so joblib reads from the start of the downloaded bytes.
    downloaded_model_file.seek(0)
    loaded_model = joblib.load(downloaded_model_file)
print('Successfully loaded model from S3')
# Score the model that was reloaded from S3 on the held-out test rows.
y_pred_loaded = loaded_model.predict(X_test_scaled)

# Evaluate the four classification metrics in a fixed order and bind each
# result to its own name.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred_loaded)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report each metric rounded to two decimals.
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-Score: {f1:.2f}')
# Confusion matrix of the reloaded model's predictions: rows are the actual
# classes, columns the predicted ones.
cm = confusion_matrix(y_test, y_pred_loaded)

# Tick labels shared by both axes.
# (Assumes the negative class sorts before the positive one -- confirm
# against the 'turnover' encoding.)
class_labels = ['No Turnover', 'Turnover']

# Draw the matrix as an annotated heatmap on an explicit 8x6 inch figure.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()