import boto3
import numpy as np
import pandas as pd
import sagemaker
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
# Train and evaluate a random-forest classifier that predicts the
# `not_fully_paid` label from the cleaned loan data stored in S3.

# Execution role of the current SageMaker session (not referenced again in
# the visible part of this script).
role = sagemaker.get_execution_role()

# Location of the cleaned dataset in S3.
bucket_name = "loan-data"
folder_name = "loan_cleaned_data"
file_name = "loan_cleaned_data.csv"
s3_url = f"s3://{bucket_name}/{folder_name}/{file_name}"
df = pd.read_csv(s3_url)

# One-hot encode the categorical `purpose` column as integer 0/1 columns.
df_encoded = pd.get_dummies(df, columns=["purpose"], dtype=int)

# Label distribution before any resampling, kept for inspection.
class_counts = df_encoded["not_fully_paid"].value_counts()

# Features and target over the full (still imbalanced) dataset. `sl_no` is
# dropped from the features together with the label itself.
X = df_encoded.drop(columns=["sl_no", "not_fully_paid"])
y = df_encoded["not_fully_paid"]

# Split BEFORE oversampling. Oversampling with replacement first would place
# copies of the same row in both partitions, leaking training rows into the
# test set and inflating the reported metrics. `stratify=y` keeps the original
# class ratio in both partitions, so the test set reflects the real data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y
)

# Balance only the training partition: draw minority rows (label 1) with
# replacement until they match the number of majority rows (label 0).
# NOTE(review): assumes label 0 is the larger class, as the variable names
# imply -- confirm against `class_counts`.
train_frame = X_train.assign(not_fully_paid=y_train)
majority_class = train_frame[train_frame["not_fully_paid"] == 0]
minority_class = train_frame[train_frame["not_fully_paid"] == 1]
minority_unsample = resample(
    minority_class,
    replace=True,
    n_samples=len(majority_class),
    random_state=42,
)
df_balanced = pd.concat([majority_class, minority_unsample])

# Rebuild the training features/target from the balanced training frame.
X_train = df_balanced.drop(columns=["not_fully_paid"])
y_train = df_balanced["not_fully_paid"]

# Fit on the balanced training data and evaluate on the untouched test split.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))