# ML troubleshooting

# Cloud-driven loan default predictor using machine learning

# Task 1. Launching an Amazon SageMaker instance
# Task 2. Upload the notebook into Jupyter
# Task 3. Data loading

import boto3
import numpy as np
import pandas as pd
import sagemaker
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# Location of the cleaned loan dataset in S3.
bucket_name = "loan-data"
folder_name = "loan_cleaned_data"
file_name = "loan_cleaned_data.csv"
s3_url = f"s3://{bucket_name}/{folder_name}/{file_name}"

# Execution role reported by the SageMaker SDK for this session.
# NOTE(review): not referenced by the code visible here — confirm it is
# needed by a later step before removing it.
role = sagemaker.get_execution_role()

# Load the CSV straight from the S3 URL into a DataFrame.
# NOTE(review): reading s3:// paths with pandas presumably relies on an
# S3 filesystem backend (e.g. s3fs) being installed — confirm.
df = pd.read_csv(s3_url)

# Task 4. Feature engineering (one-hot encoding)

# Replace the categorical 'purpose' column with one integer (0/1) indicator
# column per category; every other column is passed through unchanged.
df_encoded = pd.get_dummies(
    data=df,
    columns=['purpose'],
    dtype=int,
)

# Task 5. Data preprocessing (Handling imbalanced data)

# Class distribution of the full dataset, kept for inspection.
class_counts = df_encoded['not_fully_paid'].value_counts()

# Hold out the test rows BEFORE oversampling. Upsampling with replacement
# duplicates minority rows; if it ran first, copies of the same row would end
# up in both the training and the test split and inflate the reported scores.
# stratify keeps the original class ratio in both partitions.
train_df, test_df = train_test_split(
    df_encoded,
    test_size=0.4,
    random_state=42,
    stratify=df_encoded['not_fully_paid'],
)

# As in the original script, label 0 is treated as the majority class and
# label 1 as the minority class.
majority_class = train_df[train_df['not_fully_paid'] == 0]
minority_class = train_df[train_df['not_fully_paid'] == 1]

# Upsample the minority class (with replacement) until it matches the
# majority class size -- on the training partition only.
minority_unsample = resample(
    minority_class,
    replace=True,
    n_samples=len(majority_class),
    random_state=42,
)

# Balanced TRAINING data; the held-out test rows are never resampled.
df_balanced = pd.concat([majority_class, minority_unsample])

# Task 6. Model Training

# 'sl_no' is dropped together with the target so that only the remaining
# columns are used as features.
X = df_balanced.drop(columns=['sl_no', 'not_fully_paid'])
y = df_balanced['not_fully_paid']

# Train on the balanced rows; evaluate on the untouched hold-out rows, which
# keep the original class distribution.
X_train, y_train = X, y
X_test = test_df.drop(columns=['sl_no', 'not_fully_paid'])
y_test = test_df['not_fully_paid']

rf = RandomForestClassifier(random_state=42)

rf.fit(X_train, y_train)

# Task 7. Model Evaluation

y_pred = rf.predict(X_test)

# Per-class precision / recall / F1 on the hold-out set.
print(classification_report(y_test, y_pred))