# q01_outlier_removal/build.py
def outlier_removal(loan_data,
                    columns=('ApplicantIncome', 'CoapplicantIncome', 'LoanAmount'),
                    quantile=0.95):
    """Return a copy of ``loan_data`` without rows above a per-column quantile.

    For every column in ``columns`` the ``quantile`` cut-off is computed on
    the full, unfiltered frame. A row is removed when its value is strictly
    greater than the cut-off in at least one of those columns. Missing values
    never compare greater than the cut-off, so rows containing NaN are kept.

    Parameters
    ----------
    loan_data : pandas.DataFrame
        Frame that contains every column named in ``columns``. It is not
        modified.
    columns : iterable of str, optional
        Numeric columns to screen. Defaults to the three columns the original
        implementation hard-coded.
    quantile : float, optional
        Quantile in ``[0, 1]`` used as the upper cut-off (default ``0.95``).

    Returns
    -------
    pandas.DataFrame
        New frame holding the surviving rows with their original index labels.
    """
    columns = list(columns)
    if not columns:
        return loan_data.copy()

    screened = loan_data[columns]
    cutoffs = screened.quantile(quantile)
    is_outlier = screened.gt(cutoffs, axis='columns').any(axis='columns')

    # Select by position (plain boolean array) rather than dropping by index
    # label: dropping labels removes *every* row sharing a label, which
    # discards non-outliers whenever the index is not unique.
    return loan_data.loc[~is_outlier.to_numpy()].copy()
# q02_data_cleaning_all/build.py
def data_cleaning(loan_data):
    """Impute missing values and split the data into train and test sets.

    ``LoanAmount`` is filled with its mean. Each of ``Gender``, ``Married``,
    ``Dependents``, ``Self_Employed``, ``Loan_Amount_Term`` and
    ``Credit_History`` is filled with its own most frequent value. The last
    column of ``loan_data`` is used as the target and all preceding columns
    as features.

    Parameters
    ----------
    loan_data : pandas.DataFrame
        Frame to clean. The imputed columns are written back to this frame,
        so the caller's object is modified.

    Returns
    -------
    tuple
        ``(X, y, X_train, X_test, y_train, y_test)`` where the split uses
        ``test_size=0.25`` and ``random_state=9``.
    """
    loan_data['LoanAmount'] = loan_data['LoanAmount'].fillna(
        loan_data['LoanAmount'].mean())

    cat_col = ['Gender', 'Married', 'Dependents', 'Self_Employed',
               'Loan_Amount_Term', 'Credit_History']
    for col in cat_col:
        modes = loan_data[col].mode()
        # ``mode()`` returns a Series (ties give several values, an all-NaN
        # column gives none). Fill with the first mode as a scalar; passing
        # the Series itself would only fill rows whose label matches it.
        if not modes.empty:
            loan_data[col] = loan_data[col].fillna(modes.iloc[0])

    # Separate the features and the target variable.
    X = loan_data.iloc[:, :-1]
    y = loan_data.iloc[:, -1]

    # Train/test split for modelling.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=9)
    return X, y, X_train, X_test, y_train, y_test


# q02_data_cleaning_all_2/build.py
def _encode_features(frame, cat_cols, num_cols, levels):
    """Add sqrt features and one-hot columns to a copy of ``frame``, then drop the raw columns."""
    out = frame.copy()
    for col in num_cols:
        out[col + '_sqrt'] = np.sqrt(out[col])

    if cat_cols:
        # Encode against fixed category levels so every split gets the same
        # dummy columns and the same dropped (baseline) level.
        typed = pd.DataFrame(
            {col: pd.Categorical(out[col], categories=levels[col])
             for col in cat_cols},
            index=out.index,
        )
        out = pd.concat([out, pd.get_dummies(typed, drop_first=True)], axis=1)

    return out.drop(columns=cat_cols + num_cols)


def data_cleaning_2(X_train, X_test, y_train, y_test):
    """Square-root transform the numeric columns and one-hot encode the categoricals.

    ``ApplicantIncome``, ``CoapplicantIncome`` and ``LoanAmount`` are replaced
    by ``<name>_sqrt`` columns. Every object-dtype column of ``X_train`` is
    replaced by dummy columns with the first level dropped. The category
    levels are taken from ``X_train`` and reused for ``X_test``, so both
    outputs always have identical columns. A test value not present in the
    training data gets all-zero dummies.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames; neither is modified.
    y_train, y_test
        Targets, returned unchanged.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` with the transformed frames.
    """
    cat_cols = list(X_train.select_dtypes(include=['object']).columns)
    num_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']

    levels = {col: pd.Categorical(X_train[col]).categories for col in cat_cols}

    train_out = _encode_features(X_train, cat_cols, num_cols, levels)
    test_out = _encode_features(X_test, cat_cols, num_cols, levels)
    return train_out, test_out, y_train, y_test
# q03_logistic_regression/build.py
def logistic_regression(X_train, X_test, y_train, y_test):
    """Fit a logistic regression and return its confusion matrix on the test set.

    ``ApplicantIncome``, ``CoapplicantIncome`` and ``LoanAmount`` are
    standardised before fitting. The scaler is fitted on the training split
    only and then applied to the test split, so both splits are scaled with
    the same mean and variance.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames containing the three columns above. They are copied,
        so the caller's frames are left untouched.
    y_train, y_test
        Target values for the two splits.

    Returns
    -------
    The result of ``confusion_matrix(y_test, y_pred)`` for the fitted model.
    """
    scaled_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']

    # Work on copies so the scaled values do not leak back into the caller.
    X_train = X_train.copy()
    X_test = X_test.copy()

    scaler = StandardScaler()
    X_train[scaled_cols] = scaler.fit_transform(X_train[scaled_cols])
    # Reuse the training statistics: fitting a second scaler on the test
    # split would put train and test features on different scales.
    X_test[scaled_cols] = scaler.transform(X_test[scaled_cols])

    model = LogisticRegression(random_state=9)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    return confusion_matrix(y_test, y_pred)