# Columns to trim and the quantile used as the exclusive upper cut-off for
# each one. Order matters: every cut-off is computed on the rows that survived
# the previous filters.
_DEFAULT_QUANTILE_CAPS = (
    ('ApplicantIncome', 0.97),
    ('CoapplicantIncome', 0.98),
    ('LoanAmount', 0.97),
)


def outlier_removal(data, quantile_caps=None):
    """Drop rows whose values reach or exceed a per-column quantile cut-off.

    The filters are applied one after another, so the quantile for each
    column is computed on the rows that are still present after the
    preceding columns have been filtered.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column named in ``quantile_caps``. It is not
        modified; boolean indexing produces a new frame at every step.
    quantile_caps : iterable of (column, quantile) pairs, optional
        Columns to trim and the quantile (between 0 and 1) used as the
        exclusive upper bound for each. Defaults to ``ApplicantIncome`` at
        0.97, ``CoapplicantIncome`` at 0.98 and ``LoanAmount`` at 0.97.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, keeping their original index labels.

    Notes
    -----
    The comparison is a strict ``<``. A row whose value is missing (NaN) in
    a capped column is therefore dropped as well, because ``NaN < cutoff``
    evaluates to False.
    """
    if quantile_caps is None:
        quantile_caps = _DEFAULT_QUANTILE_CAPS

    loan_data = data
    for column, quantile in quantile_caps:
        cutoff = loan_data[column].quantile(quantile)
        loan_data = loan_data[loan_data[column] < cutoff]

    return loan_data
def data_cleaning(data):
    """Impute missing values in ``data`` and split it into train/test sets.

    ``LoanAmount`` is filled with its mean; the categorical-like columns are
    filled with their most frequent value. The last column of ``data`` is
    taken as the target and everything before it as the features.

    Parameters
    ----------
    data : pandas.DataFrame
        Loan data containing ``LoanAmount``, ``Gender``, ``Married``,
        ``Dependents``, ``Self_Employed``, ``Loan_Amount_Term`` and
        ``Credit_History``. **The frame is modified in place**: the imputed
        columns are written back to it.

    Returns
    -------
    tuple
        ``(X, y, X_train, X_test, y_train, y_test)`` where the split uses
        ``test_size=0.25`` and ``random_state=9``.
    """
    # Assign the filled column back instead of calling
    # ``data[col].fillna(..., inplace=True)``: the in-place call operates on
    # the object returned by ``data[col]``, which is not guaranteed to write
    # through to ``data`` (it does not under pandas Copy-on-Write).
    data['LoanAmount'] = data['LoanAmount'].fillna(data['LoanAmount'].mean())

    mode_filled_cols = ['Gender', 'Married', 'Dependents', 'Self_Employed',
                        'Loan_Amount_Term', 'Credit_History']
    for column in mode_filled_cols:
        # mode() returns a Series of the most frequent value(s); take the first.
        data[column] = data[column].fillna(data[column].mode()[0])

    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=9)

    return X, y, X_train, X_test, y_train, y_test


# NOTE(review): this runs at import time and relies on ``loan_data`` being
# defined by the file header, which is not visible here. The return value is
# discarded, so its only lasting effect is the in-place imputation of
# ``loan_data``. Kept to preserve module behaviour; consider removing it or
# moving it under ``if __name__ == "__main__":`` once no importer is confirmed
# to rely on it.
data_cleaning(loan_data)


# Two-level string columns and the integer code assigned to each level.
_BINARY_ENCODINGS = (
    ('Gender', {'Male': 0, 'Female': 1}),
    ('Married', {'Yes': 1, 'No': 0}),
    ('Education', {'Graduate': 1, 'Not Graduate': 0}),
    ('Self_Employed', {'Yes': 1, 'No': 0}),
)

# (source column, baseline level that gets no indicator, level -> output name).
# The order here is the order of the indicator columns in the result.
_DUMMY_SPECS = (
    ('Property_Area', 'Rural',
     {'Semiurban': 'Property_Area_Semiurban',
      'Urban': 'Property_Area_Urban'}),
    ('Dependents', '0',
     {'1': 'Dependents_1', '2': 'Dependents_2', '3+': 'Dependents_3'}),
)


def _encode_features(features):
    """Return a transformed copy of one feature frame (train or test)."""
    encoded = features.copy()

    # Square-root every int/float column. This is done before the string
    # columns are recoded, so the 0/1 codes produced below are not affected.
    numeric_cols = encoded.select_dtypes(['int', 'float']).columns
    encoded[numeric_cols] = encoded[numeric_cols].apply(np.sqrt)

    for column, mapping in _BINARY_ENCODINGS:
        # Values that are not keys of ``mapping`` are left unchanged.
        encoded[column] = encoded[column].replace(mapping)

    dummy_sources = [column for column, _, _ in _DUMMY_SPECS]
    pieces = [encoded.drop(dummy_sources, axis=1)]
    for column, baseline, renames in _DUMMY_SPECS:
        dummies = pd.get_dummies(encoded[column])
        # Keep any level that is neither the baseline nor a known one, after
        # the known levels, under its own name.
        extras = [level for level in dummies.columns
                  if level != baseline and level not in renames]
        # Reindexing to a fixed list drops the baseline and guarantees the
        # known indicator columns exist, in the same order, even when a level
        # does not occur in this particular split.
        dummies = dummies.reindex(columns=list(renames) + extras, fill_value=0)
        pieces.append(dummies.rename(columns=renames))

    # All pieces share ``encoded``'s index, so this is a plain side-by-side
    # join. No merge on an 'index' key is needed.
    return pd.concat(pieces, axis=1).reset_index(drop=True)


def data_cleaning_2(X_train, X_test, y_train, y_test):
    """Transform the train and test feature frames for modelling.

    The same steps are applied independently to ``X_train`` and ``X_test``:

    1. every int/float column is replaced by its square root;
    2. ``Gender``, ``Married``, ``Education`` and ``Self_Employed`` are
       recoded to 0/1;
    3. ``Property_Area`` and ``Dependents`` are replaced by indicator
       columns ``Property_Area_Semiurban``, ``Property_Area_Urban``,
       ``Dependents_1``, ``Dependents_2`` and ``Dependents_3`` (the
       ``'Rural'`` and ``'0'`` levels are the baselines and get no column).

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames containing the columns named above. They are copied,
        not modified.
    y_train, y_test
        Targets; passed through untouched.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. The returned feature frames
        keep their row order but carry a fresh 0..n-1 index, so they line up
        with the targets by position rather than by index label.
    """
    return (_encode_features(X_train), _encode_features(X_test),
            y_train, y_test)
def logistic_regression(X_train, X_test, y_train, y_test):
    """Fit a logistic regression and return its test-set confusion matrix.

    The float columns of ``X_train`` are standardised with a
    ``StandardScaler`` fitted on the training data; the same fitted scaler
    (training mean and variance) is then applied to the corresponding
    columns of ``X_test``. A ``LogisticRegression`` with default settings is
    trained on the scaled training data and evaluated on the scaled test
    data.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames with the same columns. They are copied before
        scaling, so the caller's frames are not modified.
    y_train, y_test
        Targets for the two splits.

    Returns
    -------
    The result of ``confusion_matrix(y_test, y_pred)``.
    """
    # Work on copies so that scaling does not overwrite the caller's data.
    X_train = X_train.copy()
    X_test = X_test.copy()

    float_cols = X_train.select_dtypes(['float']).columns
    if len(float_cols):
        scaler = StandardScaler()
        X_train[float_cols] = scaler.fit_transform(X_train[float_cols])
        # transform(), not fit_transform(): the test set must be scaled with
        # the statistics learned from the training set. Re-fitting on the
        # test set would put the two splits on different scales.
        X_test[float_cols] = scaler.transform(X_test[float_cols])

    logistic_model = LogisticRegression()
    logistic_model.fit(X_train, y_train)
    y_pred = logistic_model.predict(X_test)

    return confusion_matrix(y_test, y_pred)