Merge pull request #29 from UBC-MDS/autformat-script

micahkwok · web-flow · commit d0742434d62c · 2021-03-06T11:32:05.000-08:00
Autoformat script and make columns a required argument in scale function
diff --git a/eda_utils_py/eda_utils_py.py b/eda_utils_py/eda_utils_py.py
@@ -5,7 +5,6 @@
 import numpy as np
 
 
-
 def imputer(df, strategy="mean", fill_value=None):
     """
     A function to implement imputation functionality for completing missing values.
@@ -68,7 +67,6 @@ def imputer(df, strategy="mean", fill_value=None):
     if isinstance(fill_value, type(None)) and strategy == "constant":
         raise Exception("fill_value should be a number when strategy is 'constant'")
 
-
     result = pd.DataFrame()
     if strategy == "mean":
         result = df.apply(lambda x: x.fillna(x.mean()), axis=0)
@@ -226,59 +224,54 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
     if columns is None:
         for col in dataframe.columns:
             if not is_numeric_dtype(dataframe[col]):
-                raise Exception("The given dataframe contains column that is not numeric column.")  
-                
+                raise Exception("The given dataframe contains column that is not numeric column.")
+
     if columns is not None:
         if not isinstance(columns, list):
             raise TypeError("The argument @columns must be of type list")
-          
-        
+
         for col in columns:
             if col not in list(dataframe.columns):
-                raise Exception("The given column list contains column that is not exist in the given dataframe.")    
+                raise Exception("The given column list contains column that is not exist in the given dataframe.")
             if not is_numeric_dtype(dataframe[col]):
                 raise Exception("The given column list contains column that is not numeric column.")
- 
+
     if method not in ("trim", "median", "mean"):
         raise Exception("The method must be -trim- or -median- or -mean-")
-    
+
     df = dataframe.copy()
     target_columns = []
-    if(columns is None):
-        target_columns = list(df.columns.values.tolist()) 
+    if (columns is None):
+        target_columns = list(df.columns.values.tolist())
     else:
         target_columns = columns
-        
+
     outlier_index = []
     for column in target_columns:
         current_column = df[column]
         mean = np.mean(current_column)
         std = np.std(current_column)
-        threshold = 3 
-        
-        
+        threshold = 3
+
         for i in range(len(current_column)):
             current_item = current_column[i]
             z = (current_item - mean) / std
             if z >= threshold:
-                if(i not in outlier_index):
+                if (i not in outlier_index):
                     outlier_index.append(i)
-                if(method == "mean"):
+                if (method == "mean"):
                     df.at[i, column] = round(mean, 2)
-                if(method == "median"):
+                if (method == "median"):
                     df.at[i, column] = np.median(current_column)
-                
-    
-    if(method == "trim"):
+
+    if (method == "trim"):
         df = df.drop(outlier_index)
-        
+
     df.index = range(len(df))
     return df
 
 
-
-  
-def scale(dataframe, columns=None, scaler="standard"):
+def scale(dataframe, columns, scaler="standard"):
     """
     A function to scale features either by using standard scaler or minmax scaler method
 
@@ -415,5 +408,3 @@ def _minmax(dataframe):
         res[feature_name] = (dataframe[feature_name] - min) / (max - min)
 
     return res
-
-
diff --git a/tests/test_eda_utils_py.py b/tests/test_eda_utils_py.py
@@ -6,7 +6,6 @@
 import numpy as np
 
 
-
 def test_version():
     assert __version__ == "0.1.0"
 
@@ -188,8 +187,10 @@ def test_scaler():
     )
 
     mock_df_1_standard = pd.DataFrame(
-        {"col1": [-0.3302891295379082, -0.8807710121010884, -0.8807710121010884, 0.7706746355884523, 1.3211565181516325],
-         "col2": [1.714389230829046, -0.26375218935831474, -0.26375218935831474, -0.9231326627541017, -0.26375218935831474],
+        {"col1": [-0.3302891295379082, -0.8807710121010884, -0.8807710121010884, 0.7706746355884523,
+                  1.3211565181516325],
+         "col2": [1.714389230829046, -0.26375218935831474, -0.26375218935831474, -0.9231326627541017,
+                  -0.26375218935831474],
          "col3": [1.0, -1.0, -1.0, 1.0, 0.0]}
     )
 
@@ -233,7 +234,6 @@ def test_scaler():
         minmax_scaled_mock_df_1, mock_df_1_minmax
     ), "The returned dataframe using standard scaler method is not correct"
 
-
     assert pd.DataFrame.equals(
         standard_scaled_mock_df_2, mock_df_2_standard
     ), "The returned dataframe using most_frequent inputer is not correct"
@@ -242,43 +242,47 @@ def test_scaler():
     ), "The returned dataframe using constant imputer is not correct"
 
 
-
 def test_outlier_identifier():
     test_df = pd.DataFrame({
         'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 50, 5.4, 5.0, 5.2, 5.3, 5.1],
         'SepalWidthCm': [1.4, 1.4, 20, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
-        'PetalWidthCm' :[0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 5],
-        'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
+        'PetalWidthCm': [0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 5],
+        'Species': ['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
+                    'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
     })
 
     test_column = ['SepalLengthCm', 'SepalWidthCm', 'PetalWidthCm']
 
     median_output = pd.DataFrame({
         'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 5.1, 5.4, 5.0, 5.2, 5.3, 5.1],
         'SepalWidthCm': [1.4, 1.4, 1.5, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
-        'PetalWidthCm' :[0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 0.4],
-    'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
+        'PetalWidthCm': [0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 0.4],
+        'Species': ['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
+                    'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
     })
 
     trim_output = pd.DataFrame({
         'SepalLengthCm': [5.1, 4.9, 5.5, 5.1, 5.4, 5.0, 5.2, 5.3],
         'SepalWidthCm': [1.4, 1.4, 2.0, 0.7, 1.2, 1.4, 1.8, 1.5],
-        'PetalWidthCm' :[0.2, 0.2, 0.3, 0.4, 0.5, 0.6, 0.4, 0.2],
-        'Species':['Iris-setosa', 'Iris-virginica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
+        'PetalWidthCm': [0.2, 0.2, 0.3, 0.4, 0.5, 0.6, 0.4, 0.2],
+        'Species': ['Iris-setosa', 'Iris-virginica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
+                    'Iris-setosa', 'Iris-setosa']
     })
 
     mean_output = pd.DataFrame({
         'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 9.21, 5.4, 5.0, 5.2, 5.3, 5.1],
         'SepalWidthCm': [1.4, 1.4, 3.19, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
-        'PetalWidthCm' :[0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 0.77],
-        'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
+        'PetalWidthCm': [0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 0.77],
+        'Species': ['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
+                    'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
     })
 
-    column_output= pd.DataFrame({
+    column_output = pd.DataFrame({
         'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 9.21, 5.4, 5.0, 5.2, 5.3, 5.1],
         'SepalWidthCm': [1.4, 1.4, 20, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
-        'PetalWidthCm' :[0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 5],
-        'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
+        'PetalWidthCm': [0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 5],
+        'Species': ['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
+                    'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
     })
 
     # Test if the imput is not dataFrame
@@ -295,7 +299,7 @@ def test_outlier_identifier():
 
     # Test if method input is not one of three methods provided
     with raises(Exception):
-        eda_utils_py.outlier_identifier(test_df, columns=["SepalLengthCm"], method = "no")
+        eda_utils_py.outlier_identifier(test_df, columns=["SepalLengthCm"], method="no")
 
     # Test if column selected included non-numeric columns
     with raises(Exception):
@@ -305,12 +309,11 @@ def test_outlier_identifier():
         eda_utils_py.outlier_identifier(test_df, test_column), trim_output
     ), "Default test not pass"
     assert pd.DataFrame.equals(
-        eda_utils_py.outlier_identifier(test_df, test_column,method = "median"), median_output
+        eda_utils_py.outlier_identifier(test_df, test_column, method="median"), median_output
     ), "The median method is not correct"
     assert pd.DataFrame.equals(
-        eda_utils_py.outlier_identifier(test_df, test_column, method = "mean"), mean_output
+        eda_utils_py.outlier_identifier(test_df, test_column, method="mean"), mean_output
     ), "The mean method is not correct"
     assert pd.DataFrame.equals(
-        eda_utils_py.outlier_identifier(test_df, columns = ["SepalLengthCm"], method = "mean"), column_output
+        eda_utils_py.outlier_identifier(test_df, columns=["SepalLengthCm"], method="mean"), column_output
     ), "The selected column method is not correct"
-