Merge pull request #28 from wangjc640/main

micahkwok · web-flow · commit a34d4b5282b2 · 2021-03-05T21:55:37.000-08:00
Update outlier function
diff --git a/eda_utils_py/eda_utils_py.py b/eda_utils_py/eda_utils_py.py
@@ -2,6 +2,8 @@
 import pandas as pd
 from pandas.api.types import is_numeric_dtype
 import numbers
+import numpy as np
+
 
 
 def imputer(df, strategy="mean", fill_value=None):
@@ -66,6 +68,7 @@ def imputer(df, strategy="mean", fill_value=None):
     if isinstance(fill_value, type(None)) and strategy == "constant":
         raise Exception("fill_value should be a number when strategy is 'constant'")
 
+
     result = pd.DataFrame()
     if strategy == "mean":
         result = df.apply(lambda x: x.fillna(x.mean()), axis=0)
@@ -181,7 +184,7 @@ def cor_map(dataframe, num_col, col_scheme="purpleorange"):
 
 def outlier_identifier(dataframe, columns=None, method="trim"):
     """
-    A function that identify and deal with outliers based on the method the user choose
+    Identify outliers whose z-score is at least 3 and handle them using the method the user chooses
 
     Parameters
     ----------
@@ -193,6 +196,8 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
         The method of dealing with outliers.
             - if "trim" : we completely remove data points that are outliers.
             - if "median" : we replace outliers with median values
+            - if "mean" : we replace outliers with mean values
+        
 
     Returns
     -------
@@ -201,22 +206,78 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
 
     Examples
     --------
-    >> import pandas as pd
-    >> from eda_utils_py import cor_map
+    >>> import pandas as pd
+    >>> from eda_utils_py.eda_utils_py import outlier_identifier
+        
+    >>> data = pd.DataFrame({
+    >>>    'SepalLengthCm':[5.1, 4.9, 4.7],
+    >>>    'SepalWidthCm':[1.4, 1.4, 99],
+    >>>    'PetalWidthCm':[0.2, 0.2, 0.2],
+    >>>    'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica']
+    >>> })
 
-    >> data = pd.DataFrame({
-    >>    'SepalLengthCm':[5.1, 4.9, 4.7],
-    >>    'SepalWidthCm':[1.4, 1.4, 9999999.99],
-    >>    'PetalWidthCm:[0.2, 0.2, 0.2],
-    >>    'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica']
-    >> })
+    >>> outlier_identifier(data, columns=['SepalLengthCm', 'SepalWidthCm', 'PetalWidthCm'])
 
-    >> outlier_identifier(data)
 
     """
-    pass
-
-
+    if not isinstance(dataframe, pd.DataFrame):
+        raise TypeError("The argument @dataframe must be of pd.DataFrame")
+
+    if columns is None:
+        for col in dataframe.columns:
+            if not is_numeric_dtype(dataframe[col]):
+                raise Exception("The given dataframe contains column that is not numeric column.")  
+                
+    if columns is not None:
+        if not isinstance(columns, list):
+            raise TypeError("The argument @columns must be of type list")
+          
+        
+        for col in columns:
+            if col not in list(dataframe.columns):
+                raise Exception("The given column list contains column that is not exist in the given dataframe.")    
+            if not is_numeric_dtype(dataframe[col]):
+                raise Exception("The given column list contains column that is not numeric column.")
+ 
+    if method not in ("trim", "median", "mean"):
+        raise Exception("The method must be -trim- or -median- or -mean-")
+    
+    df = dataframe.copy()
+    target_columns = []
+    if(columns is None):
+        target_columns = list(df.columns.values.tolist()) 
+    else:
+        target_columns = columns
+        
+    outlier_index = []
+    for column in target_columns:
+        current_column = df[column]
+        mean = np.mean(current_column)
+        std = np.std(current_column)
+        threshold = 3 
+        
+        
+        for i in range(len(current_column)):
+            current_item = current_column[i]
+            z = (current_item - mean) / std
+            if z >= threshold:
+                if(i not in outlier_index):
+                    outlier_index.append(i)
+                if(method == "mean"):
+                    df.at[i, column] = round(mean, 2)
+                if(method == "median"):
+                    df.at[i, column] = np.median(current_column)
+                
+    
+    if(method == "trim"):
+        df = df.drop(outlier_index)
+        
+    df.index = range(len(df))
+    return df
+
+
+
+  
 def scale(dataframe, columns=None, scaler="standard"):
     """
     A function to scale features either by using standard scaler or minmax scaler method
@@ -317,6 +378,7 @@ def _standardize(dataframe):
     self : object
         Scaled dataset
     """
+
     res = dataframe.copy()
     for feature_name in dataframe.columns:
         mean = dataframe[feature_name].mean()
@@ -351,4 +413,7 @@ def _minmax(dataframe):
         max = dataframe[feature_name].max()
         min = dataframe[feature_name].min()
         res[feature_name] = (dataframe[feature_name] - min) / (max - min)
+
     return res
+
+
diff --git a/tests/test_eda_utils_py.py b/tests/test_eda_utils_py.py
@@ -3,6 +3,8 @@
 from pytest import raises
 import pandas as pd
 from pandas._testing import assert_frame_equal
+import numpy as np
+
 
 
 def test_version():
@@ -238,3 +240,77 @@ def test_scaler():
     assert pd.DataFrame.equals(
         minmax_scaled_mock_df_2, mock_df_2_minmax
     ), "The returned dataframe using constant imputer is not correct"
+
+
+
+def test_outlier_identifier():
+    test_df = pd.DataFrame({
+        'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 50, 5.4, 5.0, 5.2, 5.3, 5.1],
+        'SepalWidthCm': [1.4, 1.4, 20, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
+        'PetalWidthCm' :[0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 5],
+        'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
+    })
+
+    test_column = ['SepalLengthCm', 'SepalWidthCm', 'PetalWidthCm']
+
+    median_output = pd.DataFrame({
+        'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 5.1, 5.4, 5.0, 5.2, 5.3, 5.1],
+        'SepalWidthCm': [1.4, 1.4, 1.5, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
+        'PetalWidthCm' :[0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 0.4],
+    'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
+    })
+
+    trim_output = pd.DataFrame({
+        'SepalLengthCm': [5.1, 4.9, 5.5, 5.1, 5.4, 5.0, 5.2, 5.3],
+        'SepalWidthCm': [1.4, 1.4, 2.0, 0.7, 1.2, 1.4, 1.8, 1.5],
+        'PetalWidthCm' :[0.2, 0.2, 0.3, 0.4, 0.5, 0.6, 0.4, 0.2],
+        'Species':['Iris-setosa', 'Iris-virginica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
+    })
+
+    mean_output = pd.DataFrame({
+        'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 9.21, 5.4, 5.0, 5.2, 5.3, 5.1],
+        'SepalWidthCm': [1.4, 1.4, 3.19, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
+        'PetalWidthCm' :[0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 0.77],
+        'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
+    })
+
+    column_output= pd.DataFrame({
+        'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 9.21, 5.4, 5.0, 5.2, 5.3, 5.1],
+        'SepalWidthCm': [1.4, 1.4, 20, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
+        'PetalWidthCm' :[0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 5],
+        'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
+    })
+
+    # Test that a non-DataFrame input raises TypeError
+    with raises(TypeError):
+        eda_utils_py.outlier_identifier("not dataframe")
+
+    # Test if columns input is not list
+    with raises(TypeError):
+        eda_utils_py.outlier_identifier(test_df, columns=2)
+
+    # Test that a column name missing from the dataframe raises an exception
+    with raises(Exception):
+        eda_utils_py.outlier_identifier(test_df, columns=["not in"])
+
+    # Test if method input is not one of three methods provided
+    with raises(Exception):
+        eda_utils_py.outlier_identifier(test_df, columns=["SepalLengthCm"], method = "no")
+
+    # Test that selecting a non-numeric column raises an exception
+    with raises(Exception):
+        eda_utils_py.outlier_identifier(test_df, columns=["Species"])
+
+    assert pd.DataFrame.equals(
+        eda_utils_py.outlier_identifier(test_df, test_column), trim_output
+    ), "Default test not pass"
+    assert pd.DataFrame.equals(
+        eda_utils_py.outlier_identifier(test_df, test_column,method = "median"), median_output
+    ), "The median method is not correct"
+    assert pd.DataFrame.equals(
+        eda_utils_py.outlier_identifier(test_df, test_column, method = "mean"), mean_output
+    ), "The mean method is not correct"
+    assert pd.DataFrame.equals(
+        eda_utils_py.outlier_identifier(test_df, columns = ["SepalLengthCm"], method = "mean"), column_output
+    ), "The selected column method is not correct"
+