test added

wangjc640 · wangjc640 · commit 8d68b8464748 · 2021-03-05T21:36:43.000-05:00
diff --git a/eda_utils_py/eda_utils_py.py b/eda_utils_py/eda_utils_py.py
@@ -1,3 +1,9 @@
+import pandas as pd
+import altair as alt
+from pandas.api.types import is_numeric_dtype
+import numpy as np
+
+
 def imputer(dataframe, strategy="mean", fill_value=None):
     """
     A function to implement imputation functionality for completing missing values.
@@ -77,7 +83,7 @@ def cor_map(dataframe, num_col):
 
 def outlier_identifier(dataframe, columns=None, method="trim"):
     """
-    A function that identify and deal with outliers based on the method the user choose
+    A function that identifies outliers by z-test with a threshold of 3 and deals with them based on the method the user chooses
 
     Parameters
     ---------- 
@@ -89,6 +95,7 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
         The method of dealing with outliers. 
             - if "trim" : we completely remove data points that are outliers.
             - if "median" : we replace outliers with median values
+            - if "mean" : we replace outliers with mean values
         
     Returns
     -------
@@ -114,31 +121,32 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
     if not isinstance(dataframe, pd.DataFrame):
         raise TypeError("The argument @dataframe must be of pd.DataFrame")
 
-    if not isinstance(columns, list):
-        raise TypeError("The argument @columns must be of type list")
+    if columns is not None:
+        if not isinstance(columns, list):
+            raise TypeError("The argument @columns must be of type list")
+        
+        for col in columns:
+            if col not in list(dataframe.columns):
+                raise Exception("The given column list contains column that is not exist in the given dataframe.")    
+            if not is_numeric_dtype(dataframe[col]):
+                raise Exception("The given column list contains column that is not numeric column.")
  
-    if method not in ("trim", "median"):
-        raise Exception("The method must be -trim- or -median-")
-
-    for col in columns:
-        if col not in list(dataframe.columns):
-            raise Exception("The given column list contains column that is not exist in the given dataframe.")    
+    if method not in ("trim", "median", "mean"):
+        raise Exception("The method must be -trim- or -median- or -mean-")
 
-    for col in columns:
-        if not is_numeric_dtype(dataframe[col]):
-            raise Exception("The given column list contains column that is not numeric column.")
     
     target_columns = []
     if(columns is None):
         target_columns = list(dataframe.columns.values.tolist()) 
         
-        
+    
+
     outlier_index = []
     for column in target_columns:
         current_column = dataframe[column]
         mean = np.mean(current_column)
         std = np.std(current_column)
-        threshold = 3
+        threshold = 3 
         
         for i in range(len(current_column)):
             current_item = current_column[i]
@@ -147,11 +155,13 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
                 if(i not in outlier_index):
                     outlier_index.append(i)
                 if(method == "median"):
-                    m = np.median(current_column)
-                    dataframe[column][i] = m
+                    dataframe[column][i] = np.median(current_column)
+                if(method == "mean"):
+                    dataframe[column][i] = np.mean(current_column)
     
     if(method == "trim"):
         dataframe = dataframe.drop(outlier_index)
+
     return dataframe
 
 
diff --git a/tests/test_eda_utils_py.py b/tests/test_eda_utils_py.py
@@ -2,4 +2,91 @@
 from eda_utils_py import eda_utils_py
 
 def test_version():
-    assert __version__ == '0.1.0'
+    assert __version__ == '0.1.0'
+
+
+def test_outlier_identifier():
+    """Check outlier_identifier's argument validation and its trim/median/mean methods.
+
+    Each call gets its own copy of the fixture because the median and mean
+    methods assign replacement values into the frame they receive.
+    """
+    # Imported locally so the test does not depend on file-level imports.
+    import pandas as pd
+    from pytest import raises
+
+    # Eleven labels, one per row, so every column has the same length.
+    species = ['Iris-setosa', 'Iris-virginica', 'Iris-germanica'] + ['Iris-setosa'] * 8
+
+    # Rows 5, 2 and 10 carry the extreme value of the three numeric columns.
+    test_df = pd.DataFrame({
+        'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 50, 5.4, 5.0, 5.2, 5.3, 5.1],
+        'SepalWidthCm': [1.4, 1.4, 20, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
+        'PetalWidthCm': [0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 5],
+        'Species': species,
+    })
+
+    median_output = pd.DataFrame({
+        'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 5.1, 5.4, 5.0, 5.2, 5.3, 5.1],
+        'SepalWidthCm': [1.4, 1.4, 1.5, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
+        'PetalWidthCm': [0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 0.4],
+        'Species': species,
+    })
+
+    # Rows 2, 5 and 10 removed.
+    trim_output = pd.DataFrame({
+        'SepalLengthCm': [5.1, 4.9, 5.5, 5.1, 5.4, 5.0, 5.2, 5.3],
+        'SepalWidthCm': [1.4, 1.4, 2.0, 0.7, 1.2, 1.4, 1.8, 1.5],
+        'PetalWidthCm': [0.2, 0.2, 0.3, 0.4, 0.5, 0.6, 0.4, 0.2],
+        'Species': ['Iris-setosa', 'Iris-virginica'] + ['Iris-setosa'] * 6,
+    })
+
+    # Means are written to two decimals; results are rounded before comparing.
+    mean_output = pd.DataFrame({
+        'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 9.21, 5.4, 5.0, 5.2, 5.3, 5.1],
+        'SepalWidthCm': [1.4, 1.4, 3.19, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
+        'PetalWidthCm': [0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 0.77],
+        'Species': species,
+    })
+
+    column_output = pd.DataFrame({
+        'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 9.21, 5.4, 5.0, 5.2, 5.3, 5.1],
+        'SepalWidthCm': [1.4, 1.4, 20, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
+        'PetalWidthCm': [0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 5],
+        'Species': species,
+    })
+
+    # Test if the input is not a DataFrame
+    with raises(TypeError):
+        eda_utils_py.outlier_identifier("not dataframe")
+
+    # Test if columns input is not list
+    with raises(TypeError):
+        eda_utils_py.outlier_identifier(test_df, columns=2)
+
+    # Test if input column list is in the dataframe (raised as a plain Exception)
+    with raises(Exception):
+        eda_utils_py.outlier_identifier(test_df, columns=["not in"])
+
+    # Test if method input is not one of three methods provided (plain Exception)
+    with raises(Exception):
+        eda_utils_py.outlier_identifier(test_df, columns=["SepalLengthCm"], method="no")
+
+    # Test if column selected included non-numeric columns
+    with raises(Exception):
+        eda_utils_py.outlier_identifier(test_df, columns=["Species"])
+
+    trimmed = eda_utils_py.outlier_identifier(test_df.copy())
+    # drop() keeps the surviving row labels, so renumber them before comparing.
+    assert trimmed.reset_index(drop=True).equals(trim_output), "Default test not pass"
+
+    median_result = eda_utils_py.outlier_identifier(test_df.copy(), method="median")
+    assert median_result.equals(median_output), "The median method is not correct"
+
+    mean_result = eda_utils_py.outlier_identifier(test_df.copy(), method="mean")
+    assert mean_result.round(2).equals(mean_output), "The mean method is not correct"
+
+    column_result = eda_utils_py.outlier_identifier(
+        test_df.copy(), columns=["SepalLengthCm"], method="mean"
+    )
+    assert column_result.round(2).equals(column_output), "The selected column method is not correct"