debug

wangjc640 · wangjc640 · commit 99e9fc77106a · 2021-03-05T23:07:17.000-05:00
diff --git a/eda_utils_py/eda_utils_py.py b/eda_utils_py/eda_utils_py.py
@@ -109,21 +109,26 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
         
     >>> data = pd.DataFrame({
     >>>    'SepalLengthCm':[5.1, 4.9, 4.7],
-    >>>    'SepalWidthCm':[1.4, 1.4, 9999999.99],
+    >>>    'SepalWidthCm':[1.4, 1.4, 99],
     >>>    'PetalWidthCm':[0.2, 0.2, 0.2],
     >>>    'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica']
     >>> })
 
     >>> outlier_identifier(data)
 
     """
-
     if not isinstance(dataframe, pd.DataFrame):
         raise TypeError("The argument @dataframe must be of pd.DataFrame")
 
+    if columns is None:
+        for col in dataframe.columns:
+            if not is_numeric_dtype(dataframe[col]):
+                raise Exception("The given dataframe contains column that is not numeric column.")  
+                
     if columns is not None:
         if not isinstance(columns, list):
             raise TypeError("The argument @columns must be of type list")
+          
         
         for col in columns:
             if col not in list(dataframe.columns):
@@ -135,34 +140,38 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
         raise Exception("The method must be -trim- or -median- or -mean-")
 
     
+    df = dataframe.copy()
     target_columns = []
     if(columns is None):
-        target_columns = list(dataframe.columns.values.tolist()) 
+        target_columns = list(df.columns.values.tolist()) 
+    else:
+        target_columns = columns
         
-    
-
     outlier_index = []
     for column in target_columns:
-        current_column = dataframe[column]
+        current_column = df[column]
         mean = np.mean(current_column)
         std = np.std(current_column)
         threshold = 3 
         
+        
         for i in range(len(current_column)):
             current_item = current_column[i]
             z = (current_item - mean) / std
             if z >= threshold:
                 if(i not in outlier_index):
                     outlier_index.append(i)
-                if(method == "median"):
-                    dataframe[column][i] = np.median(current_column)
                 if(method == "mean"):
-                    dataframe[column][i] = np.mean(current_column)
+                    df.at[i, column] = round(mean, 2)
+                if(method == "median"):
+                    df.at[i, column] = np.median(current_column)
+                
     
     if(method == "trim"):
-        dataframe = dataframe.drop(outlier_index)
-
-    return dataframe
+        df = df.drop(outlier_index)
+        
+    df.index = range(len(df))
+    return df
 
 
 def scale(dataframe, columns=None):
diff --git a/tests/test_eda_utils_py.py b/tests/test_eda_utils_py.py
@@ -1,5 +1,10 @@
 from eda_utils_py import __version__
 from eda_utils_py import eda_utils_py
+import pandas as pd
+import altair as alt
+from pandas.api.types import is_numeric_dtype
+import numpy as np
+
 
 def test_version():
     assert __version__ == '0.1.0'
@@ -10,7 +15,7 @@ def test_outlier_identifier():
         'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 50, 5.4, 5.0, 5.2, 5.3, 5.1],
         'SepalWidthCm': [1.4, 1.4, 20, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
         'PetalWidthCm' :[0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 5],
-        'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
+        'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
     })
 
     test_column = ['SepalLengthCm', 'SepalWidthCm', 'PetalWidthCm']
@@ -19,59 +24,59 @@ def test_outlier_identifier():
         'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 5.1, 5.4, 5.0, 5.2, 5.3, 5.1],
         'SepalWidthCm': [1.4, 1.4, 1.5, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
         'PetalWidthCm' :[0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 0.4],
-        'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
+    'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
     })
 
     trim_output = pd.DataFrame({
         'SepalLengthCm': [5.1, 4.9, 5.5, 5.1, 5.4, 5.0, 5.2, 5.3],
         'SepalWidthCm': [1.4, 1.4, 2.0, 0.7, 1.2, 1.4, 1.8, 1.5],
-        'PetalWidthCm' :[0.2, 0.2, 0.3, 0.4 0.5, 0.6, 0.4, 0.2],
-        'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
+        'PetalWidthCm' :[0.2, 0.2, 0.3, 0.4, 0.5, 0.6, 0.4, 0.2],
+        'Species':['Iris-setosa', 'Iris-virginica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
     })
 
     mean_output = pd.DataFrame({
         'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 9.21, 5.4, 5.0, 5.2, 5.3, 5.1],
         'SepalWidthCm': [1.4, 1.4, 3.19, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
         'PetalWidthCm' :[0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 0.77],
-        'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
+        'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
     })
 
     column_output= pd.DataFrame({
         'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 9.21, 5.4, 5.0, 5.2, 5.3, 5.1],
         'SepalWidthCm': [1.4, 1.4, 20, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
         'PetalWidthCm' :[0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 5],
-        'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
+        'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
     })
 
     # Test if the input is not a DataFrame
-    with raises(TypeError):
+    with raise(TypeError):
         eda_utils_py.outlier_identifier("not dataframe")
 
     # Test if columns input is not list
-    with raises(TypeError):
+    with raise(TypeError):
         eda_utils_py.outlier_identifier(test_df, columns=2)
 
     # Test if input column list is in the dataframe
-    with raises(TypeError):
+    with raise(TypeError):
         eda_utils_py.outlier_identifier(test_df, columns=["not in"])
 
     # Test if method input is not one of three methods provided
-    with raises(TypeError):
+    with raise(TypeError):
         eda_utils_py.outlier_identifier(test_df, columns=["SepalLengthCm"], method = "no")
 
     # Test if the selected columns include a non-numeric column
-    with raises(Exception):
+    with raise(Exception):
         eda_utils_py.outlier_identifier(test_df, columns=["Species"])
 
     assert pd.DataFrame.equals(
-        eda_utils_py.outlier_identifier(test_df), trim_output
+        outlier_identifier(test_df, test_column), trim_output
     ), "Default test not pass"
     assert pd.DataFrame.equals(
-        eda_utils_py.outlier_identifier(data, method = "median"), median_output
+        outlier_identifier(test_df, test_column,method = "median"), median_output
     ), "The median method is not correct"
     assert pd.DataFrame.equals(
-        eda_utils_py.outlier_identifier(data, method = "mean"), mean_output
+        outlier_identifier(test_df, test_column, method = "mean"), mean_output
     ), "The mean method is not correct"
     assert pd.DataFrame.equals(
-        eda_utils_py.outlier_identifier(data, columns = ["SepalLengthCm"], method = "mean"), column_output
-    ), "The selected column method is not correct"
+        outlier_identifier(test_df, columns = ["SepalLengthCm"], method = "mean"), column_output
+    ), "The selected column method is not correct"