Skip to content

Commit 8d68b84

Browse files
committed
test added
1 parent 4b0b397 commit 8d68b84

File tree

2 files changed

+99
-17
lines changed

2 files changed

+99
-17
lines changed

eda_utils_py/eda_utils_py.py

Lines changed: 26 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
import pandas as pd
2+
import altair as alt
3+
from pandas.api.types import is_numeric_dtype
4+
import numpy as np
5+
6+
17
def imputer(dataframe, strategy="mean", fill_value=None):
28
"""
39
A function to implement imputation functionality for completing missing values.
@@ -77,7 +83,7 @@ def cor_map(dataframe, num_col):
7783

7884
def outlier_identifier(dataframe, columns=None, method="trim"):
7985
"""
80-
A function that identify and deal with outliers based on the method the user choose
86+
A function that identify by z-test with threshold of 3, and deal with outliers based on the method the user choose
8187
8288
Parameters
8389
----------
@@ -89,6 +95,7 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
8995
The method of dealing with outliers.
9096
- if "trim" : we completely remove data points that are outliers.
9197
- if "median" : we replace outliers with median values
98+
- if "mean" : we replace outliers with mean values
9299
93100
Returns
94101
-------
@@ -114,31 +121,32 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
114121
if not isinstance(dataframe, pd.DataFrame):
115122
raise TypeError("The argument @dataframe must be of pd.DataFrame")
116123

117-
if not isinstance(columns, list):
118-
raise TypeError("The argument @columns must be of type list")
124+
if columns is not None:
125+
if not isinstance(columns, list):
126+
raise TypeError("The argument @columns must be of type list")
127+
128+
for col in columns:
129+
if col not in list(dataframe.columns):
130+
raise Exception("The given column list contains column that is not exist in the given dataframe.")
131+
if not is_numeric_dtype(dataframe[col]):
132+
raise Exception("The given column list contains column that is not numeric column.")
119133

120-
if method not in ("trim", "median"):
121-
raise Exception("The method must be -trim- or -median-")
122-
123-
for col in columns:
124-
if col not in list(dataframe.columns):
125-
raise Exception("The given column list contains column that is not exist in the given dataframe.")
134+
if method not in ("trim", "median", "mean"):
135+
raise Exception("The method must be -trim- or -median- or -mean-")
126136

127-
for col in columns:
128-
if not is_numeric_dtype(dataframe[col]):
129-
raise Exception("The given column list contains column that is not numeric column.")
130137

131138
target_columns = []
132139
if(columns is None):
133140
target_columns = list(dataframe.columns.values.tolist())
134141

135-
142+
143+
136144
outlier_index = []
137145
for column in target_columns:
138146
current_column = dataframe[column]
139147
mean = np.mean(current_column)
140148
std = np.std(current_column)
141-
threshold = 3
149+
threshold = 3
142150

143151
for i in range(len(current_column)):
144152
current_item = current_column[i]
@@ -147,11 +155,13 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
147155
if(i not in outlier_index):
148156
outlier_index.append(i)
149157
if(method == "median"):
150-
m = np.median(current_column)
151-
dataframe[column][i] = m
158+
dataframe[column][i] = np.median(current_column)
159+
if(method == "mean"):
160+
dataframe[column][i] = np.mean(current_column)
152161

153162
if(method == "trim"):
154163
dataframe = dataframe.drop(outlier_index)
164+
155165
return dataframe
156166

157167

tests/test_eda_utils_py.py

Lines changed: 73 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,76 @@
22
from eda_utils_py import eda_utils_py
33

44
def test_version():
5-
assert __version__ == '0.1.0'
5+
assert __version__ == '0.1.0'
6+
7+
8+
def test_outlier_identifier():
9+
test_df = pd.DataFrame({
10+
'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 50, 5.4, 5.0, 5.2, 5.3, 5.1],
11+
'SepalWidthCm': [1.4, 1.4, 20, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
12+
'PetalWidthCm' :[0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 5],
13+
'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
14+
})
15+
16+
test_column = ['SepalLengthCm', 'SepalWidthCm', 'PetalWidthCm']
17+
18+
median_output = pd.DataFrame({
19+
'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 5.1, 5.4, 5.0, 5.2, 5.3, 5.1],
20+
'SepalWidthCm': [1.4, 1.4, 1.5, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
21+
'PetalWidthCm' :[0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 0.4],
22+
'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
23+
})
24+
25+
trim_output = pd.DataFrame({
26+
'SepalLengthCm': [5.1, 4.9, 5.5, 5.1, 5.4, 5.0, 5.2, 5.3],
27+
'SepalWidthCm': [1.4, 1.4, 2.0, 0.7, 1.2, 1.4, 1.8, 1.5],
28+
'PetalWidthCm' :[0.2, 0.2, 0.3, 0.4 0.5, 0.6, 0.4, 0.2],
29+
'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
30+
})
31+
32+
mean_output = pd.DataFrame({
33+
'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 9.21, 5.4, 5.0, 5.2, 5.3, 5.1],
34+
'SepalWidthCm': [1.4, 1.4, 3.19, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
35+
'PetalWidthCm' :[0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 0.77],
36+
'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
37+
})
38+
39+
column_output= pd.DataFrame({
40+
'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 9.21, 5.4, 5.0, 5.2, 5.3, 5.1],
41+
'SepalWidthCm': [1.4, 1.4, 20, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
42+
'PetalWidthCm' :[0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 5],
43+
'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
44+
})
45+
46+
# Test if the imput is not dataFrame
47+
with raises(TypeError):
48+
eda_utils_py.outlier_identifier("not dataframe")
49+
50+
# Test if columns input is not list
51+
with raises(TypeError):
52+
eda_utils_py.outlier_identifier(test_df, columns=2)
53+
54+
# Test if input column list is in the dataframe
55+
with raises(TypeError):
56+
eda_utils_py.outlier_identifier(test_df, columns=["not in"])
57+
58+
# Test if method input is not one of three methods provided
59+
with raises(TypeError):
60+
eda_utils_py.outlier_identifier(test_df, columns=["SepalLengthCm"], method = "no")
61+
62+
# Test if column selected included non-numeric columns
63+
with raises(Exception):
64+
eda_utils_py.outlier_identifier(test_df, columns=["Species"])
65+
66+
assert pd.DataFrame.equals(
67+
eda_utils_py.outlier_identifier(test_df), trim_output
68+
), "Default test not pass"
69+
assert pd.DataFrame.equals(
70+
eda_utils_py.outlier_identifier(data, method = "median"), median_output
71+
), "The median method is not correct"
72+
assert pd.DataFrame.equals(
73+
eda_utils_py.outlier_identifier(data, method = "mean"), mean_output
74+
), "The mean method is not correct"
75+
assert pd.DataFrame.equals(
76+
eda_utils_py.outlier_identifier(data, columns = ["SepalLengthCm"], method = "mean"), column_output
77+
), "The selected column method is not correct"

0 commit comments

Comments
 (0)