Skip to content

Commit a34d4b5

Browse files
authored
Merge pull request #28 from wangjc640/main
Update outlier function
2 parents 325b7b0 + 54b914e commit a34d4b5

File tree

2 files changed

+154
-13
lines changed

2 files changed

+154
-13
lines changed

eda_utils_py/eda_utils_py.py

Lines changed: 78 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
import pandas as pd
33
from pandas.api.types import is_numeric_dtype
44
import numbers
5+
import numpy as np
6+
57

68

79
def imputer(df, strategy="mean", fill_value=None):
@@ -66,6 +68,7 @@ def imputer(df, strategy="mean", fill_value=None):
6668
if isinstance(fill_value, type(None)) and strategy == "constant":
6769
raise Exception("fill_value should be a number when strategy is 'constant'")
6870

71+
6972
result = pd.DataFrame()
7073
if strategy == "mean":
7174
result = df.apply(lambda x: x.fillna(x.mean()), axis=0)
@@ -181,7 +184,7 @@ def cor_map(dataframe, num_col, col_scheme="purpleorange"):
181184

182185
def outlier_identifier(dataframe, columns=None, method="trim", threshold=3):
    """
    Identify outliers with a z-score test and trim or replace them.

    A value is treated as an outlier when the absolute value of its z-score
    (computed with the population standard deviation of its column) is at
    least `threshold`.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The data to inspect. It is never modified; a new frame is returned.
    columns : list, default=None
        Names of the numeric columns to inspect. When None, every column of
        `dataframe` is inspected and all of them must be numeric.
    method : string, default="trim"
        The method of dealing with outliers.
            - if "trim" : rows holding an outlier in any inspected column are removed.
            - if "median" : outliers are replaced with the median of their column.
            - if "mean" : outliers are replaced with the mean of their column,
              rounded to 2 decimals.
        The replacement mean/median is computed from the whole original
        column, outliers included.
    threshold : number, default=3
        Minimum absolute z-score for a value to count as an outlier.

    Returns
    -------
    pandas.DataFrame
        A copy of `dataframe` with the outliers handled. The index is always
        reset to 0..n-1.

    Raises
    ------
    TypeError
        If `dataframe` is not a DataFrame, or `columns` is given but is not a list.
    Exception
        If a requested column is missing or not numeric, if `columns` is None
        and the frame holds a non-numeric column, or if `method` is unknown.

    Examples
    --------
    >>> import pandas as pd
    >>> from eda_utils_py import outlier_identifier

    >>> data = pd.DataFrame({
    ...     'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 50, 5.4, 5.0, 5.2, 5.3, 5.1],
    ...     'Species': ['Iris-setosa'] * 11
    ... })

    >>> len(outlier_identifier(data, columns=['SepalLengthCm']))
    10
    """
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("The argument @dataframe must be of pd.DataFrame")

    if columns is None:
        target_columns = list(dataframe.columns)
        for col in target_columns:
            if not is_numeric_dtype(dataframe[col]):
                raise Exception("The given dataframe contains column that is not numeric column.")
    else:
        if not isinstance(columns, list):
            raise TypeError("The argument @columns must be of type list")

        known_columns = list(dataframe.columns)
        for col in columns:
            if col not in known_columns:
                raise Exception("The given column list contains column that is not exist in the given dataframe.")
            if not is_numeric_dtype(dataframe[col]):
                raise Exception("The given column list contains column that is not numeric column.")
        target_columns = columns

    if method not in ("trim", "median", "mean"):
        raise Exception("The method must be -trim- or -median- or -mean-")

    df = dataframe.copy()
    # Positional (not label-based) row mask, so any index works: strings,
    # dates, duplicates, or a frame that was filtered earlier.
    rows_to_drop = np.zeros(len(df), dtype=bool)

    for column in target_columns:
        values = df[column]
        mean = values.mean()
        # ddof=0 (population std) matches np.std, which the z-test was built on.
        std = values.std(ddof=0)

        # A constant, empty or all-NaN column has no meaningful z-score;
        # `not std > 0` also catches NaN.
        if not std > 0:
            continue

        # abs() catches outliers on both sides of the mean. NaN z-scores
        # compare False and are therefore never flagged.
        is_outlier = (((values - mean) / std).abs() >= threshold).to_numpy()
        if not is_outlier.any():
            continue

        if method == "trim":
            rows_to_drop |= is_outlier
        elif method == "mean":
            df[column] = values.mask(is_outlier, round(mean, 2))
        else:  # method == "median"
            df[column] = values.mask(is_outlier, values.median())

    if method == "trim":
        df = df.loc[~rows_to_drop]

    return df.reset_index(drop=True)
277+
278+
279+
280+
220281
def scale(dataframe, columns=None, scaler="standard"):
221282
"""
222283
A function to scale features either by using standard scaler or minmax scaler method
@@ -317,6 +378,7 @@ def _standardize(dataframe):
317378
self : object
318379
Scaled dataset
319380
"""
381+
320382
res = dataframe.copy()
321383
for feature_name in dataframe.columns:
322384
mean = dataframe[feature_name].mean()
@@ -351,4 +413,7 @@ def _minmax(dataframe):
351413
max = dataframe[feature_name].max()
352414
min = dataframe[feature_name].min()
353415
res[feature_name] = (dataframe[feature_name] - min) / (max - min)
416+
354417
return res
418+
419+

tests/test_eda_utils_py.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
from pytest import raises
44
import pandas as pd
55
from pandas._testing import assert_frame_equal
6+
import numpy as np
7+
68

79

810
def test_version():
@@ -238,3 +240,77 @@ def test_scaler():
238240
assert pd.DataFrame.equals(
239241
minmax_scaled_mock_df_2, mock_df_2_minmax
240242
), "The returned dataframe using constant imputer is not correct"
243+
244+
245+
246+
def test_outlier_identifier():
    """Check input validation and the trim / median / mean strategies of outlier_identifier."""
    setosa = 'Iris-setosa'
    all_species = [setosa, 'Iris-virginica', 'Iris-germanica'] + [setosa] * 8
    kept_species = [setosa, 'Iris-virginica'] + [setosa] * 6

    def make_frame(sepal_length, sepal_width, petal_width, species):
        # Every fixture shares the same four columns in the same order.
        return pd.DataFrame({
            'SepalLengthCm': sepal_length,
            'SepalWidthCm': sepal_width,
            'PetalWidthCm': petal_width,
            'Species': species,
        })

    # Input: each numeric column holds exactly one extreme value
    # (rows 5, 2 and 10 respectively).
    test_df = make_frame(
        [5.1, 4.9, 4.7, 5.5, 5.1, 50, 5.4, 5.0, 5.2, 5.3, 5.1],
        [1.4, 1.4, 20, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
        [0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 5],
        all_species,
    )
    test_column = ['SepalLengthCm', 'SepalWidthCm', 'PetalWidthCm']

    # Extreme values swapped for the column median.
    median_output = make_frame(
        [5.1, 4.9, 4.7, 5.5, 5.1, 5.1, 5.4, 5.0, 5.2, 5.3, 5.1],
        [1.4, 1.4, 1.5, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
        [0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 0.4],
        all_species,
    )

    # Rows containing an extreme value removed.
    trim_output = make_frame(
        [5.1, 4.9, 5.5, 5.1, 5.4, 5.0, 5.2, 5.3],
        [1.4, 1.4, 2.0, 0.7, 1.2, 1.4, 1.8, 1.5],
        [0.2, 0.2, 0.3, 0.4, 0.5, 0.6, 0.4, 0.2],
        kept_species,
    )

    # Extreme values swapped for the (rounded) column mean.
    mean_output = make_frame(
        [5.1, 4.9, 4.7, 5.5, 5.1, 9.21, 5.4, 5.0, 5.2, 5.3, 5.1],
        [1.4, 1.4, 3.19, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
        [0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 0.77],
        all_species,
    )

    # Only SepalLengthCm processed; the other columns stay untouched.
    column_output = make_frame(
        [5.1, 4.9, 4.7, 5.5, 5.1, 9.21, 5.4, 5.0, 5.2, 5.3, 5.1],
        [1.4, 1.4, 20, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
        [0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 5],
        all_species,
    )

    # Invalid calls, in order: non-DataFrame input, non-list columns,
    # unknown column, unknown method, non-numeric column.
    invalid_calls = [
        (TypeError, ("not dataframe",), {}),
        (TypeError, (test_df,), {"columns": 2}),
        (Exception, (test_df,), {"columns": ["not in"]}),
        (Exception, (test_df,), {"columns": ["SepalLengthCm"], "method": "no"}),
        (Exception, (test_df,), {"columns": ["Species"]}),
    ]
    for expected_error, args, kwargs in invalid_calls:
        with raises(expected_error):
            eda_utils_py.outlier_identifier(*args, **kwargs)

    # Valid calls: (columns, extra keyword arguments, expected frame, failure message).
    valid_calls = [
        (test_column, {}, trim_output, "Default test not pass"),
        (test_column, {"method": "median"}, median_output, "The median method is not correct"),
        (test_column, {"method": "mean"}, mean_output, "The mean method is not correct"),
        (["SepalLengthCm"], {"method": "mean"}, column_output, "The selected column method is not correct"),
    ]
    for selected, kwargs, expected, message in valid_calls:
        actual = eda_utils_py.outlier_identifier(test_df, columns=selected, **kwargs)
        assert actual.equals(expected), message
316+

0 commit comments

Comments
 (0)