Skip to content

Commit 99e9fc7

Browse files
committed
debug
1 parent 8d68b84 commit 99e9fc7

File tree

2 files changed

+42
-28
lines changed

2 files changed

+42
-28
lines changed

eda_utils_py/eda_utils_py.py

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -109,21 +109,26 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
109109
110110
>>> data = pd.DataFrame({
111111
>>> 'SepalLengthCm':[5.1, 4.9, 4.7],
112-
>>> 'SepalWidthCm':[1.4, 1.4, 9999999.99],
112+
>>> 'SepalWidthCm':[1.4, 1.4, 99],
113113
>>> 'PetalWidthCm':[0.2, 0.2, 0.2],
114114
>>> 'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica']
115115
>>> })
116116
117117
>>> outlier_identifier(data)
118118
119119
"""
120-
121120
if not isinstance(dataframe, pd.DataFrame):
122121
raise TypeError("The argument @dataframe must be of pd.DataFrame")
123122

123+
if columns is None:
124+
for col in dataframe.columns:
125+
if not is_numeric_dtype(dataframe[col]):
126+
raise Exception("The given dataframe contains column that is not numeric column.")
127+
124128
if columns is not None:
125129
if not isinstance(columns, list):
126130
raise TypeError("The argument @columns must be of type list")
131+
127132

128133
for col in columns:
129134
if col not in list(dataframe.columns):
@@ -135,34 +140,38 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
135140
raise Exception("The method must be -trim- or -median- or -mean-")
136141

137142

143+
df = dataframe.copy()
138144
target_columns = []
139145
if(columns is None):
140-
target_columns = list(dataframe.columns.values.tolist())
146+
target_columns = list(df.columns.values.tolist())
147+
else:
148+
target_columns = columns
141149

142-
143-
144150
outlier_index = []
145151
for column in target_columns:
146-
current_column = dataframe[column]
152+
current_column = df[column]
147153
mean = np.mean(current_column)
148154
std = np.std(current_column)
149155
threshold = 3
150156

157+
151158
for i in range(len(current_column)):
152159
current_item = current_column[i]
153160
z = (current_item - mean) / std
154161
if z >= threshold:
155162
if(i not in outlier_index):
156163
outlier_index.append(i)
157-
if(method == "median"):
158-
dataframe[column][i] = np.median(current_column)
159164
if(method == "mean"):
160-
dataframe[column][i] = np.mean(current_column)
165+
df.at[i, column] = round(mean, 2)
166+
if(method == "median"):
167+
df.at[i, column] = np.median(current_column)
168+
161169

162170
if(method == "trim"):
163-
dataframe = dataframe.drop(outlier_index)
164-
165-
return dataframe
171+
df = df.drop(outlier_index)
172+
173+
df.index = range(len(df))
174+
return df
166175

167176

168177
def scale(dataframe, columns=None):

tests/test_eda_utils_py.py

Lines changed: 21 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
from eda_utils_py import __version__
22
from eda_utils_py import eda_utils_py
3+
import pandas as pd
4+
import altair as alt
5+
from pandas.api.types import is_numeric_dtype
6+
import numpy as np
7+
38

49
def test_version():
510
assert __version__ == '0.1.0'
@@ -10,7 +15,7 @@ def test_outlier_identifier():
1015
'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 50, 5.4, 5.0, 5.2, 5.3, 5.1],
1116
'SepalWidthCm': [1.4, 1.4, 20, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
1217
'PetalWidthCm' :[0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 5],
13-
'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
18+
'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
1419
})
1520

1621
test_column = ['SepalLengthCm', 'SepalWidthCm', 'PetalWidthCm']
@@ -19,59 +24,59 @@ def test_outlier_identifier():
1924
'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 5.1, 5.4, 5.0, 5.2, 5.3, 5.1],
2025
'SepalWidthCm': [1.4, 1.4, 1.5, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
2126
'PetalWidthCm' :[0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 0.4],
22-
'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
27+
'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
2328
})
2429

2530
trim_output = pd.DataFrame({
2631
'SepalLengthCm': [5.1, 4.9, 5.5, 5.1, 5.4, 5.0, 5.2, 5.3],
2732
'SepalWidthCm': [1.4, 1.4, 2.0, 0.7, 1.2, 1.4, 1.8, 1.5],
28-
'PetalWidthCm' :[0.2, 0.2, 0.3, 0.4 0.5, 0.6, 0.4, 0.2],
29-
'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
33+
'PetalWidthCm' :[0.2, 0.2, 0.3, 0.4, 0.5, 0.6, 0.4, 0.2],
34+
'Species':['Iris-setosa', 'Iris-virginica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
3035
})
3136

3237
mean_output = pd.DataFrame({
3338
'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 9.21, 5.4, 5.0, 5.2, 5.3, 5.1],
3439
'SepalWidthCm': [1.4, 1.4, 3.19, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
3540
'PetalWidthCm' :[0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 0.77],
36-
'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
41+
'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
3742
})
3843

3944
column_output= pd.DataFrame({
4045
'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 9.21, 5.4, 5.0, 5.2, 5.3, 5.1],
4146
'SepalWidthCm': [1.4, 1.4, 20, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
4247
'PetalWidthCm' :[0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 5],
43-
'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
48+
'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
4449
})
4550

4651
# Test if the input is not a DataFrame
47-
with raises(TypeError):
52+
with raise(TypeError):
4853
eda_utils_py.outlier_identifier("not dataframe")
4954

5055
# Test if columns input is not list
51-
with raises(TypeError):
56+
with raise(TypeError):
5257
eda_utils_py.outlier_identifier(test_df, columns=2)
5358

5459
# Test if input column list is in the dataframe
55-
with raises(TypeError):
60+
with raise(TypeError):
5661
eda_utils_py.outlier_identifier(test_df, columns=["not in"])
5762

5863
# Test if method input is not one of three methods provided
59-
with raises(TypeError):
64+
with raise(TypeError):
6065
eda_utils_py.outlier_identifier(test_df, columns=["SepalLengthCm"], method = "no")
6166

6267
# Test if column selected included non-numeric columns
63-
with raises(Exception):
68+
with raise(Exception):
6469
eda_utils_py.outlier_identifier(test_df, columns=["Species"])
6570

6671
assert pd.DataFrame.equals(
67-
eda_utils_py.outlier_identifier(test_df), trim_output
72+
outlier_identifier(test_df, test_column), trim_output
6873
), "Default test not pass"
6974
assert pd.DataFrame.equals(
70-
eda_utils_py.outlier_identifier(data, method = "median"), median_output
75+
outlier_identifier(test_df, test_column,method = "median"), median_output
7176
), "The median method is not correct"
7277
assert pd.DataFrame.equals(
73-
eda_utils_py.outlier_identifier(data, method = "mean"), mean_output
78+
outlier_identifier(test_df, test_column, method = "mean"), mean_output
7479
), "The mean method is not correct"
7580
assert pd.DataFrame.equals(
76-
eda_utils_py.outlier_identifier(data, columns = ["SepalLengthCm"], method = "mean"), column_output
77-
), "The selected column method is not correct"
81+
outlier_identifier(test_df, columns = ["SepalLengthCm"], method = "mean"), column_output
82+
), "The selected column method is not correct"

0 commit comments

Comments
 (0)