Skip to content

Commit d074243

Browse files
authored
Merge pull request #29 from UBC-MDS/autformat-script
Autoformat script and make columns a required argument in scale function
2 parents ce8cfab + 3a34a14 commit d074243

File tree

2 files changed

+42
-48
lines changed

2 files changed

+42
-48
lines changed

eda_utils_py/eda_utils_py.py

Lines changed: 18 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
import numpy as np
66

77

8-
98
def imputer(df, strategy="mean", fill_value=None):
109
"""
1110
A function to implement imputation functionality for completing missing values.
@@ -68,7 +67,6 @@ def imputer(df, strategy="mean", fill_value=None):
6867
if isinstance(fill_value, type(None)) and strategy == "constant":
6968
raise Exception("fill_value should be a number when strategy is 'constant'")
7069

71-
7270
result = pd.DataFrame()
7371
if strategy == "mean":
7472
result = df.apply(lambda x: x.fillna(x.mean()), axis=0)
@@ -226,59 +224,54 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
226224
if columns is None:
227225
for col in dataframe.columns:
228226
if not is_numeric_dtype(dataframe[col]):
229-
raise Exception("The given dataframe contains column that is not numeric column.")
230-
227+
raise Exception("The given dataframe contains column that is not numeric column.")
228+
231229
if columns is not None:
232230
if not isinstance(columns, list):
233231
raise TypeError("The argument @columns must be of type list")
234-
235-
232+
236233
for col in columns:
237234
if col not in list(dataframe.columns):
238-
raise Exception("The given column list contains column that is not exist in the given dataframe.")
235+
raise Exception("The given column list contains column that is not exist in the given dataframe.")
239236
if not is_numeric_dtype(dataframe[col]):
240237
raise Exception("The given column list contains column that is not numeric column.")
241-
238+
242239
if method not in ("trim", "median", "mean"):
243240
raise Exception("The method must be -trim- or -median- or -mean-")
244-
241+
245242
df = dataframe.copy()
246243
target_columns = []
247-
if(columns is None):
248-
target_columns = list(df.columns.values.tolist())
244+
if (columns is None):
245+
target_columns = list(df.columns.values.tolist())
249246
else:
250247
target_columns = columns
251-
248+
252249
outlier_index = []
253250
for column in target_columns:
254251
current_column = df[column]
255252
mean = np.mean(current_column)
256253
std = np.std(current_column)
257-
threshold = 3
258-
259-
254+
threshold = 3
255+
260256
for i in range(len(current_column)):
261257
current_item = current_column[i]
262258
z = (current_item - mean) / std
263259
if z >= threshold:
264-
if(i not in outlier_index):
260+
if (i not in outlier_index):
265261
outlier_index.append(i)
266-
if(method == "mean"):
262+
if (method == "mean"):
267263
df.at[i, column] = round(mean, 2)
268-
if(method == "median"):
264+
if (method == "median"):
269265
df.at[i, column] = np.median(current_column)
270-
271-
272-
if(method == "trim"):
266+
267+
if (method == "trim"):
273268
df = df.drop(outlier_index)
274-
269+
275270
df.index = range(len(df))
276271
return df
277272

278273

279-
280-
281-
def scale(dataframe, columns=None, scaler="standard"):
274+
def scale(dataframe, columns, scaler="standard"):
282275
"""
283276
A function to scale features either by using standard scaler or minmax scaler method
284277
@@ -415,5 +408,3 @@ def _minmax(dataframe):
415408
res[feature_name] = (dataframe[feature_name] - min) / (max - min)
416409

417410
return res
418-
419-

tests/test_eda_utils_py.py

Lines changed: 24 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
import numpy as np
77

88

9-
109
def test_version():
1110
assert __version__ == "0.1.0"
1211

@@ -188,8 +187,10 @@ def test_scaler():
188187
)
189188

190189
mock_df_1_standard = pd.DataFrame(
191-
{"col1": [-0.3302891295379082, -0.8807710121010884, -0.8807710121010884, 0.7706746355884523, 1.3211565181516325],
192-
"col2": [1.714389230829046, -0.26375218935831474, -0.26375218935831474, -0.9231326627541017, -0.26375218935831474],
190+
{"col1": [-0.3302891295379082, -0.8807710121010884, -0.8807710121010884, 0.7706746355884523,
191+
1.3211565181516325],
192+
"col2": [1.714389230829046, -0.26375218935831474, -0.26375218935831474, -0.9231326627541017,
193+
-0.26375218935831474],
193194
"col3": [1.0, -1.0, -1.0, 1.0, 0.0]}
194195
)
195196

@@ -233,7 +234,6 @@ def test_scaler():
233234
minmax_scaled_mock_df_1, mock_df_1_minmax
234235
), "The returned dataframe using standard scaler method is not correct"
235236

236-
237237
assert pd.DataFrame.equals(
238238
standard_scaled_mock_df_2, mock_df_2_standard
239239
), "The returned dataframe using most_frequent inputer is not correct"
@@ -242,43 +242,47 @@ def test_scaler():
242242
), "The returned dataframe using constant imputer is not correct"
243243

244244

245-
246245
def test_outlier_identifier():
247246
test_df = pd.DataFrame({
248247
'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 50, 5.4, 5.0, 5.2, 5.3, 5.1],
249248
'SepalWidthCm': [1.4, 1.4, 20, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
250-
'PetalWidthCm' :[0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 5],
251-
'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
249+
'PetalWidthCm': [0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 5],
250+
'Species': ['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
251+
'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
252252
})
253253

254254
test_column = ['SepalLengthCm', 'SepalWidthCm', 'PetalWidthCm']
255255

256256
median_output = pd.DataFrame({
257257
'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 5.1, 5.4, 5.0, 5.2, 5.3, 5.1],
258258
'SepalWidthCm': [1.4, 1.4, 1.5, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
259-
'PetalWidthCm' :[0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 0.4],
260-
'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
259+
'PetalWidthCm': [0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 0.4],
260+
'Species': ['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
261+
'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
261262
})
262263

263264
trim_output = pd.DataFrame({
264265
'SepalLengthCm': [5.1, 4.9, 5.5, 5.1, 5.4, 5.0, 5.2, 5.3],
265266
'SepalWidthCm': [1.4, 1.4, 2.0, 0.7, 1.2, 1.4, 1.8, 1.5],
266-
'PetalWidthCm' :[0.2, 0.2, 0.3, 0.4, 0.5, 0.6, 0.4, 0.2],
267-
'Species':['Iris-setosa', 'Iris-virginica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
267+
'PetalWidthCm': [0.2, 0.2, 0.3, 0.4, 0.5, 0.6, 0.4, 0.2],
268+
'Species': ['Iris-setosa', 'Iris-virginica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
269+
'Iris-setosa', 'Iris-setosa']
268270
})
269271

270272
mean_output = pd.DataFrame({
271273
'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 9.21, 5.4, 5.0, 5.2, 5.3, 5.1],
272274
'SepalWidthCm': [1.4, 1.4, 3.19, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
273-
'PetalWidthCm' :[0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 0.77],
274-
'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
275+
'PetalWidthCm': [0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 0.77],
276+
'Species': ['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
277+
'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
275278
})
276279

277-
column_output= pd.DataFrame({
280+
column_output = pd.DataFrame({
278281
'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 9.21, 5.4, 5.0, 5.2, 5.3, 5.1],
279282
'SepalWidthCm': [1.4, 1.4, 20, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
280-
'PetalWidthCm' :[0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 5],
281-
'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
283+
'PetalWidthCm': [0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 5],
284+
'Species': ['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
285+
'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
282286
})
283287

284288
# Test if the input is not a DataFrame
@@ -295,7 +299,7 @@ def test_outlier_identifier():
295299

296300
# Test if method input is not one of three methods provided
297301
with raises(Exception):
298-
eda_utils_py.outlier_identifier(test_df, columns=["SepalLengthCm"], method = "no")
302+
eda_utils_py.outlier_identifier(test_df, columns=["SepalLengthCm"], method="no")
299303

300304
# Test if column selected included non-numeric columns
301305
with raises(Exception):
@@ -305,12 +309,11 @@ def test_outlier_identifier():
305309
eda_utils_py.outlier_identifier(test_df, test_column), trim_output
306310
), "Default test not pass"
307311
assert pd.DataFrame.equals(
308-
eda_utils_py.outlier_identifier(test_df, test_column,method = "median"), median_output
312+
eda_utils_py.outlier_identifier(test_df, test_column, method="median"), median_output
309313
), "The median method is not correct"
310314
assert pd.DataFrame.equals(
311-
eda_utils_py.outlier_identifier(test_df, test_column, method = "mean"), mean_output
315+
eda_utils_py.outlier_identifier(test_df, test_column, method="mean"), mean_output
312316
), "The mean method is not correct"
313317
assert pd.DataFrame.equals(
314-
eda_utils_py.outlier_identifier(test_df, columns = ["SepalLengthCm"], method = "mean"), column_output
318+
eda_utils_py.outlier_identifier(test_df, columns=["SepalLengthCm"], method="mean"), column_output
315319
), "The selected column method is not correct"
316-

0 commit comments

Comments
 (0)