|
5 | 5 | import numpy as np |
6 | 6 |
|
7 | 7 |
|
8 | | - |
9 | 8 | def imputer(df, strategy="mean", fill_value=None): |
10 | 9 | """ |
11 | 10 | A function to implement imputation functionality for completing missing values. |
@@ -68,7 +67,6 @@ def imputer(df, strategy="mean", fill_value=None): |
68 | 67 | if isinstance(fill_value, type(None)) and strategy == "constant": |
69 | 68 | raise Exception("fill_value should be a number when strategy is 'constant'") |
70 | 69 |
|
71 | | - |
72 | 70 | result = pd.DataFrame() |
73 | 71 | if strategy == "mean": |
74 | 72 | result = df.apply(lambda x: x.fillna(x.mean()), axis=0) |
@@ -226,59 +224,54 @@ def outlier_identifier(dataframe, columns=None, method="trim"): |
226 | 224 | if columns is None: |
227 | 225 | for col in dataframe.columns: |
228 | 226 | if not is_numeric_dtype(dataframe[col]): |
229 | | - raise Exception("The given dataframe contains column that is not numeric column.") |
230 | | - |
| 227 | + raise Exception("The given dataframe contains column that is not numeric column.") |
| 228 | + |
231 | 229 | if columns is not None: |
232 | 230 | if not isinstance(columns, list): |
233 | 231 | raise TypeError("The argument @columns must be of type list") |
234 | | - |
235 | | - |
| 232 | + |
236 | 233 | for col in columns: |
237 | 234 | if col not in list(dataframe.columns): |
238 | | - raise Exception("The given column list contains column that is not exist in the given dataframe.") |
| 235 | + raise Exception("The given column list contains column that is not exist in the given dataframe.") |
239 | 236 | if not is_numeric_dtype(dataframe[col]): |
240 | 237 | raise Exception("The given column list contains column that is not numeric column.") |
241 | | - |
| 238 | + |
242 | 239 | if method not in ("trim", "median", "mean"): |
243 | 240 | raise Exception("The method must be -trim- or -median- or -mean-") |
244 | | - |
| 241 | + |
245 | 242 | df = dataframe.copy() |
246 | 243 | target_columns = [] |
247 | | - if(columns is None): |
248 | | - target_columns = list(df.columns.values.tolist()) |
| 244 | + if (columns is None): |
| 245 | + target_columns = list(df.columns.values.tolist()) |
249 | 246 | else: |
250 | 247 | target_columns = columns |
251 | | - |
| 248 | + |
252 | 249 | outlier_index = [] |
253 | 250 | for column in target_columns: |
254 | 251 | current_column = df[column] |
255 | 252 | mean = np.mean(current_column) |
256 | 253 | std = np.std(current_column) |
257 | | - threshold = 3 |
258 | | - |
259 | | - |
| 254 | + threshold = 3 |
| 255 | + |
260 | 256 | for i in range(len(current_column)): |
261 | 257 | current_item = current_column[i] |
262 | 258 | z = (current_item - mean) / std |
263 | 259 | if z >= threshold: |
264 | | - if(i not in outlier_index): |
| 260 | + if (i not in outlier_index): |
265 | 261 | outlier_index.append(i) |
266 | | - if(method == "mean"): |
| 262 | + if (method == "mean"): |
267 | 263 | df.at[i, column] = round(mean, 2) |
268 | | - if(method == "median"): |
| 264 | + if (method == "median"): |
269 | 265 | df.at[i, column] = np.median(current_column) |
270 | | - |
271 | | - |
272 | | - if(method == "trim"): |
| 266 | + |
| 267 | + if (method == "trim"): |
273 | 268 | df = df.drop(outlier_index) |
274 | | - |
| 269 | + |
275 | 270 | df.index = range(len(df)) |
276 | 271 | return df |
277 | 272 |
|
278 | 273 |
|
279 | | - |
280 | | - |
281 | | -def scale(dataframe, columns=None, scaler="standard"): |
| 274 | +def scale(dataframe, columns, scaler="standard"): |
282 | 275 | """ |
283 | 276 | A function to scale features either by using standard scaler or minmax scaler method |
284 | 277 |
|
@@ -415,5 +408,3 @@ def _minmax(dataframe): |
415 | 408 | res[feature_name] = (dataframe[feature_name] - min) / (max - min) |
416 | 409 |
|
417 | 410 | return res |
418 | | - |
419 | | - |
0 commit comments