55import numpy as np
66
77
8-
98def imputer (df , strategy = "mean" , fill_value = None ):
109 """
1110 A function to implement imputation functionality for completing missing values.
@@ -68,7 +67,6 @@ def imputer(df, strategy="mean", fill_value=None):
6867 if isinstance (fill_value , type (None )) and strategy == "constant" :
6968 raise Exception ("fill_value should be a number when strategy is 'constant'" )
7069
71-
7270 result = pd .DataFrame ()
7371 if strategy == "mean" :
7472 result = df .apply (lambda x : x .fillna (x .mean ()), axis = 0 )
@@ -115,7 +113,7 @@ def cor_map(dataframe, num_col, col_scheme="purpleorange"):
115113 >> data = pd.DataFrame({
116114 >> 'SepalLengthCm':[5.1, 4.9, 4.7],
117115 >> 'SepalWidthCm':[1.4, 1.4, 1.3],
118- >> 'PetalWidthCm':[0.2, 0.2 , 0.2],
116+ >> 'PetalWidthCm':[0.2, 0.1 , 0.2],
119117 >> 'Species':['Iris-setosa','Iris-virginica', 'Iris-germanica']
120118 >> })
121119
@@ -165,7 +163,7 @@ def cor_map(dataframe, num_col, col_scheme="purpleorange"):
165163 .encode (
166164 x = alt .X ("var1" , title = None ),
167165 y = alt .Y ("var2" , title = None ),
168- color = alt .Color ("cor" , legend = None , scale = alt .Scale (scheme = col_scheme )),
166+ color = alt .Color ("cor" , title = 'Correlation' , scale = alt .Scale (scheme = col_scheme , domain = ( - 1 , 1 ) )),
169167 )
170168 .properties (title = "Correlation Matrix" , width = 400 , height = 400 )
171169 )
@@ -206,17 +204,17 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
206204
207205 Examples
208206 --------
209- >>> import pandas as pd
210- >>> from eda_utils_py import cor_map
207+ >> import pandas as pd
208+ >> from eda_utils_py import outlier_identifier
211209
212- >>> data = pd.DataFrame({
213- >>> 'SepalLengthCm':[5.1, 4.9, 4.7],
214- >>> 'SepalWidthCm':[1.4, 1.4, 99],
215- >>> 'PetalWidthCm:[0.2, 0.2, 0.2],
216- >>> 'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica']
217- >>> })
210+ >> data = pd.DataFrame({
211+ >> 'SepalLengthCm':[5.1, 4.9, 4.7],
212+ >> 'SepalWidthCm':[1.4, 1.4, 99],
213+ >> 'PetalWidthCm':[0.2, 0.2, 0.2],
214+ >> 'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica']
215+ >> })
218216
219- >>> outlier_identifier(data)
217+ >> outlier_identifier(data)
220218
221219
222220 """
@@ -226,59 +224,54 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
226224 if columns is None :
227225 for col in dataframe .columns :
228226 if not is_numeric_dtype (dataframe [col ]):
229- raise Exception ("The given dataframe contains column that is not numeric column." )
230-
227+ raise Exception ("The given dataframe contains column that is not numeric column." )
228+
231229 if columns is not None :
232230 if not isinstance (columns , list ):
233231 raise TypeError ("The argument @columns must be of type list" )
234-
235-
232+
236233 for col in columns :
237234 if col not in list (dataframe .columns ):
238- raise Exception ("The given column list contains column that is not exist in the given dataframe." )
235+ raise Exception ("The given column list contains column that is not exist in the given dataframe." )
239236 if not is_numeric_dtype (dataframe [col ]):
240237 raise Exception ("The given column list contains column that is not numeric column." )
241-
238+
242239 if method not in ("trim" , "median" , "mean" ):
243240 raise Exception ("The method must be -trim- or -median- or -mean-" )
244-
241+
245242 df = dataframe .copy ()
246243 target_columns = []
247- if (columns is None ):
248- target_columns = list (df .columns .values .tolist ())
244+ if (columns is None ):
245+ target_columns = list (df .columns .values .tolist ())
249246 else :
250247 target_columns = columns
251-
248+
252249 outlier_index = []
253250 for column in target_columns :
254251 current_column = df [column ]
255252 mean = np .mean (current_column )
256253 std = np .std (current_column )
257- threshold = 3
258-
259-
254+ threshold = 3
255+
260256 for i in range (len (current_column )):
261257 current_item = current_column [i ]
262258 z = (current_item - mean ) / std
263259 if z >= threshold :
264- if (i not in outlier_index ):
260+ if (i not in outlier_index ):
265261 outlier_index .append (i )
266- if (method == "mean" ):
262+ if (method == "mean" ):
267263 df .at [i , column ] = round (mean , 2 )
268- if (method == "median" ):
264+ if (method == "median" ):
269265 df .at [i , column ] = np .median (current_column )
270-
271-
272- if (method == "trim" ):
266+
267+ if (method == "trim" ):
273268 df = df .drop (outlier_index )
274-
269+
275270 df .index = range (len (df ))
276271 return df
277272
278273
279-
280-
281- def scale (dataframe , columns = None , scaler = "standard" ):
274+ def scale (dataframe , columns , scaler = "standard" ):
282275 """
283276 A function to scale features either by using standard scaler or minmax scaler method
284277
@@ -304,15 +297,22 @@ def scale(dataframe, columns=None, scaler="standard"):
304297 >> from eda_utils_py import scale
305298
306299 >> data = pd.DataFrame({
307- >> 'SepalLengthCm':[5. 1, 4.9, 4.7 ],
308- >> 'SepalWidthCm':[1. 4, 1.4 , 1.3 ],
309- >> 'PetalWidthCm:[0. 2, 0.2 , 0.2 ],
300+ >> 'SepalLengthCm':[1, 0, 0, 3, 4 ],
301+ >> 'SepalWidthCm':[4, 1, 1, 0, 1 ],
302+ >> 'PetalWidthCm':[2, 0, 0, 2, 1 ],
310303 >> 'Species':['Iris-setosa','Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-virginica']
311304 >> })
312305
313306 >> numerical_columns = ['SepalLengthCm','SepalWidthCm','PetalWidthCm']
314307
315308 >> scale(data, numerical_columns, scaler="minmax")
309+
310+ SepalLengthCm SepalWidthCm PetalWidthCm
311+ 0 0.25 1.00 1.0
312+ 1 0.00 0.25 0.0
313+ 2 0.00 0.25 0.0
314+ 3 0.75 0.00 1.0
315+ 4 1.00 0.25 0.5
316316 """
317317
318318 # Check if input data is of pd.DataFrame type
@@ -375,7 +375,7 @@ def _standardize(dataframe):
375375 The data frame to be used for EDA.
376376 Returns
377377 -------
378- self : object
378+ res : pandas.core.frame.DataFrame
379379 Scaled dataset
380380 """
381381
@@ -404,7 +404,7 @@ def _minmax(dataframe):
404404 The data frame to be used for EDA.
405405 Returns
406406 -------
407- self : object
407+ res : pandas.core.frame.DataFrame
408408 Scaled dataset
409409 """
410410
@@ -415,5 +415,3 @@ def _minmax(dataframe):
415415 res [feature_name ] = (dataframe [feature_name ] - min ) / (max - min )
416416
417417 return res
418-
419-
0 commit comments