@@ -113,7 +113,7 @@ def cor_map(dataframe, num_col, col_scheme="purpleorange"):
113113 >> data = pd.DataFrame({
114114 >> 'SepalLengthCm':[5.1, 4.9, 4.7],
115115 >> 'SepalWidthCm':[1.4, 1.4, 1.3],
116- >> 'PetalWidthCm':[0.2, 0.2 , 0.2],
116+ >> 'PetalWidthCm':[0.2, 0.1 , 0.2],
117117 >> 'Species':['Iris-setosa','Iris-virginica', 'Iris-germanica']
118118 >> })
119119
@@ -163,7 +163,7 @@ def cor_map(dataframe, num_col, col_scheme="purpleorange"):
163163 .encode (
164164 x = alt .X ("var1" , title = None ),
165165 y = alt .Y ("var2" , title = None ),
166- color = alt .Color ("cor" , legend = None , scale = alt .Scale (scheme = col_scheme )),
166+ color = alt .Color ("cor" , title = 'Correlation' , scale = alt .Scale (scheme = col_scheme , domain = ( - 1 , 1 ) )),
167167 )
168168 .properties (title = "Correlation Matrix" , width = 400 , height = 400 )
169169 )
@@ -185,35 +185,37 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
185185 A function that identifies outliers by z-score with a threshold of 3 and deals with them based on the method the user chooses
186186
187187 Parameters
188- ----------
188+ ----------
189189 dataframe : pandas.core.frame.DataFrame
190190 The target dataframe where the function is performed.
191191 columns : list, default=None
192192 The target columns on which the function is to be performed. Default is None, in which case the function checks all columns
193193 method : string
194- The method of dealing with outliers.
194+ The method of dealing with outliers.
195195 - if "trim" : we completely remove data points that are outliers.
196196 - if "median" : we replace outliers with median values
197197 - if "mean" : we replace outliers with mean values
198198
199+
199200 Returns
200201 -------
201202 pandas.core.frame.DataFrame
202203 a dataframe in which the outliers have been processed by the chosen method
203-
204+
204205 Examples
205206 --------
206- >>> import pandas as pd
207- >>> from eda_utils_py import cor_map
207+ >> import pandas as pd
208+ >> from eda_utils_py import outlier_identifier
208209
209- >>> data = pd.DataFrame({
210- >>> 'SepalLengthCm':[5.1, 4.9, 4.7],
211- >>> 'SepalWidthCm':[1.4, 1.4, 99],
212- >>> 'PetalWidthCm:[0.2, 0.2, 0.2],
213- >>> 'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica']
214- >>> })
210+ >> data = pd.DataFrame({
211+ >> 'SepalLengthCm':[5.1, 4.9, 4.7],
212+ >> 'SepalWidthCm':[1.4, 1.4, 99],
213+ >> 'PetalWidthCm':[0.2, 0.2, 0.2],
214+ >> 'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica']
215+ >> })
216+
217+ >> outlier_identifier(data, columns=['SepalLengthCm', 'SepalWidthCm', 'PetalWidthCm'])
215218
216- >>> outlier_identifier(data)
217219
218220 """
219221 if not isinstance (dataframe , pd .DataFrame ):
@@ -222,58 +224,54 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
222224 if columns is None :
223225 for col in dataframe .columns :
224226 if not is_numeric_dtype (dataframe [col ]):
225- raise Exception ("The given dataframe contains column that is not numeric column." )
226-
227+ raise Exception ("The given dataframe contains column that is not numeric column." )
228+
227229 if columns is not None :
228230 if not isinstance (columns , list ):
229231 raise TypeError ("The argument @columns must be of type list" )
230-
231-
232+
232233 for col in columns :
233234 if col not in list (dataframe .columns ):
234- raise Exception ("The given column list contains column that is not exist in the given dataframe." )
235+ raise Exception ("The given column list contains column that is not exist in the given dataframe." )
235236 if not is_numeric_dtype (dataframe [col ]):
236237 raise Exception ("The given column list contains column that is not numeric column." )
237-
238+
238239 if method not in ("trim" , "median" , "mean" ):
239240 raise Exception ("The method must be -trim- or -median- or -mean-" )
240241
241-
242242 df = dataframe .copy ()
243243 target_columns = []
244- if (columns is None ):
245- target_columns = list (df .columns .values .tolist ())
244+ if (columns is None ):
245+ target_columns = list (df .columns .values .tolist ())
246246 else :
247247 target_columns = columns
248-
248+
249249 outlier_index = []
250250 for column in target_columns :
251251 current_column = df [column ]
252252 mean = np .mean (current_column )
253253 std = np .std (current_column )
254- threshold = 3
255-
256-
254+ threshold = 3
255+
257256 for i in range (len (current_column )):
258257 current_item = current_column [i ]
259258 z = (current_item - mean ) / std
260259 if z >= threshold :
261- if (i not in outlier_index ):
260+ if (i not in outlier_index ):
262261 outlier_index .append (i )
263- if (method == "mean" ):
262+ if (method == "mean" ):
264263 df .at [i , column ] = round (mean , 2 )
265- if (method == "median" ):
264+ if (method == "median" ):
266265 df .at [i , column ] = np .median (current_column )
267-
268-
269- if (method == "trim" ):
266+
267+ if (method == "trim" ):
270268 df = df .drop (outlier_index )
271-
269+
272270 df .index = range (len (df ))
273271 return df
274272
275273
276- def scale (dataframe , columns = None , scaler = "standard" ):
274+ def scale (dataframe , columns , scaler = "standard" ):
277275 """
278276 A function to scale features either by using standard scaler or minmax scaler method
279277
@@ -299,15 +297,22 @@ def scale(dataframe, columns=None, scaler="standard"):
299297 >> from eda_utils_py import scale
300298
301299 >> data = pd.DataFrame({
302- >> 'SepalLengthCm':[5. 1, 4.9, 4.7 ],
303- >> 'SepalWidthCm':[1. 4, 1.4 , 1.3 ],
304- >> 'PetalWidthCm:[0. 2, 0.2 , 0.2 ],
300+ >> 'SepalLengthCm':[1, 0, 0, 3, 4 ],
301+ >> 'SepalWidthCm':[4, 1, 1, 0, 1 ],
302+ >> 'PetalWidthCm':[2, 0, 0, 2, 1 ],
305303 >> 'Species':['Iris-setosa','Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-virginica']
306304 >> })
307305
308306 >> numerical_columns = ['SepalLengthCm','SepalWidthCm','PetalWidthCm']
309307
310308 >> scale(data, numerical_columns, scaler="minmax")
309+
310+ SepalLengthCm SepalWidthCm PetalWidthCm
311+ 0 0.25 1.00 1.0
312+ 1 0.00 0.25 0.0
313+ 2 0.00 0.25 0.0
314+ 3 0.75 0.00 1.0
315+ 4 1.00 0.25 0.5
311316 """
312317
313318 # Check if input data is of pd.DataFrame type
@@ -370,9 +375,10 @@ def _standardize(dataframe):
370375 The data frame to be used for EDA.
371376 Returns
372377 -------
373- self : object
378+ res : pandas.core.frame.DataFrame
374379 Scaled dataset
375380 """
381+
376382 res = dataframe .copy ()
377383 for feature_name in dataframe .columns :
378384 mean = dataframe [feature_name ].mean ()
@@ -398,7 +404,7 @@ def _minmax(dataframe):
398404 The data frame to be used for EDA.
399405 Returns
400406 -------
401- self : object
407+ res : pandas.core.frame.DataFrame
402408 Scaled dataset
403409 """
404410
@@ -407,4 +413,5 @@ def _minmax(dataframe):
407413 max = dataframe [feature_name ].max ()
408414 min = dataframe [feature_name ].min ()
409415 res [feature_name ] = (dataframe [feature_name ] - min ) / (max - min )
410- return res
416+
417+ return res
0 commit comments