1- import pandas as pd
21import altair as alt
2+ import pandas as pd
33from pandas .api .types import is_numeric_dtype
44import numbers
55
@@ -28,16 +28,16 @@ def imputer(df, strategy="mean", fill_value=None):
2828
2929 Examples
3030 ---------
31- >>> import pandas as pd
32- >>> from eda_utils_py import cor_map
31+ >> import pandas as pd
32+ >> from eda_utils_py import cor_map
3333
34- >>> data = pd.DataFrame({
35- >>> 'SepalLengthCm':[5.1, 4.9, 4.7],
36- >>> 'SepalWidthCm':[1.4, 1.4, 1.3],
37- >>> 'PetalWidthCm':[0.2, None, 0.2]
38- >>> })
34+ >> data = pd.DataFrame({
35+ >> 'SepalLengthCm':[5.1, 4.9, 4.7],
36+ >> 'SepalWidthCm':[1.4, 1.4, 1.3],
37+ >> 'PetalWidthCm':[0.2, None, 0.2]
38+ >> })
3939
40- >>> imputer(data, numerical_columns)
40+ >> imputer(data, numerical_columns)
4141 SepalLengthCm SepalWidthCm PetalWidthCm
4242 0 5.1 1.4 0.2
4343 1 4.9 1.4 0.2
@@ -54,7 +54,7 @@ def imputer(df, strategy="mean", fill_value=None):
5454
5555 # Tests whether input fill_value is of type numbers or None
5656 if not isinstance (fill_value , type (None )) and not isinstance (
57- fill_value , numbers .Number
57+ fill_value , numbers .Number
5858 ):
5959 raise TypeError ("fill_value must be of type None or numeric type" )
6060
@@ -83,9 +83,7 @@ def imputer(df, strategy="mean", fill_value=None):
8383 return result
8484
8585
86-
8786def cor_map (dataframe , num_col , col_scheme = "purpleorange" ):
88-
8987 """
9088 A function to implement a correlation heatmap including coefficients based on given numeric columns of a data frame.
9189
@@ -108,19 +106,18 @@ def cor_map(dataframe, num_col, col_scheme="purpleorange"):
108106
109107 Examples
110108 ---------
111- >>> import pandas as pd
112- >>> from eda_utils_py import cor_map
113-
114- >>> data = pd.DataFrame({
115- >>> 'SepalLengthCm':[5.1, 4.9, 4.7],
116- >>> 'SepalWidthCm':[1.4, 1.4, 1.3],
117- >>> 'PetalWidthCm':[0.2, 0.2, 0.2],
118- >>> 'Species':['Iris-setosa','Iris-virginica', 'Iris-germanica']
119- >>> })
120-
121- >>> numerical_columns = ['SepalLengthCm','SepalWidthCm','PetalWidthCm']
122- >>> cor_map(data, numerical_columns, col_scheme = 'purpleorange')
123-
109+ >> import pandas as pd
110+ >> from eda_utils_py import cor_map
111+
112+ >> data = pd.DataFrame({
113+ >> 'SepalLengthCm':[5.1, 4.9, 4.7],
114+ >> 'SepalWidthCm':[1.4, 1.4, 1.3],
115+ >> 'PetalWidthCm':[0.2, 0.2, 0.2],
116+ >> 'Species':['Iris-setosa','Iris-virginica', 'Iris-germanica']
117+ >> })
118+
119+ >> numerical_columns = ['SepalLengthCm','SepalWidthCm','PetalWidthCm']
120+ >> cor_map(data, numerical_columns, col_scheme = 'purpleorange')
124121 """
125122
126123 # Tests whether input data is of pd.DataFrame type
@@ -161,13 +158,13 @@ def cor_map(dataframe, num_col, col_scheme="purpleorange"):
161158
162159 plot = (
163160 alt .Chart (corr_matrix )
164- .mark_rect ()
165- .encode (
161+ .mark_rect ()
162+ .encode (
166163 x = alt .X ("var1" , title = None ),
167164 y = alt .Y ("var2" , title = None ),
168165 color = alt .Color ("cor" , legend = None , scale = alt .Scale (scheme = col_scheme )),
169166 )
170- .properties (title = "Correlation Matrix" , width = 400 , height = 400 )
167+ .properties (title = "Correlation Matrix" , width = 400 , height = 400 )
171168 )
172169
173170 text = plot .mark_text (size = 15 ).encode (
@@ -204,52 +201,154 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
204201
205202 Examples
206203 --------
207- >>> import pandas as pd
208- >>> from eda_utils_py import cor_map
204+ >> import pandas as pd
205+ >> from eda_utils_py import cor_map
209206
210- >>> data = pd.DataFrame({
211- >>> 'SepalLengthCm':[5.1, 4.9, 4.7],
212- >>> 'SepalWidthCm':[1.4, 1.4, 9999999.99],
213- >>> 'PetalWidthCm:[0.2, 0.2, 0.2],
214- >>> 'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica']
215- >>> })
207+ >> data = pd.DataFrame({
208+ >> 'SepalLengthCm':[5.1, 4.9, 4.7],
209+ >> 'SepalWidthCm':[1.4, 1.4, 9999999.99],
210+ >> 'PetalWidthCm:[0.2, 0.2, 0.2],
211+ >> 'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica']
212+ >> })
216213
217- >>> outlier_identifier(data)
214+ >> outlier_identifier(data)
218215
219216 """
220217 pass
221218
222219
def scale(dataframe, columns=None, scaler="standard"):
    """
    A function to scale numeric features using either the standard scaler or the minmax scaler method

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The data frame to be used for EDA.
    columns : list
        A list of string column names with numeric data from the data frame that we wish to scale.
        Although the signature default is None, a list must be supplied: any non-list value
        (including None) raises TypeError.
    scaler : str, default="standard"
        A string to specify the scaling method to be used
        - if "standard": it transforms features by centering the distribution of the data on the value 0 and the
          standard deviation to the value 1.
        - if "minmax": it transforms features by rescaling each feature to the range between 0 and 1.
        - any other string is treated the same as "standard".

    Returns
    -------
    dataframe : pandas.core.frame.DataFrame
        A new data frame containing only the requested columns, scaled. The input data frame is not modified.

    Raises
    ------
    TypeError
        If dataframe is not a pandas.DataFrame, columns is not a list, an entry of columns is not a str,
        or scaler is not a str.
    Exception
        If a name in columns is not a column of dataframe, or a selected column is not numeric.

    Examples
    --------
    >> import pandas as pd
    >> from eda_utils_py import scale

    >> data = pd.DataFrame({
    >>     'SepalLengthCm':[5.1, 4.9, 4.7],
    >>     'SepalWidthCm':[1.4, 1.4, 1.3],
    >>     'PetalWidthCm':[0.2, 0.2, 0.2],
    >>     'Species':['Iris-setosa','Iris-virginica', 'Iris-germanica']
    >> })

    >> numerical_columns = ['SepalLengthCm','SepalWidthCm','PetalWidthCm']

    >> scale(data, numerical_columns, scaler="minmax")
    """

    # Check if input data is of pd.DataFrame type
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("The input dataframe must be of pd.DataFrame type")

    # Check if input columns is of type list
    if not isinstance(columns, list):
        raise TypeError("The input columns must be of type list")

    # Each validation runs over the whole list before the next one starts, so
    # the first error reported depends on the kind of problem rather than on
    # the position of the offending entry.
    for col in columns:
        if not isinstance(col, str):
            raise TypeError("The name of features in columns list must all be str")

    # Check if all input columns exist in the input data
    existing_columns = set(dataframe.columns)
    for col in columns:
        if col not in existing_columns:
            raise Exception("The given column names must exist in the given dataframe.")

    # Check if all input columns are numeric columns
    for col in columns:
        if not is_numeric_dtype(dataframe[col]):
            raise Exception("The given numerical columns must all be numeric.")

    # Check if scaler is of type str
    if not isinstance(scaler, str):
        raise TypeError("Scaler must be of type str")

    selected = dataframe[columns]
    if scaler == "minmax":
        return _minmax(selected)

    # Every scaler name other than "minmax" falls back to standard scaling.
    return _standardize(selected)
301+
302+
303+ def _standardize (dataframe ):
304+ """Transform features by centering the distribution of the data
305+ on the value 0 and the standard deviation to the value 1.
306+
307+ The transformation is given by:
308+
309+ scaled_value = (value - mean) / standard deviation
310+
311+ Parameters
312+ ----------
313+ dataframe : pandas.DataFrame
314+ The data frame to be used for EDA.
315+ Returns
316+ -------
317+ self : object
318+ Scaled dataset
319+ """
320+ res = dataframe .copy ()
321+ for feature_name in dataframe .columns :
322+ mean = dataframe [feature_name ].mean ()
323+ stdev = dataframe [feature_name ].std ()
324+ res [feature_name ] = (dataframe [feature_name ] - mean ) / stdev
325+ return res
326+
327+
328+ def _minmax (dataframe ):
329+ """Transform features by rescaling each feature to the range between 0 and 1.
330+ The transformation is given by:
331+
332+ scaled_value = (feature_value - min) / (mix - min)
333+
334+ where min, max = feature_range.
335+
336+ This transformation is often used as an alternative to zero mean,
337+ unit variance scaling.
338+
339+ Parameters
340+ ----------
341+ dataframe : pandas.DataFrame
342+ The data frame to be used for EDA.
343+ Returns
344+ -------
345+ self : object
346+ Scaled dataset
347+ """
348+
349+ res = dataframe .copy ()
350+ for feature_name in dataframe .columns :
351+ max = dataframe [feature_name ].max ()
352+ min = dataframe [feature_name ].min ()
353+ res [feature_name ] = (dataframe [feature_name ] - min ) / (max - min )
354+ return res
0 commit comments