11import pandas as pd
22import altair as alt
33from pandas .api .types import is_numeric_dtype
4+ import numbers
45
def imputer(df, strategy="mean", fill_value=None):
    """
    Replace the missing values of every column of a data frame.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        A dataframe that might contain missing data. It is not modified;
        a new dataframe is returned.
    strategy : string, default="mean"
        The imputation strategy.
        - If "mean", then replace missing values using the mean along each column. Can only be used with numeric data.
        - If "median", then replace missing values using the median along each column. Can only be used with numeric data.
        - If "most_frequent", then replace missing values using the most frequent value along each column. Can be used with strings or numeric data. If there is more than one such value, only the smallest is used. A column without any observed value is left unchanged.
        - If "constant", then replace missing values with fill_value.
    fill_value : numerical value, default=None
        The replacement value. It must be a number when strategy == "constant"
        and must be left as None for every other strategy.

    Returns
    -------
    pandas.core.frame.DataFrame
        A new dataframe in which the missing values have been replaced.

    Raises
    ------
    TypeError
        If df is not a pandas DataFrame, strategy is not a str, or fill_value
        is neither None nor a number.
    ValueError
        If strategy is not one of the supported names, or if fill_value is
        inconsistent with strategy (missing for "constant", given otherwise).

    Examples
    --------
    >>> import pandas as pd
    >>> from eda_utils_py import imputer

    >>> data = pd.DataFrame({
    ...     'SepalLengthCm': [5.1, 4.9, None],
    ...     'SepalWidthCm': [1.4, None, 1.3]
    ... })

    >>> imputer(data, strategy="constant", fill_value=0)
       SepalLengthCm  SepalWidthCm
    0            5.1           1.4
    1            4.9           0.0
    2            0.0           1.3
    """
    # Type checks come first so that the value checks below can rely on them.
    if not isinstance(df, pd.DataFrame):
        raise TypeError("The input dataframe must be of pd.DataFrame type")
    if not isinstance(strategy, str):
        raise TypeError("strategy must be of type str")
    if fill_value is not None and not isinstance(fill_value, numbers.Number):
        raise TypeError("fill_value must be of type None or numeric type")

    # One column-wise filler per supported strategy.
    column_fillers = {
        "mean": lambda column: column.fillna(column.mean()),
        "median": lambda column: column.fillna(column.median()),
        "most_frequent": _fill_with_most_frequent,
        "constant": lambda column: column.fillna(fill_value),
    }

    # Validate the strategy name before the fill_value consistency checks so
    # that an unknown strategy is always reported as such.
    # ValueError is a subclass of Exception, so callers that catch Exception
    # keep working.
    if strategy not in column_fillers:
        raise ValueError(
            "strategy should be one of 'mean', 'median', 'most_frequent' and 'constant'"
        )
    if strategy == "constant" and fill_value is None:
        raise ValueError("fill_value should be a number when strategy is 'constant'")
    if strategy != "constant" and fill_value is not None:
        raise ValueError("fill_value can be a number only if strategy is 'constant'")

    return df.apply(column_fillers[strategy], axis=0)


def _fill_with_most_frequent(column):
    """Fill the missing entries of *column* with its smallest most-frequent value."""
    # Series.mode ignores missing values and returns the modes in sorted
    # order, so the first entry is the smallest of any tied values.
    modes = column.mode(dropna=True)
    if modes.empty:
        # Nothing was observed in this column, so there is nothing to fill with.
        return column.copy()
    return column.fillna(modes.iloc[0])
4584
4685
def cor_map(dataframe, num_col, col_scheme="purpleorange"):
    """
    Draw a correlation heatmap, annotated with the coefficients, for the given numeric columns.

    Parameters
    ----------
    dataframe : pandas.core.frame.DataFrame
        The data frame to be used for EDA.
    num_col : list
        A list of strings naming the numeric columns of the data frame to correlate.
    col_scheme : str, default = 'purpleorange'
        The color scheme of the heatmap desired, can only be one of the following;
            - 'purpleorange'
            - 'blueorange'
            - 'redblue'

    Returns
    -------
    altair chart
        The heatmap layer combined (with ``+``) with a text layer that prints
        each coefficient with two decimals.

    Raises
    ------
    TypeError
        If dataframe is not a pandas DataFrame, num_col is not a list of str,
        or col_scheme is not a str.
    ValueError
        If col_scheme is not a supported scheme, a requested column does not
        exist in dataframe, or a requested column is not numeric.

    Examples
    ---------
    >>> import pandas as pd
    >>> from eda_utils_py import cor_map

    >>> data = pd.DataFrame({
    >>>     'SepalLengthCm':[5.1, 4.9, 4.7],
    >>>     'SepalWidthCm':[1.4, 1.4, 1.3],
    >>>     'PetalWidthCm':[0.2, 0.2, 0.2],
    >>>     'Species':['Iris-setosa','Iris-virginica', 'Iris-germanica']
    >>> })

    >>> numerical_columns = ['SepalLengthCm','SepalWidthCm','PetalWidthCm']
    >>> cor_map(data, numerical_columns, col_scheme = 'purpleorange')

    """
    _check_cor_map_arguments(dataframe, num_col, col_scheme)

    corr_matrix = _correlations_long_form(dataframe[num_col].corr())

    plot = (
        alt.Chart(corr_matrix)
        .mark_rect()
        .encode(
            x=alt.X("var1", title=None),
            y=alt.Y("var2", title=None),
            color=alt.Color("cor", legend=None, scale=alt.Scale(scheme=col_scheme)),
        )
        .properties(title="Correlation Matrix", width=400, height=400)
    )

    # The label colour switches to white when the coefficient is above 0.5 or
    # below -0.3, and is black otherwise.
    text = plot.mark_text(size=15).encode(
        text=alt.Text("cor", format=".2f"),
        color=alt.condition(
            "datum.cor > 0.5 | datum.cor < -0.3", alt.value("white"), alt.value("black")
        ),
    )

    return plot + text


def _check_cor_map_arguments(dataframe, num_col, col_scheme):
    """Raise TypeError/ValueError when the arguments of cor_map are unusable."""
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("The input dataframe must be of pd.DataFrame type")

    if not isinstance(num_col, list):
        raise TypeError("The input num_col must be of type list")

    if not all(isinstance(name, str) for name in num_col):
        raise TypeError("The type of values in num_col must all be str")

    if not isinstance(col_scheme, str):
        raise TypeError("col_scheme must be of type str")

    # ValueError is a subclass of Exception, so callers that catch Exception
    # keep working.
    if col_scheme not in ("purpleorange", "blueorange", "redblue"):
        raise ValueError(
            "This color scheme is not available, please use either 'purpleorange', 'blueorange' or 'redblue'"
        )

    # Every requested column must exist before any dtype is inspected.
    known_columns = set(dataframe.columns)
    if any(name not in known_columns for name in num_col):
        raise ValueError("The given column names must exist in the given dataframe.")

    if not all(is_numeric_dtype(dataframe[name]) for name in num_col):
        raise ValueError("The given numerical columns must all be numeric.")


def _correlations_long_form(corr):
    """Reshape a square correlation matrix into (var1, var2, cor) rows."""
    # Built positionally rather than through reset_index().melt("index"), so
    # the result does not depend on the name of the matrix's index or on a
    # column that happens to be called "index". Row order matches melt: all
    # rows for the first column, then all rows for the second, and so on.
    labels = list(corr.columns)
    records = [
        (row_label, col_label, corr.iat[row_pos, col_pos])
        for col_pos, col_label in enumerate(labels)
        for row_pos, row_label in enumerate(labels)
    ]
    return pd.DataFrame(records, columns=["var1", "var2", "cor"])
@@ -147,26 +187,26 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
147187 A function that identifies and deals with outliers based on the method the user chooses
148188
149189 Parameters
150- ----------
190+ ----------
151191 dataframe : pandas.core.frame.DataFrame
152192 The target dataframe where the function is performed.
153193 columns : list, default=None
154194 The target columns where the function needs to be performed. Default is None, in which case the function will check all columns
155195 method : string
156- The method of dealing with outliers.
196+ The method of dealing with outliers.
157197 - if "trim" : we completely remove data points that are outliers.
158198 - if "median" : we replace outliers with median values
159-
199+
160200 Returns
161201 -------
162202 pandas.core.frame.DataFrame
163203 a dataframe in which the outliers have already been processed by the chosen method
164-
204+
165205 Examples
166206 --------
167207 >>> import pandas as pd
168208 >>> from eda_utils_py import cor_map
169-
209+
170210 >>> data = pd.DataFrame({
171211 >>> 'SepalLengthCm':[5.1, 4.9, 4.7],
172212 >>> 'SepalWidthCm':[1.4, 1.4, 9999999.99],
@@ -200,7 +240,7 @@ def scale(dataframe, columns=None):
200240 --------
201241 >>> import pandas as pd
202242 >>> from eda_utils_py import scale
203-
243+
204244 >>> data = pd.DataFrame({
205245 >>> 'SepalLengthCm':[5.1, 4.9, 4.7],
206246 >>> 'SepalWidthCm':[1.4, 1.4, 1.3],
@@ -209,7 +249,7 @@ def scale(dataframe, columns=None):
209249 >>> })
210250
211251 >>> numerical_columns = ['SepalLengthCm','SepalWidthCm','PetalWidthCm']
212-
252+
213253 >>> scale(data, numerical_columns)
214254 """
215255 pass
0 commit comments