Skip to content

Commit 325b7b0

Browse files
authored
Merge pull request #27 from UBC-MDS/implemennt-scale
First iteration of scale function
2 parents 71debca + 8405436 commit 325b7b0

File tree

2 files changed

+226
-60
lines changed

2 files changed

+226
-60
lines changed

eda_utils_py/eda_utils_py.py

Lines changed: 150 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
import pandas as pd
21
import altair as alt
2+
import pandas as pd
33
from pandas.api.types import is_numeric_dtype
44
import numbers
55

@@ -28,16 +28,16 @@ def imputer(df, strategy="mean", fill_value=None):
2828
2929
Examples
3030
---------
31-
>>> import pandas as pd
32-
>>> from eda_utils_py import cor_map
31+
>> import pandas as pd
32+
>> from eda_utils_py import imputer
3333
34-
>>> data = pd.DataFrame({
35-
>>> 'SepalLengthCm':[5.1, 4.9, 4.7],
36-
>>> 'SepalWidthCm':[1.4, 1.4, 1.3],
37-
>>> 'PetalWidthCm':[0.2, None, 0.2]
38-
>>> })
34+
>> data = pd.DataFrame({
35+
>> 'SepalLengthCm':[5.1, 4.9, 4.7],
36+
>> 'SepalWidthCm':[1.4, 1.4, 1.3],
37+
>> 'PetalWidthCm':[0.2, None, 0.2]
38+
>> })
3939
40-
>>> imputer(data, numerical_columns)
40+
>> imputer(data, numerical_columns)
4141
SepalLengthCm SepalWidthCm PetalWidthCm
4242
0 5.1 1.4 0.2
4343
1 4.9 1.4 0.2
@@ -54,7 +54,7 @@ def imputer(df, strategy="mean", fill_value=None):
5454

5555
# Tests whether input fill_value is of type numbers or None
5656
if not isinstance(fill_value, type(None)) and not isinstance(
57-
fill_value, numbers.Number
57+
fill_value, numbers.Number
5858
):
5959
raise TypeError("fill_value must be of type None or numeric type")
6060

@@ -83,9 +83,7 @@ def imputer(df, strategy="mean", fill_value=None):
8383
return result
8484

8585

86-
8786
def cor_map(dataframe, num_col, col_scheme="purpleorange"):
88-
8987
"""
9088
A function to implement a correlation heatmap including coefficients based on given numeric columns of a data frame.
9189
@@ -108,19 +106,18 @@ def cor_map(dataframe, num_col, col_scheme="purpleorange"):
108106
109107
Examples
110108
---------
111-
>>> import pandas as pd
112-
>>> from eda_utils_py import cor_map
113-
114-
>>> data = pd.DataFrame({
115-
>>> 'SepalLengthCm':[5.1, 4.9, 4.7],
116-
>>> 'SepalWidthCm':[1.4, 1.4, 1.3],
117-
>>> 'PetalWidthCm':[0.2, 0.2, 0.2],
118-
>>> 'Species':['Iris-setosa','Iris-virginica', 'Iris-germanica']
119-
>>> })
120-
121-
>>> numerical_columns = ['SepalLengthCm','SepalWidthCm','PetalWidthCm']
122-
>>> cor_map(data, numerical_columns, col_scheme = 'purpleorange')
123-
109+
>> import pandas as pd
110+
>> from eda_utils_py import cor_map
111+
112+
>> data = pd.DataFrame({
113+
>> 'SepalLengthCm':[5.1, 4.9, 4.7],
114+
>> 'SepalWidthCm':[1.4, 1.4, 1.3],
115+
>> 'PetalWidthCm':[0.2, 0.2, 0.2],
116+
>> 'Species':['Iris-setosa','Iris-virginica', 'Iris-germanica']
117+
>> })
118+
119+
>> numerical_columns = ['SepalLengthCm','SepalWidthCm','PetalWidthCm']
120+
>> cor_map(data, numerical_columns, col_scheme = 'purpleorange')
124121
"""
125122

126123
# Tests whether input data is of pd.DataFrame type
@@ -161,13 +158,13 @@ def cor_map(dataframe, num_col, col_scheme="purpleorange"):
161158

162159
plot = (
163160
alt.Chart(corr_matrix)
164-
.mark_rect()
165-
.encode(
161+
.mark_rect()
162+
.encode(
166163
x=alt.X("var1", title=None),
167164
y=alt.Y("var2", title=None),
168165
color=alt.Color("cor", legend=None, scale=alt.Scale(scheme=col_scheme)),
169166
)
170-
.properties(title="Correlation Matrix", width=400, height=400)
167+
.properties(title="Correlation Matrix", width=400, height=400)
171168
)
172169

173170
text = plot.mark_text(size=15).encode(
@@ -204,52 +201,154 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
204201
205202
Examples
206203
--------
207-
>>> import pandas as pd
208-
>>> from eda_utils_py import cor_map
204+
>> import pandas as pd
205+
>> from eda_utils_py import outlier_identifier
209206
210-
>>> data = pd.DataFrame({
211-
>>> 'SepalLengthCm':[5.1, 4.9, 4.7],
212-
>>> 'SepalWidthCm':[1.4, 1.4, 9999999.99],
213-
>>> 'PetalWidthCm:[0.2, 0.2, 0.2],
214-
>>> 'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica']
215-
>>> })
207+
>> data = pd.DataFrame({
208+
>> 'SepalLengthCm':[5.1, 4.9, 4.7],
209+
>> 'SepalWidthCm':[1.4, 1.4, 9999999.99],
210+
>> 'PetalWidthCm':[0.2, 0.2, 0.2],
211+
>> 'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica']
212+
>> })
216213
217-
>>> outlier_identifier(data)
214+
>> outlier_identifier(data)
218215
219216
"""
220217
pass
221218

222219

223-
def scale(dataframe, columns=None):
220+
def scale(dataframe, columns=None, scaler="standard"):
    """
    Scale numeric features with either standard or min-max scaling.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The data frame to be used for EDA.
    columns : list, default=None
        A list of column names (str) holding numeric data that should be
        scaled. A list must be supplied: the ``None`` default is rejected
        with a ``TypeError``.
    scaler : str, default="standard"
        The scaling method to use.
        - "standard": centre each feature on 0 and scale it to a standard
          deviation of 1.
        - "minmax": rescale each feature to the range between 0 and 1.

    Returns
    -------
    dataframe : pandas.core.frame.DataFrame
        A new data frame holding only the requested columns, scaled.

    Raises
    ------
    TypeError
        If ``dataframe`` is not a pandas DataFrame, ``columns`` is not a
        list of str, or ``scaler`` is not a str.
    Exception
        If a requested column is missing from ``dataframe`` or is not
        numeric.
    ValueError
        If ``scaler`` is neither "standard" nor "minmax".

    Examples
    --------
    >>> import pandas as pd
    >>> from eda_utils_py import scale

    >>> data = pd.DataFrame({
    ...     'SepalLengthCm': [5.1, 4.9, 4.7],
    ...     'SepalWidthCm': [1.4, 1.4, 1.3],
    ...     'PetalWidthCm': [0.2, 0.2, 0.2],
    ...     'Species': ['Iris-setosa', 'Iris-virginica', 'Iris-germanica']
    ... })

    >>> numerical_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalWidthCm']

    >>> scale(data, numerical_columns, scaler="minmax")
    """

    # Check if input data is of pd.DataFrame type
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("The input dataframe must be of pd.DataFrame type")

    # Check if input columns is of type list
    if not isinstance(columns, list):
        raise TypeError("The input columns must be of type list")

    # Check if values of columns are of type str
    if not all(isinstance(col, str) for col in columns):
        raise TypeError("The name of features in columns list must all be str")

    # Check if all input columns exist in the input data
    available = set(dataframe.columns)
    if any(col not in available for col in columns):
        raise Exception("The given column names must exist in the given dataframe.")

    # Check if all input columns are numeric columns
    if not all(is_numeric_dtype(dataframe[col]) for col in columns):
        raise Exception("The given numerical columns must all be numeric.")

    # Check if scaler is of type str
    if not isinstance(scaler, str):
        raise TypeError("Scaler must be of type str")

    # Reject unknown methods instead of silently falling back to "standard",
    # which would hide typos such as "minmx".
    if scaler not in ("standard", "minmax"):
        raise ValueError("Scaler must be either 'standard' or 'minmax'")

    if scaler == "minmax":
        return _minmax(dataframe[columns])
    return _standardize(dataframe[columns])
301+
302+
303+
def _standardize(dataframe):
    """Centre each feature on 0 and scale it to a standard deviation of 1.

    The transformation is given by:

        scaled_value = (value - mean) / standard deviation

    where the standard deviation is the pandas default (sample) estimate.

    A column with at most one distinct value has no spread to normalise;
    dividing by its (zero, undefined, or round-off-sized) standard deviation
    would produce NaN or noise, so such a column is mapped to 0.0 instead.
    Missing values stay missing.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The data frame to be used for EDA.

    Returns
    -------
    pandas.DataFrame
        A scaled copy of ``dataframe``; the input is not modified.
    """
    scaled = dataframe.copy()
    for feature_name in dataframe.columns:
        column = dataframe[feature_name]
        if column.nunique() <= 1:
            # Constant (or empty / all-missing) feature: centre only.
            scaled[feature_name] = column.astype(float).where(column.isna(), 0.0)
        else:
            scaled[feature_name] = (column - column.mean()) / column.std()
    return scaled
326+
327+
328+
def _minmax(dataframe):
    """Rescale each feature to the range between 0 and 1.

    The transformation is given by:

        scaled_value = (feature_value - min) / (max - min)

    where min and max are the smallest and largest values of the feature.

    This transformation is often used as an alternative to zero mean,
    unit variance scaling.

    A constant column has ``max == min``; dividing by that zero range would
    produce NaN, so such a column is mapped to 0.0 instead. Missing values
    stay missing.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The data frame to be used for EDA.

    Returns
    -------
    pandas.DataFrame
        A scaled copy of ``dataframe``; the input is not modified.
    """
    scaled = dataframe.copy()
    for feature_name in dataframe.columns:
        column = dataframe[feature_name]
        lowest = column.min()
        span = column.max() - lowest
        if span == 0:
            # Constant feature: avoid 0 / 0 and pin it to the lower bound.
            scaled[feature_name] = column.astype(float).where(column.isna(), 0.0)
        else:
            scaled[feature_name] = (column - lowest) / span
    return scaled

0 commit comments

Comments
 (0)