Skip to content

Commit 67363ce

Browse files
committed
add my function
1 parent be7c11f commit 67363ce

File tree

2 files changed

+488
-56
lines changed

2 files changed

+488
-56
lines changed

eda_utils_py/eda_utils_py.py

Lines changed: 253 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1,61 +1,104 @@
1-
import pandas as pd
21
import altair as alt
2+
import pandas as pd
33
from pandas.api.types import is_numeric_dtype
4+
import numbers
45
import numpy as np
56

67

7-
def imputer(dataframe, strategy="mean", fill_value=None):
8+
def imputer(df, strategy="mean", fill_value=None):
    """
    A function to implement imputation functionality for completing missing values.

    Parameters
    ----------
    df : pandas.core.frame.DataFrame
        a dataframe that might contain missing data
    strategy : string, default="mean"
        The imputation strategy.
        - If "mean", then replace missing values using the mean along each column. Can only be used with numeric data.
        - If "median", then replace missing values using the median along each column. Can only be used with numeric data.
        - If "most_frequent", then replace missing using the most frequent value along each column. Can be used with strings or numeric data. If there is more than one such value, only the smallest is used.
        - If "constant", then replace missing values with fill_value. Can be used with strings or numeric data.
    fill_value : numerical value, default=None
        The replacement value used when strategy == "constant". It must be a
        number when strategy is "constant" and must be left as None for every
        other strategy.

    Returns
    -------
    pandas.core.frame.DataFrame
        a new dataframe in which missing values have been imputed; the input
        dataframe is not modified. A column without any observed value has
        nothing to impute from under "mean", "median" and "most_frequent" and
        is returned unchanged.

    Raises
    ------
    TypeError
        If df is not a pandas DataFrame, strategy is not a str, or fill_value
        is neither None nor a number.
    ValueError
        If fill_value and strategy are inconsistent, or strategy is not one of
        the supported options.

    Examples
    ---------
    >> import pandas as pd
    >> from eda_utils_py import imputer

    >> data = pd.DataFrame({
    >>     'SepalLengthCm':[5.1, 4.9, 4.7],
    >>     'SepalWidthCm':[1.4, 1.4, 1.3],
    >>     'PetalWidthCm':[0.2, None, 0.2]
    >> })

    >> imputer(data, strategy="mean")
       SepalLengthCm  SepalWidthCm  PetalWidthCm
    0            5.1           1.4           0.2
    1            4.9           1.4           0.2
    2            4.7           1.3           0.2
    """

    # Tests whether input data is of pd.DataFrame type
    if not isinstance(df, pd.DataFrame):
        raise TypeError("The input dataframe must be of pd.DataFrame type")

    # Tests whether input strategy is of type str
    if not isinstance(strategy, str):
        raise TypeError("strategy must be of type str")

    # Tests whether input fill_value is of type numbers or None
    if fill_value is not None and not isinstance(fill_value, numbers.Number):
        raise TypeError("fill_value must be of type None or numeric type")

    # Tests whether the inputs for strategy and fill_value are consistent
    if fill_value is not None and strategy != "constant":
        raise ValueError("fill_value can be a number only if strategy is 'constant'")
    if fill_value is None and strategy == "constant":
        raise ValueError("fill_value should be a number when strategy is 'constant'")

    def _fill_most_frequent(column):
        # Series.mode() ignores missing values and returns the modes sorted,
        # so the first entry is the smallest of the tied values. It is empty
        # when the column has no observed value, in which case there is
        # nothing to impute with.
        modes = column.mode()
        if modes.empty:
            return column
        return column.fillna(modes.iloc[0])

    # One column-wise filler per supported strategy
    fillers = {
        "mean": lambda column: column.fillna(column.mean()),
        "median": lambda column: column.fillna(column.median()),
        "most_frequent": _fill_most_frequent,
        "constant": lambda column: column.fillna(fill_value),
    }

    if strategy not in fillers:
        raise ValueError(
            "strategy should be one of 'mean', 'median', 'most_frequent' and 'constant'"
        )

    return df.apply(fillers[strategy], axis=0)
4885

49-
def cor_map(dataframe, num_col):
86+
87+
def cor_map(dataframe, num_col, col_scheme="purpleorange"):
    """
    A function to implement a correlation heatmap including coefficients based on given numeric columns of a data frame.

    Parameters
    ----------
    dataframe : pandas.core.frame.DataFrame
        The data frame to be used for EDA.
    num_col : list
        A list of string of column names with numeric data from the data frame.
    col_scheme : str, default = 'purpleorange'
        The color scheme of the heatmap desired, can only be one of the following;
        - 'purpleorange'
        - 'blueorange'
        - 'redblue'

    Returns
    -------
    altair chart
        The heatmap rectangles layered with a text label showing each
        correlation coefficient rounded to two decimals.

    Raises
    ------
    TypeError
        If dataframe is not a pandas DataFrame, num_col is not a list of str,
        or col_scheme is not a str.
    ValueError
        If col_scheme is not one of the supported schemes, a name in num_col
        is not a column of dataframe, or a selected column is not numeric.

    Examples
    ---------
    >> import pandas as pd
    >> from eda_utils_py import cor_map

    >> data = pd.DataFrame({
    >>     'SepalLengthCm':[5.1, 4.9, 4.7],
    >>     'SepalWidthCm':[1.4, 1.4, 1.3],
    >>     'PetalWidthCm':[0.2, 0.2, 0.2],
    >>     'Species':['Iris-setosa','Iris-virginica', 'Iris-germanica']
    >> })

    >> numerical_columns = ['SepalLengthCm','SepalWidthCm','PetalWidthCm']
    >> cor_map(data, numerical_columns, col_scheme = 'purpleorange')
    """

    # Tests whether input data is of pd.DataFrame type
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("The input dataframe must be of pd.DataFrame type")

    # Tests whether input num_col is of type list
    if not isinstance(num_col, list):
        raise TypeError("The input num_col must be of type list")

    # Tests whether values of num_col is of type str
    if not all(isinstance(name, str) for name in num_col):
        raise TypeError("The type of values in num_col must all be str")

    # Tests whether input col_scheme is of type str
    if not isinstance(col_scheme, str):
        raise TypeError("col_scheme must be of type str")

    # Tests whether col_scheme is one of three possible options
    if col_scheme not in ("purpleorange", "blueorange", "redblue"):
        raise ValueError(
            "This color scheme is not available, please use either 'purpleorange', 'blueorange' or 'redblue'"
        )

    # Tests whether all input columns exist in the input data. This pass runs
    # over every name before any dtype is inspected, so a missing column is
    # always reported ahead of a non-numeric one.
    for name in num_col:
        if name not in dataframe.columns:
            raise ValueError("The given column names must exist in the given dataframe.")

    # Tests whether all input columns in num_col are numeric columns
    for name in num_col:
        if not is_numeric_dtype(dataframe[name]):
            raise ValueError("The given numerical columns must all be numeric.")

    # Long-form correlation table: one row per (var1, var2) pair
    corr_matrix = dataframe[num_col].corr().reset_index().melt("index")
    corr_matrix.columns = ["var1", "var2", "cor"]

    plot = (
        alt.Chart(corr_matrix)
        .mark_rect()
        .encode(
            x=alt.X("var1", title=None),
            y=alt.Y("var2", title=None),
            color=alt.Color("cor", legend=None, scale=alt.Scale(scheme=col_scheme)),
        )
        .properties(title="Correlation Matrix", width=400, height=400)
    )

    # Coefficient labels switch to white text on the cells selected by the
    # condition below and stay black elsewhere.
    text = plot.mark_text(size=15).encode(
        text=alt.Text("cor", format=".2f"),
        color=alt.condition(
            "datum.cor > 0.5 | datum.cor < -0.3", alt.value("white"), alt.value("black")
        ),
    )

    cor_heatmap = plot + text

    return cor_heatmap
82181

83182

84183
def outlier_identifier(dataframe, columns=None, method="trim"):
@@ -174,36 +273,138 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
174273
return df
175274

176275

177-
def scale(dataframe, columns=None):
276+
def scale(dataframe, columns=None, scaler="standard"):
    """
    A function to scale features either by using standard scaler or minmax scaler method

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The data frame to be used for EDA.
    columns : list, default=None
        A list of string of column names with numeric data from the data frame that we wish to scale.
        If None, every numeric column of the data frame is scaled.
    scaler: str, default="standard"
        A string to specify the scaling method to be used
        - if "standard": it transforms features by centering the distribution of the data on the value 0 and the standard
        deviation to the value 1.
        - if "minmax": it transforms features by rescaling each feature to the range between 0 and 1.

    Returns
    -------
    dataframe : pandas.core.frame.DataFrame
        A new data frame holding only the selected columns, scaled. The input
        data frame is not modified. A column whose observed values are all
        equal carries no spread to scale by and is returned as zeros.

    Raises
    ------
    TypeError
        If dataframe is not a pandas DataFrame, columns is neither None nor a
        list of str, or scaler is not a str.
    ValueError
        If a name in columns is not a column of dataframe, a selected column
        is not numeric, or scaler is not "standard" or "minmax".

    Examples
    --------
    >> import pandas as pd
    >> from eda_utils_py import scale

    >> data = pd.DataFrame({
    >>     'SepalLengthCm':[5.1, 4.9, 4.7],
    >>     'SepalWidthCm':[1.4, 1.4, 1.3],
    >>     'PetalWidthCm':[0.2, 0.2, 0.2],
    >>     'Species':['Iris-setosa','Iris-virginica', 'Iris-germanica']
    >> })

    >> numerical_columns = ['SepalLengthCm','SepalWidthCm','PetalWidthCm']

    >> scale(data, numerical_columns, scaler="minmax")
    """

    # Check if input data is of pd.DataFrame type
    if not isinstance(dataframe, pd.DataFrame):
        raise TypeError("The input dataframe must be of pd.DataFrame type")

    if columns is None:
        # The documented default: fall back to every numeric column
        columns = list(dataframe.select_dtypes(include="number").columns)
    else:
        # Check if input columns is of type list
        if not isinstance(columns, list):
            raise TypeError("The input columns must be of type list")

        # Check if values of columns are of type str
        if not all(isinstance(col, str) for col in columns):
            raise TypeError("The name of features in columns list must all be str")

        # Check if all input columns exist in the input data
        for col in columns:
            if col not in dataframe.columns:
                raise ValueError(
                    "The given column names must exist in the given dataframe."
                )

        # Check if all input columns are numeric columns
        for col in columns:
            if not is_numeric_dtype(dataframe[col]):
                raise ValueError("The given numerical columns must all be numeric.")

    # Check if scaler is of type str
    if not isinstance(scaler, str):
        raise TypeError("Scaler must be of type str")

    # Reject unknown names instead of silently applying standard scaling
    if scaler not in ("standard", "minmax"):
        raise ValueError("scaler should be one of 'standard' and 'minmax'")

    transform = _minmax if scaler == "minmax" else _standardize
    return transform(dataframe[columns])


def _standardize(dataframe):
    """Transform features by centering the distribution of the data
    on the value 0 and the standard deviation to the value 1.

    The transformation is given by:

        scaled_value = (value - mean) / standard deviation

    where the standard deviation is the one returned by pandas
    ``Series.std()``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The data frame to be used for EDA.

    Returns
    -------
    pandas.DataFrame
        A scaled copy of the input. A column whose observed values are all
        equal is mapped to 0.0 rather than divided by a zero deviation.
    """
    res = dataframe.copy()
    for feature_name in dataframe.columns:
        feature = dataframe[feature_name]
        lowest = feature.min()
        if lowest == feature.max():
            # Constant feature: comparing min and max is exact, whereas the
            # computed deviation may be a tiny non-zero float. Subtracting
            # keeps missing entries missing.
            res[feature_name] = (feature - lowest).astype(float)
        else:
            res[feature_name] = (feature - feature.mean()) / feature.std()
    return res


def _minmax(dataframe):
    """Transform features by rescaling each feature to the range between 0 and 1.

    The transformation is given by:

        scaled_value = (feature_value - min) / (max - min)

    where min and max are the smallest and largest observed values of the
    feature.

    This transformation is often used as an alternative to zero mean,
    unit variance scaling.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The data frame to be used for EDA.

    Returns
    -------
    pandas.DataFrame
        A scaled copy of the input. A column whose observed values are all
        equal is mapped to 0.0 rather than divided by a zero range.
    """
    res = dataframe.copy()
    for feature_name in dataframe.columns:
        feature = dataframe[feature_name]
        lowest = feature.min()
        highest = feature.max()
        if lowest == highest:
            # Constant feature: avoid the 0 / 0 division
            res[feature_name] = (feature - lowest).astype(float)
        else:
            res[feature_name] = (feature - lowest) / (highest - lowest)
    return res

0 commit comments

Comments
 (0)