Skip to content

Commit 71debca

Browse files
authored
Merge pull request #26 from UBC-MDS/imputer
add test and implementation for Imputer function
2 parents 849a208 + 981cb8f commit 71debca

File tree

4 files changed

+229
-98
lines changed

4 files changed

+229
-98
lines changed

eda_utils_py/eda_utils_py.py

Lines changed: 95 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,36 @@
11
import pandas as pd
22
import altair as alt
33
from pandas.api.types import is_numeric_dtype
4+
import numbers
45

5-
def imputer(dataframe, strategy="mean", fill_value=None):
6+
7+
def imputer(df, strategy="mean", fill_value=None):
68
"""
79
A function to implement imputation functionality for completing missing values.
810
911
Parameters
1012
----------
11-
dataframe : pandas.core.frame.DataFrame
13+
df : pandas.core.frame.DataFrame
1214
a dataframe that might contain missing data
1315
strategy : string, default="mean"
1416
The imputation strategy.
15-
- If mean, then replace missing values using the mean along each column. Can only be used with numeric data.
16-
- If median, then replace missing values using the median along each column. Can only be used with numeric data.
17-
- If most_frequent, then replace missing using the most frequent value along each column. Can be used with strings or numeric data. If there is more than one such value, only the smallest is returned.
18-
- If constant, then replace missing values with fill_value. Can be used with strings or numeric data.
19-
fill_value : string or numerical value, default=None
20-
When strategy == constant, fill_value is used to replace all occurrences of missing_values. If left to the default, fill_value will be 0 when imputing numerical data and “missing_value” for strings or object data types.
21-
17+
- If "mean", then replace missing values using the mean along each column. Can only be used with numeric data.
18+
- If "median", then replace missing values using the median along each column. Can only be used with numeric data.
19+
- If "most_frequent", then replace missing values using the most frequent value along each column. Can be used with strings or numeric data. If there is more than one such value, only the smallest is returned.
20+
- If "constant", then replace missing values with fill_value. Can be used with strings or numeric data.
21+
fill_value : numerical value, default=None
22+
When strategy == "constant", fill_value is used to replace all occurrences of missing values and must be a number; leaving it as None raises an exception. For any other strategy, fill_value must be left as None.
23+
2224
Returns
2325
-------
2426
pandas.core.frame.DataFrame
2527
a dataframe that contains no missing data
26-
28+
2729
Examples
2830
---------
2931
>>> import pandas as pd
3032
>>> from eda_utils_py import imputer
31-
33+
3234
>>> data = pd.DataFrame({
3335
>>> 'SepalLengthCm':[5.1, 4.9, 4.7],
3436
>>> 'SepalWidthCm':[1.4, 1.4, 1.3],
@@ -41,24 +43,62 @@ def imputer(dataframe, strategy="mean", fill_value=None):
4143
1 4.9 1.4 0.2
4244
2 4.7 1.3 0.2
4345
"""
44-
pass
46+
47+
# Tests whether input data is of pd.DataFrame type
48+
if not isinstance(df, pd.DataFrame):
49+
raise TypeError("The input dataframe must be of pd.DataFrame type")
50+
51+
# Tests whether input strategy is of type str
52+
if not isinstance(strategy, str):
53+
raise TypeError("strategy must be of type str")
54+
55+
# Tests whether input fill_value is of type numbers or None
56+
if not isinstance(fill_value, type(None)) and not isinstance(
57+
fill_value, numbers.Number
58+
):
59+
raise TypeError("fill_value must be of type None or numeric type")
60+
61+
# Tests whether the inputs for strategy and fill_value are consistent
62+
if isinstance(fill_value, numbers.Number) and strategy != "constant":
63+
raise Exception("fill_value can be a number only if strategy is 'constant'")
64+
65+
# Tests whether the inputs for strategy and fill_value are consistent
66+
if isinstance(fill_value, type(None)) and strategy == "constant":
67+
raise Exception("fill_value should be a number when strategy is 'constant'")
68+
69+
result = pd.DataFrame()
70+
if strategy == "mean":
71+
result = df.apply(lambda x: x.fillna(x.mean()), axis=0)
72+
elif strategy == "median":
73+
result = df.apply(lambda x: x.fillna(x.median()), axis=0)
74+
elif strategy == "most_frequent":
75+
result = df.apply(lambda x: x.fillna(x.value_counts().index[0]), axis=0)
76+
elif strategy == "constant":
77+
result = df.apply(lambda x: x.fillna(fill_value))
78+
else:
79+
raise Exception(
80+
"strategy should be one of 'mean', 'median', 'most_frequent' and 'constant'"
81+
)
82+
83+
return result
4584

4685

47-
def cor_map(dataframe, num_col, col_scheme = 'purpleorange'):
48-
86+
87+
def cor_map(dataframe, num_col, col_scheme="purpleorange"):
88+
4989
"""
5090
A function to implement a correlation heatmap including coefficients based on given numeric columns of a data frame.
5191
5292
Parameters
5393
----------
5494
dataframe : pandas.core.frame.DataFrame
5595
The data frame to be used for EDA.
56-
num_col : list
96+
num_col : list
5797
A list of strings naming the columns of the data frame that contain numeric data.
58-
col_scheme : str, default = 'purpleorange'
98+
col_scheme : str, default = 'purpleorange'
5999
The desired color scheme of the heatmap; must be one of the following:
60100
- 'purpleorange'
61-
- 'blueorange'
101+
- 'blueorange'
62102
- 'redblue'
63103
64104
Returns
@@ -70,73 +110,73 @@ def cor_map(dataframe, num_col, col_scheme = 'purpleorange'):
70110
---------
71111
>>> import pandas as pd
72112
>>> from eda_utils_py import cor_map
73-
113+
74114
>>> data = pd.DataFrame({
75115
>>> 'SepalLengthCm':[5.1, 4.9, 4.7],
76116
>>> 'SepalWidthCm':[1.4, 1.4, 1.3],
77117
>>> 'PetalWidthCm':[0.2, 0.2, 0.2],
78118
>>> 'Species':['Iris-setosa','Iris-virginica', 'Iris-germanica']
79119
>>> })
80120
81-
>>> numerical_columns = ['SepalLengthCm','SepalWidthCm','PetalWidthCm']
121+
>>> numerical_columns = ['SepalLengthCm','SepalWidthCm','PetalWidthCm']
82122
>>> cor_map(data, numerical_columns, col_scheme = 'purpleorange')
83-
123+
84124
"""
85-
125+
86126
# Tests whether input data is of pd.DataFrame type
87127
if not isinstance(dataframe, pd.DataFrame):
88128
raise TypeError("The input dataframe must be of pd.DataFrame type")
89129

90130
# Tests whether input num_col is of type list
91131
if not isinstance(num_col, list):
92132
raise TypeError("The input num_col must be of type list")
93-
94-
# Tests whether values of num_col is of type str
133+
134+
# Tests whether all values of num_col are of type str
95135
for x in num_col:
96136
if not isinstance(x, str):
97137
raise TypeError("The type of values in num_col must all be str")
98138

99-
# Tests whether input col_scheme is of type str
139+
# Tests whether input col_scheme is of type str
100140
if not isinstance(col_scheme, str):
101141
raise TypeError("col_scheme must be of type str")
102142

103-
# Tests whether col_scheme is one of three possible options
104-
if col_scheme not in ('purpleorange', 'blueorange', 'redblue'):
105-
raise Exception("This color scheme is not available, please use either 'purpleorange', 'blueorange' or 'redblue'")
143+
# Tests whether col_scheme is one of three possible options
144+
if col_scheme not in ("purpleorange", "blueorange", "redblue"):
145+
raise Exception(
146+
"This color scheme is not available, please use either 'purpleorange', 'blueorange' or 'redblue'"
147+
)
106148

107149
# Tests whether all input columns exist in the input data
108150
for x in num_col:
109151
if x not in list(dataframe.columns):
110-
raise Exception("The given column names must exist in the given dataframe.")
111-
152+
raise Exception("The given column names must exist in the given dataframe.")
153+
112154
# Tests whether all input columns in num_col are numeric columns
113155
for x in num_col:
114156
if not is_numeric_dtype(dataframe[x]):
115157
raise Exception("The given numerical columns must all be numeric.")
116-
117-
118-
corr_matrix = dataframe[num_col].corr().reset_index().melt('index')
119-
corr_matrix.columns = ['var1', 'var2', 'cor']
120-
121-
plot = alt.Chart(corr_matrix).mark_rect().encode(
122-
x=alt.X('var1', title=None),
123-
y=alt.Y('var2', title=None),
124-
color=alt.Color('cor',legend=None,
125-
scale = alt.Scale(scheme = col_scheme)),
126-
).properties(
127-
title = 'Correlation Matrix',
128-
width=400, height=400
158+
159+
corr_matrix = dataframe[num_col].corr().reset_index().melt("index")
160+
corr_matrix.columns = ["var1", "var2", "cor"]
161+
162+
plot = (
163+
alt.Chart(corr_matrix)
164+
.mark_rect()
165+
.encode(
166+
x=alt.X("var1", title=None),
167+
y=alt.Y("var2", title=None),
168+
color=alt.Color("cor", legend=None, scale=alt.Scale(scheme=col_scheme)),
169+
)
170+
.properties(title="Correlation Matrix", width=400, height=400)
129171
)
130172

131173
text = plot.mark_text(size=15).encode(
132-
text=alt.Text('cor', format=".2f"),
174+
text=alt.Text("cor", format=".2f"),
133175
color=alt.condition(
134-
"datum.cor > 0.5 | datum.cor < -0.3",
135-
alt.value('white'),
136-
alt.value('black')
137-
)
176+
"datum.cor > 0.5 | datum.cor < -0.3", alt.value("white"), alt.value("black")
177+
),
138178
)
139-
179+
140180
cor_heatmap = plot + text
141181

142182
return cor_heatmap
@@ -147,26 +187,26 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
147187
A function that identifies and deals with outliers based on the method the user chooses.
148188
149189
Parameters
150-
----------
190+
----------
151191
dataframe : pandas.core.frame.DataFrame
152192
The target dataframe where the function is performed.
153193
columns : list, default=None
154194
The target columns on which the function is performed. Default is None, in which case the function checks all columns.
155195
method : string
156-
The method of dealing with outliers.
196+
The method of dealing with outliers.
157197
- if "trim" : we completely remove data points that are outliers.
158198
- if "median" : we replace outliers with median values
159-
199+
160200
Returns
161201
-------
162202
pandas.core.frame.DataFrame
163203
a dataframe in which the outliers have already been processed by the chosen method
164-
204+
165205
Examples
166206
--------
167207
>>> import pandas as pd
168208
>>> from eda_utils_py import outlier_identifier
169-
209+
170210
>>> data = pd.DataFrame({
171211
>>> 'SepalLengthCm':[5.1, 4.9, 4.7],
172212
>>> 'SepalWidthCm':[1.4, 1.4, 9999999.99],
@@ -200,7 +240,7 @@ def scale(dataframe, columns=None):
200240
--------
201241
>>> import pandas as pd
202242
>>> from eda_utils_py import scale
203-
243+
204244
>>> data = pd.DataFrame({
205245
>>> 'SepalLengthCm':[5.1, 4.9, 4.7],
206246
>>> 'SepalWidthCm':[1.4, 1.4, 1.3],
@@ -209,7 +249,7 @@ def scale(dataframe, columns=None):
209249
>>> })
210250
211251
>>> numerical_columns = ['SepalLengthCm','SepalWidthCm','PetalWidthCm']
212-
252+
213253
>>> scale(data, numerical_columns)
214254
"""
215255
pass

poetry.lock

Lines changed: 9 additions & 9 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ license = "MIT"
88
[tool.poetry.dependencies]
99
python = "^3.8"
1010
pandas = "^1.2.2"
11+
altair = "^4.1.0"
1112

1213
[tool.poetry.dev-dependencies]
1314
Sphinx = "^3.5.1"

0 commit comments

Comments
 (0)