|
1 | | -def cor_map(dataframe, num_col): |
| 1 | +def imputer(dataframe, strategy="mean", fill_value=None): |
2 | 2 | """ |
3 | | - A function to implement a correlation heatmap including coefficients based on given numeric columns of a data frame. |
4 | | -
|
5 | | - Args: |
6 | | - dataframe (pandas.DataFrame): The data frame to be used for EDA. |
7 | | - num_col (list): A list of string of column names with numeric data from the data frame. |
8 | | -
|
9 | | - Returns: |
10 | | - (altair): A correlation heatmap plot with correlation coefficient labels based on the numeric columns specified by user. |
| 3 | + A function to implement imputation functionality for completing missing values. |
| 4 | +
|
| 5 | + Parameters |
| 6 | + ---------- |
| 7 | + dataframe : pandas.core.frame.DataFrame |
| 8 | + a dataframe that might contain missing data |
| 9 | + strategy : string, default="mean" |
| 10 | + The imputation strategy. |
| 11 | + - If “mean”, then replace missing values using the mean along each column. Can only be used with numeric data. |
| 12 | + - If “median”, then replace missing values using the median along each column. Can only be used with numeric data. |
| 13 | + - If “most_frequent”, then replace missing values using the most frequent value along each column. Can be used with strings or numeric data. If there is more than one such value, only the smallest is returned. |
| 14 | + - If “constant”, then replace missing values with fill_value. Can be used with strings or numeric data. |
| 15 | + fill_value : string or numerical value, default=None |
| 16 | + When strategy == “constant”, fill_value is used to replace all occurrences of missing values. If left to the default, fill_value will be 0 when imputing numerical data and “missing_value” for strings or object data types. |
| 17 | + |
| 18 | + Returns |
| 19 | + ------- |
| 20 | + pandas.core.frame.DataFrame |
| 21 | + a dataframe that contains no missing data |
| 22 | + |
| 23 | + Examples |
| 24 | + -------- |
| 25 | + >>> import pandas as pd |
| 26 | + >>> from eda_utils_py import imputer |
| 27 | + |
| 28 | + >>> data = pd.DataFrame({ |
| 29 | + >>> 'SepalLengthCm':[5.1, 4.9, 4.7], |
| 30 | + >>> 'SepalWidthCm':[1.4, 1.4, 1.3], |
| 31 | + >>> 'PetalWidthCm':[0.2, None, 0.2] |
| 32 | + >>> }) |
| 33 | +
|
| 34 | + >>> imputer(data) |
| 35 | + SepalLengthCm SepalWidthCm PetalWidthCm |
| 36 | + 0 5.1 1.4 0.2 |
| 37 | + 1 4.9 1.4 0.2 |
| 38 | + 2 4.7 1.3 0.2 |
| 39 | + """ |
| 40 | + pass |
11 | 41 |
|
12 | | - Examples: |
13 | | - import pandas as pd |
14 | | - from eda_utils_py import cor_map |
15 | 42 |
|
16 | | - data = pd.DataFrame({ |
17 | | - 'SepalLengthCm':[5.1, 4.9, 4.7], |
18 | | - 'SepalWidthCm':[1.4, 1.4, 1.3], |
19 | | - 'PetalWidthCm:[0.2, 0.2, 0.2], |
20 | | - 'Species':['Iris-setosa','Iris-virginica'] |
21 | | - }) |
| 43 | +def cor_map(dataframe, num_col): |
| 44 | + """ |
| 45 | + A function to implement a correlation heatmap including coefficients based on given numeric columns of a data frame. |
22 | 46 |
|
23 | | - numerical_columns = ['SepalLengthCm','SepalWidthCm','PetalWidthCm'] |
24 | | - |
25 | | - cor_map(data, numerical_columns) |
| 47 | + Parameters |
| 48 | + ---------- |
| 49 | + dataframe : pandas.core.frame.DataFrame |
| 50 | + The data frame to be used for EDA. |
| 51 | + num_col : list |
| 52 | + A list of strings of column names with numeric data from the data frame. |
| 53 | +
|
| 54 | + Returns |
| 55 | + ------- |
| 56 | + altair.vegalite.v4.api.Chart |
| 57 | + A correlation heatmap plot with correlation coefficient labels based on the numeric columns specified by user. |
| 58 | +
|
| 59 | + Examples |
| 60 | + -------- |
| 61 | + >>> import pandas as pd |
| 62 | + >>> from eda_utils_py import cor_map |
| 63 | + |
| 64 | + >>> data = pd.DataFrame({ |
| 65 | + >>> 'SepalLengthCm':[5.1, 4.9, 4.7], |
| 66 | + >>> 'SepalWidthCm':[1.4, 1.4, 1.3], |
| 67 | + >>> 'PetalWidthCm':[0.2, 0.2, 0.2], |
| 68 | + >>> 'Species':['Iris-setosa','Iris-virginica', 'Iris-germanica'] |
| 69 | + >>> }) |
| 70 | +
|
| 71 | + >>> numerical_columns = ['SepalLengthCm','SepalWidthCm','PetalWidthCm'] |
| 72 | + >>> cor_map(data, numerical_columns) |
26 | 73 | |
27 | 74 | """ |
| 75 | + pass |
28 | 76 |
|
29 | 77 |
|
30 | | -def outlier_identifier(dataframe, columns=None, method="somefunction"): |
| 78 | +def outlier_identifier(dataframe, columns=None, method="trim"): |
31 | 79 | """ |
32 | 80 | A function that identifies and deals with outliers based on the method the user chooses |
33 | 81 |
|
34 | | - Key arguments: |
35 | | - dataframe [pandas.DataFrame]: |
36 | | - The target dataframe where the function is performed. |
37 | | - columns [list] : None |
38 | | - The target columns where the function needed to be performed. Defualt is None, the function will check all columns |
39 | | - method [string] : "somefunction" |
40 | | - The method of dealing with outliers. |
| 82 | + Parameters |
| 83 | + ---------- |
| 84 | + dataframe : pandas.core.frame.DataFrame |
| 85 | + The target dataframe where the function is performed. |
| 86 | + columns : list, default=None |
| 87 | + The target columns where the function needs to be performed. Default is None, in which case the function will check all columns. |
| 88 | + method : string |
| 89 | + The method of dealing with outliers. |
| 90 | + - if "trim" : we completely remove data points that are outliers. |
| 91 | + - if "median" : we replace outliers with median values |
41 | 92 | |
42 | | - Returns: |
43 | | - dataframe : |
44 | | - The dataframe which the outlier has already process by the chosen method |
| 93 | + Returns |
| 94 | + ------- |
| 95 | + pandas.core.frame.DataFrame |
| 96 | + a dataframe in which the outliers have already been processed by the chosen method |
45 | 97 | |
46 | | - Examples: |
47 | | - data = pd.DataFrame({ |
48 | | - 'SepalLengthCm':[5.1, 4.9, 4.7], |
49 | | - 'SepalWidthCm':[1.4, 1.4, 9999999.99], |
50 | | - 'PetalWidthCm:[0.2, 0.2, 0.2], |
51 | | - 'Species':['Iris-setosa','Iris-virginica'] |
52 | | - }) |
| 98 | + Examples |
| 99 | + -------- |
| 100 | + >>> import pandas as pd |
| 101 | + >>> from eda_utils_py import outlier_identifier |
| 102 | + |
| 103 | + >>> data = pd.DataFrame({ |
| 104 | + >>> 'SepalLengthCm':[5.1, 4.9, 4.7], |
| 105 | + >>> 'SepalWidthCm':[1.4, 1.4, 9999999.99], |
| 106 | + >>> 'PetalWidthCm':[0.2, 0.2, 0.2], |
| 107 | + >>> 'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica'] |
| 108 | + >>> }) |
53 | 109 |
|
54 | | - outlier_identifier(data) |
| 110 | + >>> outlier_identifier(data) |
55 | 111 |
|
56 | 112 | """ |
| 113 | + pass |
57 | 114 |
|
58 | 115 |
|
59 | 116 | def scale(dataframe, columns=None): |
60 | 117 | """ |
61 | 118 | A function to scale features by removing the mean and scaling to unit variance |
62 | | -. |
63 | | -
|
64 | | - Args: |
65 | | - dataframe (pandas.DataFrame): The data frame to be used for EDA. |
66 | | - columns (list): A list of string of column names with numeric data from the data frame that we wish to scale. |
67 | | -
|
68 | | - Returns: |
69 | | - dataframe : |
70 | | - The scaled dataframe for numerical features |
71 | | -
|
72 | | - Examples: |
73 | | - import pandas as pd |
74 | | - from eda_utils_py import scale |
75 | | -
|
76 | | - data = pd.DataFrame({ |
77 | | - 'SepalLengthCm':[5.1, 4.9, 4.7], |
78 | | - 'SepalWidthCm':[1.4, 1.4, 1.3], |
79 | | - 'PetalWidthCm:[0.2, 0.2, 0.2], |
80 | | - 'Species':['Iris-setosa','Iris-virginica'] |
81 | | - }) |
82 | | -
|
83 | | - numerical_columns = ['SepalLengthCm','SepalWidthCm','PetalWidthCm'] |
84 | | -
|
85 | | - scale(data, numerical_columns) |
86 | 119 |
|
| 120 | + Parameters |
| 121 | + ---------- |
| 122 | + dataframe : pandas.DataFrame |
| 123 | + The data frame to be used for EDA. |
| 124 | + columns : list, default=None |
| 125 | + A list of strings of column names with numeric data from the data frame that we wish to scale. |
| 126 | +
|
| 127 | + Returns |
| 128 | + ------- |
| 129 | + dataframe : pandas.core.frame.DataFrame |
| 130 | + The scaled dataframe for numerical features |
| 131 | +
|
| 132 | + Examples |
| 133 | + -------- |
| 134 | + >>> import pandas as pd |
| 135 | + >>> from eda_utils_py import scale |
| 136 | + |
| 137 | + >>> data = pd.DataFrame({ |
| 138 | + >>> 'SepalLengthCm':[5.1, 4.9, 4.7], |
| 139 | + >>> 'SepalWidthCm':[1.4, 1.4, 1.3], |
| 140 | + >>> 'PetalWidthCm':[0.2, 0.2, 0.2], |
| 141 | + >>> 'Species':['Iris-setosa','Iris-virginica', 'Iris-germanica'] |
| 142 | + >>> }) |
| 143 | +
|
| 144 | + >>> numerical_columns = ['SepalLengthCm','SepalWidthCm','PetalWidthCm'] |
| 145 | + |
| 146 | + >>> scale(data, numerical_columns) |
87 | 147 | """ |
88 | 148 | pass |
0 commit comments