Merge pull request #18 from UBC-MDS/imputer

micahkwok · web-flow · commit 03937e68540f · 2021-02-27T21:07:03.000-08:00
merge Imputer method specification and changes in CONTRIBUTING.rst
diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
@@ -7,6 +7,17 @@ Contributing
 Contributions are welcome, and they are greatly appreciated! Every little bit
 helps, and credit will always be given.
 
+Contributor Agreement
+----------------------
+
+By contributing,
+you agree that we may redistribute your work under our license.
+In exchange,
+we will address your issues and/or assess your change proposal as promptly as 
+we can, and help you become a member of our community.
+Everyone involved agrees to abide by our 
+code of conduct.
+
 You can contribute in many ways:
 
 Types of Contributions
diff --git a/docs/source/eda_utils_py.rst b/docs/source/eda_utils_py.rst
@@ -0,0 +1,21 @@
+eda\_utils\_py package
+======================
+
+Submodules
+----------
+
+eda\_utils\_py.eda\_utils\_py module
+------------------------------------
+
+.. automodule:: eda_utils_py.eda_utils_py
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+Module contents
+---------------
+
+.. automodule:: eda_utils_py
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/source/modules.rst b/docs/source/modules.rst
@@ -0,0 +1,7 @@
+eda_utils_py
+============
+
+.. toctree::
+   :maxdepth: 4
+
+   eda_utils_py
diff --git a/eda_utils_py/eda_utils_py.py b/eda_utils_py/eda_utils_py.py
@@ -1,88 +1,148 @@
-def cor_map(dataframe, num_col):
+def imputer(dataframe, strategy="mean", fill_value=None):
     """
-    A function to implement a correlation heatmap including coefficients based on given numeric columns of a data frame.
-
-    Args:
-        dataframe (pandas.DataFrame): The data frame to be used for EDA.
-        num_col (list):  A list of string of column names with numeric data from the data frame.
-
-    Returns:
-        (altair): A correlation heatmap plot with correlation coefficient labels based on the numeric columns specified by user.
+    A function to implement imputation functionality for completing missing values.
+
+    Parameters
+    ----------
+    dataframe : pandas.core.frame.DataFrame
+        a dataframe that might contain missing data
+    strategy : string, default="mean"
+        The imputation strategy.
+            - If “mean”, then replace missing values using the mean along each column. Can only be used with numeric data.
+            - If “median”, then replace missing values using the median along each column. Can only be used with numeric data.
+            - If “most_frequent”, then replace missing values using the most frequent value along each column. Can be used with strings or numeric data. If there is more than one such value, only the smallest is returned.
+            - If “constant”, then replace missing values with fill_value. Can be used with strings or numeric data.
+    fill_value : string or numerical value, default=None
+        When strategy == “constant”, fill_value is used to replace all occurrences of missing_values. If left to the default, fill_value will be 0 when imputing numerical data and “missing_value” for strings or object data types.
+        
+    Returns
+    -------
+    pandas.core.frame.DataFrame
+        a dataframe that contains no missing data
+        
+    Examples
+    ---------
+    >>> import pandas as pd
+    >>> from eda_utils_py import imputer
+    
+    >>> data = pd.DataFrame({
+    >>>     'SepalLengthCm':[5.1, 4.9, 4.7],
+    >>>     'SepalWidthCm':[1.4, 1.4, 1.3],
+    >>>     'PetalWidthCm':[0.2, None, 0.2]
+    >>> })
+
+    >>> imputer(data)
+       SepalLengthCm  SepalWidthCm  PetalWidthCm
+    0            5.1           1.4           0.2
+    1            4.9           1.4           0.2
+    2            4.7           1.3           0.2
+    """
+    pass
 
-    Examples: 
-        import pandas as pd
-        from eda_utils_py import cor_map
 
-        data = pd.DataFrame({
-            'SepalLengthCm':[5.1, 4.9, 4.7],
-            'SepalWidthCm':[1.4, 1.4, 1.3],
-            'PetalWidthCm:[0.2, 0.2, 0.2],
-            'Species':['Iris-setosa','Iris-virginica']
-        })
+def cor_map(dataframe, num_col):
+    """
+    A function to implement a correlation heatmap including coefficients based on given numeric columns of a data frame.
 
-        numerical_columns = ['SepalLengthCm','SepalWidthCm','PetalWidthCm']
-        
-        cor_map(data, numerical_columns)
+    Parameters
+    ----------
+    dataframe : pandas.core.frame.DataFrame
+        The data frame to be used for EDA.
+    num_col : list  
+        A list of string of column names with numeric data from the data frame.
+
+    Returns
+    -------
+    altair.vegalite.v4.api.Chart
+        A correlation heatmap plot with correlation coefficient labels based on the numeric columns specified by user.
+
+    Examples
+    ---------
+    >>> import pandas as pd
+    >>> from eda_utils_py import cor_map
+    
+    >>> data = pd.DataFrame({
+    >>>     'SepalLengthCm':[5.1, 4.9, 4.7],
+    >>>     'SepalWidthCm':[1.4, 1.4, 1.3],
+    >>>     'PetalWidthCm':[0.2, 0.2, 0.2],
+    >>>     'Species':['Iris-setosa','Iris-virginica', 'Iris-germanica']
+    >>> })
+
+    >>> numerical_columns = ['SepalLengthCm','SepalWidthCm','PetalWidthCm']    
+    >>> cor_map(data, numerical_columns)
         
     """
+    pass
 
 
-def outlier_identifier(dataframe, columns=None, method="somefunction"):
+def outlier_identifier(dataframe, columns=None, method="trim"):
     """
     A function that identify and deal with outliers based on the method the user choose
 
-    Key arguments: 
-        dataframe [pandas.DataFrame]: 
-            The target dataframe where the function is performed.
-        columns [list] : None
-            The target columns where the function needed to be performed. Defualt is None, the function will check all columns
-        method [string] : "somefunction"
-            The method of dealing with outliers. 
+    Parameters
+    ---------- 
+    dataframe : pandas.core.frame.DataFrame
+        The target dataframe where the function is performed.
+    columns : list, default=None
+        The target columns on which the function is performed. Default is None, in which case the function checks all columns.
+    method : string, default="trim"
+        The method of dealing with outliers. 
+            - if "trim" : we completely remove data points that are outliers.
+            - if "median" : we replace outliers with median values
         
-    Returns:
-        dataframe :
-            The dataframe which the outlier has already process by the chosen method
+    Returns
+    -------
+    pandas.core.frame.DataFrame
+        a dataframe in which the outliers have already been processed by the chosen method
     
-    Examples:
-        data = pd.DataFrame({
-            'SepalLengthCm':[5.1, 4.9, 4.7],
-            'SepalWidthCm':[1.4, 1.4, 9999999.99],
-            'PetalWidthCm:[0.2, 0.2, 0.2],
-            'Species':['Iris-setosa','Iris-virginica']
-        })
+    Examples
+    --------
+    >>> import pandas as pd
+    >>> from eda_utils_py import outlier_identifier
+        
+    >>> data = pd.DataFrame({
+    >>>    'SepalLengthCm':[5.1, 4.9, 4.7],
+    >>>    'SepalWidthCm':[1.4, 1.4, 9999999.99],
+    >>>    'PetalWidthCm':[0.2, 0.2, 0.2],
+    >>>    'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica']
+    >>> })
 
-        outlier_identifier(data)
+    >>> outlier_identifier(data)
 
     """
+    pass
 
 
 def scale(dataframe, columns=None):
     """
     A function to scale features by removing the mean and scaling to unit variance
-.
-
-    Args:
-        dataframe (pandas.DataFrame): The data frame to be used for EDA.
-        columns (list):  A list of string of column names with numeric data from the data frame that we wish to scale.
-
-    Returns:
-        dataframe :
-            The scaled dataframe for numerical features
-
-    Examples:
-        import pandas as pd
-        from eda_utils_py import scale
-
-        data = pd.DataFrame({
-            'SepalLengthCm':[5.1, 4.9, 4.7],
-            'SepalWidthCm':[1.4, 1.4, 1.3],
-            'PetalWidthCm:[0.2, 0.2, 0.2],
-            'Species':['Iris-setosa','Iris-virginica']
-        })
-
-        numerical_columns = ['SepalLengthCm','SepalWidthCm','PetalWidthCm']
-
-        scale(data, numerical_columns)
 
+    Parameters
+    ----------
+    dataframe : pandas.DataFrame
+        The data frame to be used for EDA.
+    columns : list, default=None
+        A list of string of column names with numeric data from the data frame that we wish to scale.
+
+    Returns
+    -------
+    dataframe : pandas.core.frame.DataFrame
+        The scaled dataframe for numerical features
+
+    Examples
+    --------
+    >>> import pandas as pd
+    >>> from eda_utils_py import scale
+    
+    >>> data = pd.DataFrame({
+    >>>     'SepalLengthCm':[5.1, 4.9, 4.7],
+    >>>     'SepalWidthCm':[1.4, 1.4, 1.3],
+    >>>     'PetalWidthCm':[0.2, 0.2, 0.2],
+    >>>     'Species':['Iris-setosa','Iris-virginica', 'Iris-germanica']
+    >>> })
+
+    >>> numerical_columns = ['SepalLengthCm','SepalWidthCm','PetalWidthCm']
+    
+    >>> scale(data, numerical_columns)
     """
     pass
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml