Merge branch 'main' into main

wangjc640 · web-flow · commit c385c7aa141e · 2021-03-13T22:01:56.000-05:00
diff --git a/README.md b/README.md
@@ -15,10 +15,10 @@ $ pip install -i https://test.pypi.org/simple/ eda_utils_py
 ## Functions
 
 The four functions contained in this package are as follows:
-- `cor_map`: A function to plot a correlation matrix of numeric columns in the dataframe
+- `imputer`: A function to impute missing values
 - `outlier_identifier`: A function to identify and deal with outliers
+- `cor_map`: A function to plot a correlation matrix of numeric columns in the dataframe
 - `scale` A function to scale numerical values in the dataset
-- `imputer`: A function to impute missing values
 
 
 ## Our Place in the Python Ecosystem
@@ -33,9 +33,9 @@ While Python packages with similar functionalities exist, this package aims to s
 - Please see a list of dependencies [here](pyproject.toml).
 
 ## Usage
-The eda_utils_py package help you to build exploratory data analysis.
+The eda_utils_py package will help you with the exploratory data analysis portion of your work.
 
-eda_utils_py includes multiple custom functions to perform initial exploratory analysis on any input data describing the structure and the relationships present in the data. The generated output can be obtained in both object and graphical form. 
+eda_utils_py includes multiple custom functions to perform initial exploratory analysis on any input data describing the structure and the relationships present in the data. Depending on the function, the generated output can be obtained in object or graphical form. 
 
 ```python
 import pandas as pd
@@ -59,39 +59,56 @@ data_with_outlier = pd.DataFrame({
          'SepalWidthCm':[1.4, 1.4, 1.3, 1.2, 1.2, 1.3, 1.6, 1.3],
          'PetalWidthCm':[0.2, 0.1, 30, 0.2, 0.3, 0.1, 0.4, 0.5]
          })
+         
+data_with_scale = pd.DataFrame({'SepalLengthCm':[1, 0, 0, 3, 4], 
+         'SepalWidthCm':[4, 1, 1, 0, 1], 
+         'PetalWidthCm':[2, 0, 0, 2, 1],
+         'Species':['Iris-setosa','Iris-virginica', 'Iris-germanica', 'Iris-virginica','Iris-germanica']})      
 ```
 
-The eda_utils_py will help you to:
-- Diagnose data quality: Resolve skewed data by identifing missing data and outlier and provide corresponding remedy.
+The eda_utils_py package contains functions that will help you to:
+- **Impute**: Resolve skewed data by identifying missing data and outliers and providing a corresponding remedy.
 
 ```python
 imputer(data_with_NA)
 ```
-Output:
+Output of `imputer()`:
 
 ![imputer_output](images/imputer_output.png)
 
+- **Identify Outliers**: Identify and deal with outliers in the dataset.
+
 ```python
 outlier_identifier(data_with_outlier, method = "median")
 ```
-Output:
+Output of `outlier_identifier()`:
 
 ![outlier_output](images/outlier_output.png)
 
-- This package can help you easily plot a correlation matrix along with its values to help explore data.
+- **Correlation Heatmap Plotting**: Easily plot a correlation matrix along with its values to help explore data.
 
 ```python
 numerical_columns = ['SepalLengthCm','SepalWidthCm','PetalWidthCm']
 
 cor_map(data, numerical_columns, col_scheme = 'purpleorange')
 
 ```
-Output:
+Output of `cor_map()`:
 
 ![cor_map_output](images/cor_map.output.png)
 
-- Machine learning pereperation: Perform column transformations, derive scaler automatically to fulfill further machine learning need
-    
+- **Scaling**: Scale the data in preparation for future use in machine learning projects.
+
+```python
+numerical_columns = ['SepalLengthCm','SepalWidthCm','PetalWidthCm']
+
+scale(data, numerical_columns, scaler="minmax")
+
+```
+Output of `scale()`:
+
+![scale_output](images/scale_output.png)
+
 ## Documentation
 
 The official documentation is hosted on Read the Docs: https://eda_utils_py.readthedocs.io/en/latest/
diff --git a/eda_utils_py/__init__.py b/eda_utils_py/__init__.py
@@ -1 +1,2 @@
-__version__ = '0.1.9'
+__version__ = '0.1.12'
+
diff --git a/eda_utils_py/eda_utils_py.py b/eda_utils_py/eda_utils_py.py
@@ -55,7 +55,7 @@ def imputer(df, strategy="mean", fill_value=None):
 
     # Tests whether input fill_value is of type numbers or None
     if not isinstance(fill_value, type(None)) and not isinstance(
-            fill_value, numbers.Number
+        fill_value, numbers.Number
     ):
         raise TypeError("fill_value must be of type None or numeric type")
 
@@ -159,13 +159,17 @@ def cor_map(dataframe, num_col, col_scheme="purpleorange"):
 
     plot = (
         alt.Chart(corr_matrix)
-            .mark_rect()
-            .encode(
+        .mark_rect()
+        .encode(
             x=alt.X("var1", title=None),
             y=alt.Y("var2", title=None),
-            color=alt.Color("cor", title = 'Correlation', scale=alt.Scale(scheme=col_scheme, domain = (-1,1))),
+            color=alt.Color(
+                "cor",
+                title="Correlation",
+                scale=alt.Scale(scheme=col_scheme, domain=(-1, 1)),
+            ),
         )
-            .properties(title="Correlation Matrix", width=400, height=400)
+        .properties(title="Correlation Matrix", width=400, height=400)
     )
 
     text = plot.mark_text(size=15).encode(
@@ -195,7 +199,7 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
             - if "trim" : we completely remove data points that are outliers.
             - if "median" : we replace outliers with median values
             - if "mean" : we replace outliers with mean values
-        
+
 
     Returns
     -------
@@ -206,13 +210,15 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
     --------
     >> import pandas as pd
     >> from eda_utils_py import outlier_identifier
+
         
     >> df = pd.DataFrame({
     >>    'SepalLengthCm' : [5.1, 4.9, 4.7, 5.5, 5.1, 50, 5.4, 5.0, 5.2, 5.3, 5.1],
     >>    'SepalWidthCm' : [1.4, 1.4, 20, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
     >>    'PetalWidthCm' : [0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 5]
     >>})
 
+
     >> outlier_identifier(df)
     >> 	 SepalLengthCm  	SepalWidthCm	   PetalWidthCm
     >> 0	5.1	                1.4	                0.2
@@ -231,24 +237,30 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
     if columns is None:
         for col in dataframe.columns:
             if not is_numeric_dtype(dataframe[col]):
-                raise Exception("The given dataframe contains column that is not numeric column.")
+                raise Exception(
+                    "The given dataframe contains column that is not numeric column."
+                )
 
     if columns is not None:
         if not isinstance(columns, list):
             raise TypeError("The argument @columns must be of type list")
 
         for col in columns:
             if col not in list(dataframe.columns):
-                raise Exception("The given column list contains column that is not exist in the given dataframe.")
+                raise Exception(
+                    "The given column list contains column that is not exist in the given dataframe."
+                )
             if not is_numeric_dtype(dataframe[col]):
-                raise Exception("The given column list contains column that is not numeric column.")
+                raise Exception(
+                    "The given column list contains column that is not numeric column."
+                )
 
     if method not in ("trim", "median", "mean"):
         raise Exception("The method must be -trim- or -median- or -mean-")
 
     df = dataframe.copy()
     target_columns = []
-    if (columns is None):
+    if columns is None:
         target_columns = list(df.columns.values.tolist())
     else:
         target_columns = columns
@@ -264,14 +276,14 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
             current_item = current_column[i]
             z = (current_item - mean) / std
             if z >= threshold:
-                if (i not in outlier_index):
+                if i not in outlier_index:
                     outlier_index.append(i)
-                if (method == "mean"):
+                if method == "mean":
                     df.at[i, column] = round(mean, 2)
-                if (method == "median"):
+                if method == "median":
                     df.at[i, column] = np.median(current_column)
 
-    if (method == "trim"):
+    if method == "trim":
         df = df.drop(outlier_index)
 
     df.index = range(len(df))
@@ -314,12 +326,12 @@ def scale(dataframe, columns, scaler="standard"):
 
     >> scale(data, numerical_columns, scaler="minmax")
 
-       SepalLengthCm  SepalWidthCm  PetalWidthCm
-    0           0.25          1.00           1.0
-    1           0.00          0.25           0.0
-    2           0.00          0.25           0.0
-    3           0.75          0.00           1.0
-    4           1.00          0.25           0.5
+    >>    SepalLengthCm  SepalWidthCm  PetalWidthCm
+    >> 0           0.25          1.00           1.0
+    >> 1           0.00          0.25           0.0
+    >> 2           0.00          0.25           0.0
+    >> 3           0.75          0.00           1.0
+    >> 4           1.00          0.25           0.5
     """
 
     # Check if input data is of pd.DataFrame type
@@ -340,7 +352,7 @@ def scale(dataframe, columns, scaler="standard"):
         if col not in list(dataframe.columns):
             raise Exception("The given column names must exist in the given dataframe.")
 
-    # Check if all input columns in num_col are numeric columns
+    # Check if all input columns in columns are numeric columns
     for col in columns:
         if not is_numeric_dtype(dataframe[col]):
             raise Exception("The given numerical columns must all be numeric.")
@@ -349,16 +361,6 @@ def scale(dataframe, columns, scaler="standard"):
     if not isinstance(scaler, str):
         raise TypeError("Scaler must be of type str")
 
-    # Check if all input columns exist in the input data
-    for col in columns:
-        if col not in list(dataframe.columns):
-            raise Exception("The given column names must exist in the given dataframe.")
-
-    # Check if all input columns in num_col are numeric columns
-    for col in columns:
-        if not is_numeric_dtype(dataframe[col]):
-            raise Exception("The given columns must all be numeric.")
-
     scaled_df = None
     if scaler == "minmax":
         scaled_df = _minmax(dataframe[columns])
@@ -396,24 +398,24 @@ def _standardize(dataframe):
 
 def _minmax(dataframe):
     """Transform features by rescaling each feature to the range between 0 and 1.
-        The transformation is given by:
+    The transformation is given by:
 
-            scaled_value = (feature_value - min) / (mix - min)
+        scaled_value = (feature_value - min) / (max - min)
 
-        where min, max = feature_range.
+    where min, max = feature_range.
 
-        This transformation is often used as an alternative to zero mean,
-        unit variance scaling.
+    This transformation is often used as an alternative to zero mean,
+    unit variance scaling.
 
-        Parameters
-        ----------
-        dataframe : pandas.DataFrame
-            The data frame to be used for EDA.
-        Returns
-        -------
-        res : pandas.core.frame.DataFrame
-            Scaled dataset
-        """
+    Parameters
+    ----------
+    dataframe : pandas.DataFrame
+        The data frame to be used for EDA.
+    Returns
+    -------
+    res : pandas.core.frame.DataFrame
+        Scaled dataset
+    """
 
     res = dataframe.copy()
     for feature_name in dataframe.columns:
diff --git a/images/scale_output.png b/images/scale_output.png
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "eda_utils_py"
-version = "0.1.9"
+version = "0.1.12"
 description = "Python package that contains util functions for eda process"
 authors = ["Chuang Wang <chuangw.sde@gmail.com>"]
 license = "MIT"
diff --git a/tests/test_eda_utils_py.py b/tests/test_eda_utils_py.py
@@ -173,6 +173,14 @@ def test_cor_map():
 
 
 def test_scaler():
+    data = pd.DataFrame(
+        {
+            "SepalLengthCm": [5.1, 4.9, 4.7],
+            "SepalWidthCm": [1.4, 1.4, 1.3],
+            "PetalWidthCm": [0.2, 0.1, 0.2],
+            "Species": ["Iris-setosa", "Iris-virginica", "Iris-germanica"],
+        }
+    )
     mock_df_1 = pd.DataFrame(
         {"col1": [1, 0, 0, 3, 4], "col2": [4, 1, 1, 0, 1], "col3": [2, 0, 0, 2, 1]}
     )
@@ -225,6 +233,25 @@ def test_scaler():
         mock_df_2, ["col1", "col2"], scaler="minmax"
     )
 
+    # Tests if the input is not a pd.DataFrame
+    with raises(TypeError):
+        eda_utils_py.scale("A string", ['one', 'two'])
+
+    # Tests if contents of columns is not of type str
+    with raises(TypeError):
+        eda_utils_py.scale(mock_df_1, [1, 2, 3, 4])
+
+    with raises(TypeError):
+        eda_utils_py.scale(mock_df_1, [None])
+
+    # Tests if columns do not exist in the dataframe
+    with raises(Exception):
+        eda_utils_py.scale(mock_df_1, ['one', 'two'])
+
+    # Tests if not all columns in columns are numeric
+    with raises(Exception):
+        eda_utils_py.scale(data, ['Species'])
+
     # Tests whether data is not of type pd.Dataframe raises TypeError
     with raises(TypeError):
         eda_utils_py.scale([14, None, 3, 27])

Original file line number	Diff line number	Diff line change
`@@ -1 +1,2 @@`
`1`		`-__version__ = '0.1.9'`
	`1`	`+__version__ = '0.1.12'`
	`2`	`+`