UBC-MDS
diff --git a/‎.github/workflows/build.yml‎
Lines changed: 36 additions & 0 deletions b/‎.github/workflows/build.yml‎
Lines changed: 36 additions & 0 deletions
diff --git a/‎.github/workflows/deploy.yml‎
Lines changed: 70 additions & 0 deletions b/‎.github/workflows/deploy.yml‎
Lines changed: 70 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 31 additions & 7 deletions b/‎README.md‎
Lines changed: 31 additions & 7 deletions
diff --git a/‎eda_utils_py/__init__.py‎
Lines changed: 1 addition & 1 deletion b/‎eda_utils_py/__init__.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎eda_utils_py/eda_utils_py.py‎
Lines changed: 48 additions & 41 deletions b/‎eda_utils_py/eda_utils_py.py‎
Lines changed: 48 additions & 41 deletions
diff --git a/‎images/cor_map.output.png‎
34.3 KB b/‎images/cor_map.output.png‎
34.3 KB
@@ -0,0 +1,36 @@
+name: build
+
+on:
+  # Trigger the workflow on push or pull request to main
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python 3.8
+      uses: actions/setup-python@v1
+      with:
+        python-version: 3.8
+    - name: Install dependencies
+      run: |
+        pip install poetry
+        poetry install
+#     - name: Check style
+#       run: poetry run flake8 --exclude=docs*
+    - name: Test with pytest
+      run: poetry run pytest --cov=./ --cov-report=xml
+    - name: Upload coverage to Codecov  
+      uses: codecov/codecov-action@v1
+      with:
+        token: ${{ secrets.CODECOV_TOKEN }}
+        file: ./coverage.xml
+        flags: unittests
+        name: codecov-umbrella
+        fail_ci_if_error: true
@@ -0,0 +1,70 @@
+name: Deploy
+
+on:
+  # Trigger the workflow on push to main
+  push:
+    branches:
+      - main
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python 3.8
+      uses: actions/setup-python@v1
+      with:
+        python-version: 3.8
+    - name: Install dependencies
+      run: |
+        pip install poetry
+        poetry install
+    - name: Test with pytest
+      run: poetry run pytest --cov=./ --cov-report=xml
+    - name: Upload coverage to Codecov  
+      uses: codecov/codecov-action@v1
+      with:
+        token: ${{ secrets.CODECOV_TOKEN }}
+        file: ./coverage.xml
+        flags: unittests
+        name: codecov-umbrella
+        fail_ci_if_error: true
+    - name: checkout
+      uses: actions/checkout@master
+      with:
+        ref: main
+    - name: Bump version and tagging and publish
+      run: |
+        git config --local user.email "action@github.com"
+        git config --local user.name "GitHub Action"
+        git pull origin main
+        poetry run semantic-release version
+        poetry version $(grep "version" */__init__.py | cut -d "'" -f 2 | cut -d '"' -f 2)
+        git commit -m "Bump versions" -a
+    - name: Push package version changes
+      uses: ad-m/github-push-action@master
+      with:
+        github_token: ${{ secrets.GITHUB_TOKEN }}
+    - name: Get release tag version from package version
+      run: |
+        echo ::set-output name=release_tag::$(grep "version" */__init__.py | cut -d "'" -f 2 | cut -d '"' -f 2)
+      id: release
+    - name: Create Release with new version
+      id: create_release
+      uses: actions/create-release@v1
+      env:
+        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      with:
+        tag_name: ${{ steps.release.outputs.release_tag }}
+        release_name: ${{ steps.release.outputs.release_tag }}
+        draft: false
+        prerelease: false
+    - name: Build package and publish to test PyPI
+      env:
+        TEST_PYPI_USERNAME: ${{ secrets.TEST_PYPI_USERNAME }}
+        TEST_PYPI_PASSWORD: ${{ secrets.TEST_PYPI_PASSWORD }}
+      run: |
+        poetry config repositories.test-pypi https://test.pypi.org/legacy/
+        poetry build
+        poetry publish -r test-pypi -u $TEST_PYPI_USERNAME -p $TEST_PYPI_PASSWORD
+
@@ -1,6 +1,6 @@
 # eda_utils_py 
 
-![](https://github.com/chuangw46/eda_utils_py/workflows/build/badge.svg) [![codecov](https://codecov.io/gh/chuangw46/eda_utils_py/branch/main/graph/badge.svg)](https://codecov.io/gh/chuangw46/eda_utils_py) ![Release](https://github.com/chuangw46/eda_utils_py/workflows/Release/badge.svg) [![Documentation Status](https://readthedocs.org/projects/eda_utils_py/badge/?version=latest)](https://eda_utils_py.readthedocs.io/en/latest/?badge=latest)
+[![build](https://github.com/UBC-MDS/eda_utils_py/actions/workflows/build.yml/badge.svg)](https://github.com/UBC-MDS/eda_utils_py/actions/workflows/build.yml) [![codecov](https://codecov.io/gh/UBC-MDS/eda_utils_py/branch/main/graph/badge.svg)](https://codecov.io/gh/UBC-MDS/eda_utils_py) [![Deploy](https://github.com/UBC-MDS/eda_utils_py/actions/workflows/deploy.yml/badge.svg)](https://github.com/UBC-MDS/eda_utils_py/actions/workflows/deploy.yml) [![Documentation Status](https://readthedocs.org/projects/eda_utils_py/badge/?version=latest)](https://eda_utils_py.readthedocs.io/en/latest/?badge=latest)
 
 ## Overview 
 
@@ -30,19 +30,43 @@ While Python packages with similar functionalities exist, this package aims to s
 
 ## Dependencies
 
-- TBD
+- Please see a list of dependencies [here](pyproject.toml).
 
 ## Usage
 The eda_utils_py package helps you perform exploratory data analysis.
 
 eda_utils_py includes multiple custom functions to perform initial exploratory analysis on any input data describing the structure and the relationships present in the data. The generated output can be obtained in both object and graphical form. 
 
-The eda_utils_py is capable of :
-- Diagnose data quality : Resolve skewed data by identifing missing data and outlier and provide corresponding remedy.
-- Discover data: Plot correlation mattrix to help explore data to understand the data and find scenarios for performing the analysis.
-- Machine learning pereperation : Perform column transformations, derive scaler automatically to fulfill further machine learning need
-    
+```python
+import pandas as pd
+from eda_utils_py import eda_utils_py
+
+data = pd.DataFrame({
+         'SepalLengthCm':[5.1, 4.9, 4.7],
+         'SepalWidthCm':[1.4, 1.4, 1.3],
+         'PetalWidthCm':[0.2, 0.1, 0.2],
+         'Species':['Iris-setosa','Iris-virginica', 'Iris-germanica']
+         })
+```
+
+The eda_utils_py will help you to:
+- Diagnose data quality: Resolve skewed data by identifying missing data and outliers and providing a corresponding remedy.
+
+
+- This package can help you easily plot a correlation matrix along with its values to help explore data.
 
+```python
+numerical_columns = ['SepalLengthCm','SepalWidthCm','PetalWidthCm']
+
+eda_utils_py.cor_map(data, numerical_columns, col_scheme = 'purpleorange')
+
+```
+Output:
+
+![cor_map_output](images/cor_map.output.png)
+
+- Machine learning preparation: Perform column transformations and derive scalers automatically to fulfill further machine learning needs.
+    
 ## Documentation
 
 The official documentation is hosted on Read the Docs: https://eda_utils_py.readthedocs.io/en/latest/
 
@@ -1 +1 @@
-__version__ = '0.1.0'
+__version__ = '0.1.3'
@@ -113,7 +113,7 @@ def cor_map(dataframe, num_col, col_scheme="purpleorange"):
     >> data = pd.DataFrame({
     >>     'SepalLengthCm':[5.1, 4.9, 4.7],
     >>     'SepalWidthCm':[1.4, 1.4, 1.3],
-    >>     'PetalWidthCm':[0.2, 0.2, 0.2],
+    >>     'PetalWidthCm':[0.2, 0.1, 0.2],
     >>     'Species':['Iris-setosa','Iris-virginica', 'Iris-germanica']
     >> })
 
@@ -163,7 +163,7 @@ def cor_map(dataframe, num_col, col_scheme="purpleorange"):
             .encode(
             x=alt.X("var1", title=None),
             y=alt.Y("var2", title=None),
-            color=alt.Color("cor", legend=None, scale=alt.Scale(scheme=col_scheme)),
+            color=alt.Color("cor", title = 'Correlation', scale=alt.Scale(scheme=col_scheme, domain = (-1,1))),
         )
             .properties(title="Correlation Matrix", width=400, height=400)
     )
@@ -185,35 +185,37 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
     A function that identifies outliers by z-test with a threshold of 3, and deals with them based on the method the user chooses
 
     Parameters
-    ---------- 
+    ----------
     dataframe : pandas.core.frame.DataFrame
         The target dataframe where the function is performed.
     columns : list, default=None
         The target columns where the function needs to be performed. Default is None, in which case the function will check all columns
     method : string
-        The method of dealing with outliers. 
+        The method of dealing with outliers.
             - if "trim" : we completely remove data points that are outliers.
             - if "median" : we replace outliers with median values
             - if "mean" : we replace outliers with mean values
         
+
     Returns
     -------
     pandas.core.frame.DataFrame
         a dataframe in which the outliers have already been processed by the chosen method
-    
+
     Examples
     --------
-    >>> import pandas as pd
-    >>> from eda_utils_py import cor_map
+    >> import pandas as pd
+    >> from eda_utils_py import outlier_identifier
         
-    >>> data = pd.DataFrame({
-    >>>    'SepalLengthCm':[5.1, 4.9, 4.7],
-    >>>    'SepalWidthCm':[1.4, 1.4, 99],
-    >>>    'PetalWidthCm:[0.2, 0.2, 0.2],
-    >>>    'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica']
-    >>> })
+    >> data = pd.DataFrame({
+    >>    'SepalLengthCm':[5.1, 4.9, 4.7],
+    >>    'SepalWidthCm':[1.4, 1.4, 99],
+    >>    'PetalWidthCm':[0.2, 0.2, 0.2],
+    >>    'Species':['Iris-setosa', 'Iris-virginica', 'Iris-germanica']
+    >> })
+
+    >> outlier_identifier(data)
 
-    >>> outlier_identifier(data)
 
     """
     if not isinstance(dataframe, pd.DataFrame):
@@ -222,58 +224,54 @@ def outlier_identifier(dataframe, columns=None, method="trim"):
     if columns is None:
         for col in dataframe.columns:
             if not is_numeric_dtype(dataframe[col]):
-                raise Exception("The given dataframe contains column that is not numeric column.")  
-                
+                raise Exception("The given dataframe contains column that is not numeric column.")
+
     if columns is not None:
         if not isinstance(columns, list):
             raise TypeError("The argument @columns must be of type list")
-          
-        
+
         for col in columns:
             if col not in list(dataframe.columns):
-                raise Exception("The given column list contains column that is not exist in the given dataframe.")    
+                raise Exception("The given column list contains column that is not exist in the given dataframe.")
             if not is_numeric_dtype(dataframe[col]):
                 raise Exception("The given column list contains column that is not numeric column.")
- 
+
     if method not in ("trim", "median", "mean"):
         raise Exception("The method must be -trim- or -median- or -mean-")
 
-    
     df = dataframe.copy()
     target_columns = []
-    if(columns is None):
-        target_columns = list(df.columns.values.tolist()) 
+    if (columns is None):
+        target_columns = list(df.columns.values.tolist())
     else:
         target_columns = columns
-        
+
     outlier_index = []
     for column in target_columns:
         current_column = df[column]
         mean = np.mean(current_column)
         std = np.std(current_column)
-        threshold = 3 
-        
-        
+        threshold = 3
+
         for i in range(len(current_column)):
             current_item = current_column[i]
             z = (current_item - mean) / std
             if z >= threshold:
-                if(i not in outlier_index):
+                if (i not in outlier_index):
                     outlier_index.append(i)
-                if(method == "mean"):
+                if (method == "mean"):
                     df.at[i, column] = round(mean, 2)
-                if(method == "median"):
+                if (method == "median"):
                     df.at[i, column] = np.median(current_column)
-                
-    
-    if(method == "trim"):
+
+    if (method == "trim"):
         df = df.drop(outlier_index)
-        
+
     df.index = range(len(df))
     return df
 
 
-def scale(dataframe, columns=None, scaler="standard"):
+def scale(dataframe, columns, scaler="standard"):
     """
     A function to scale features either by using standard scaler or minmax scaler method
 
@@ -299,15 +297,22 @@ def scale(dataframe, columns=None, scaler="standard"):
     >> from eda_utils_py import scale
 
     >> data = pd.DataFrame({
-    >>     'SepalLengthCm':[5.1, 4.9, 4.7],
-    >>     'SepalWidthCm':[1.4, 1.4, 1.3],
-    >>     'PetalWidthCm:[0.2, 0.2, 0.2],
+    >>     'SepalLengthCm':[1, 0, 0, 3, 4],
+    >>     'SepalWidthCm':[4, 1, 1, 0, 1],
+    >>     'PetalWidthCm':[2, 0, 0, 2, 1],
     >>     'Species':['Iris-setosa','Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-virginica']
     >> })
 
     >> numerical_columns = ['SepalLengthCm','SepalWidthCm','PetalWidthCm']
 
     >> scale(data, numerical_columns, scaler="minmax")
+
+       SepalLengthCm  SepalWidthCm  PetalWidthCm
+    0           0.25          1.00           1.0
+    1           0.00          0.25           0.0
+    2           0.00          0.25           0.0
+    3           0.75          0.00           1.0
+    4           1.00          0.25           0.5
     """
 
     # Check if input data is of pd.DataFrame type
@@ -370,9 +375,10 @@ def _standardize(dataframe):
         The data frame to be used for EDA.
     Returns
     -------
-    self : object
+    res : pandas.core.frame.DataFrame
         Scaled dataset
     """
+
     res = dataframe.copy()
     for feature_name in dataframe.columns:
         mean = dataframe[feature_name].mean()
@@ -398,7 +404,7 @@ def _minmax(dataframe):
             The data frame to be used for EDA.
         Returns
         -------
-        self : object
+        res : pandas.core.frame.DataFrame
             Scaled dataset
         """
 
@@ -407,4 +413,5 @@ def _minmax(dataframe):
         max = dataframe[feature_name].max()
         min = dataframe[feature_name].min()
         res[feature_name] = (dataframe[feature_name] - min) / (max - min)
-    return res
+
+    return res
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = '0.1.0'`
	`1`	`+__version__ = '0.1.3'`