From 416d88e03e733f8ca941829906258dfa07e9caa9 Mon Sep 17 00:00:00 2001
From: Frankie0609 <33159972+Frankie0609@users.noreply.github.com>
Date: Thu, 14 Dec 2023 17:54:41 -0500
Subject: [PATCH 01/26] Updated

---
 HESC/baseline_models/cluster_ensembles.py | 2 +-
 HESC/baseline_models/{ensemble_sc_(1).py => ensemble_sc.py} | 4 ++--
 HESC/baseline_models/parea_multi_view_ensemble_clustering.py | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)
 rename HESC/baseline_models/{ensemble_sc_(1).py => ensemble_sc.py} (99%)

diff --git a/HESC/baseline_models/cluster_ensembles.py b/HESC/baseline_models/cluster_ensembles.py
index 4fd5751..8a0d6fd 100644
--- a/HESC/baseline_models/cluster_ensembles.py
+++ b/HESC/baseline_models/cluster_ensembles.py
@@ -18,7 +18,7 @@
!pip install "dask[dataframe]"
!pip install netCDF4

-!git clone https://ghp_[REDACTED-TOKEN]@github.com/big-data-lab-umbc/multivariate-weather-data-clustering.git
+!git clone https://""@github.com/big-data-lab-umbc/multivariate-weather-data-clustering.git

# Commented out IPython magic to ensure Python compatibility.
# %cd multivariate-weather-data-clustering

diff --git a/HESC/baseline_models/ensemble_sc_(1).py b/HESC/baseline_models/ensemble_sc.py
similarity index 99%
rename from HESC/baseline_models/ensemble_sc_(1).py
rename to HESC/baseline_models/ensemble_sc.py
index b2335c2..3420bf8 100644
--- a/HESC/baseline_models/ensemble_sc_(1).py
+++ b/HESC/baseline_models/ensemble_sc.py
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
-"""Ensemble_SC (1).ipynb
+"""Ensemble_SC.ipynb

Automatically generated by Colaboratory.

@@ -22,7 +22,7 @@
!pip install "dask[dataframe]"
!pip install netCDF4

-!git clone https://ghp_[REDACTED-TOKEN]@github.com/big-data-lab-umbc/multivariate-weather-data-clustering.git
+!git clone https://

# Commented out IPython magic to ensure Python compatibility.
# %cd multivariate-weather-data-clustering

diff --git a/HESC/baseline_models/parea_multi_view_ensemble_clustering.py b/HESC/baseline_models/parea_multi_view_ensemble_clustering.py
index 853b034..f6f4f62 100644
--- a/HESC/baseline_models/parea_multi_view_ensemble_clustering.py
+++ b/HESC/baseline_models/parea_multi_view_ensemble_clustering.py
@@ -25,7 +25,7 @@
!pip install "dask[dataframe]"
!pip install netCDF4

-!git clone https://ghp_[REDACTED-TOKEN]@github.com/big-data-lab-umbc/multivariate-weather-data-clustering.git
+!git clone https://""@github.com/big-data-lab-umbc/multivariate-weather-data-clustering.git

# Commented out IPython magic to ensure Python compatibility.
# %cd multivariate-weather-data-clustering From be18cbca6d8d80730d186d4ada9d1c46c8aeb38d Mon Sep 17 00:00:00 2001 From: Frankie0609 <33159972+Frankie0609@users.noreply.github.com> Date: Thu, 14 Dec 2023 18:18:50 -0500 Subject: [PATCH 02/26] Updated Parea --- .../parea_multi_view_ensemble_clustering.py | 67 +------------------ 1 file changed, 1 insertion(+), 66 deletions(-) diff --git a/HESC/baseline_models/parea_multi_view_ensemble_clustering.py b/HESC/baseline_models/parea_multi_view_ensemble_clustering.py index f6f4f62..5a5f921 100644 --- a/HESC/baseline_models/parea_multi_view_ensemble_clustering.py +++ b/HESC/baseline_models/parea_multi_view_ensemble_clustering.py @@ -92,66 +92,6 @@ from netCDF4 import date2num,num2date from math import sqrt -# from sklearn import preprocessing -# from sklearn.preprocessing import MinMaxScaler - -# def data_preprocessing(data_path): -# rdata_daily = xr.open_dataset(data_path) # data_path = '/content/drive/MyDrive/ERA5_Dataset.nc' -# rdata_daily_np_array = np.array(rdata_daily.to_array()) # the shape of the dailt data is (7, 365, 41, 41) -# rdata_daily_np_array_latitude = np.concatenate((rdata_daily_np_array, np.zeros((7, 365, 41,7), dtype=int)), axis=3) -# rdata_daily_np_array_longitude = np.concatenate((rdata_daily_np_array_latitude, np.zeros((7, 365, 7, 48), dtype=int)), axis=2) -# rdata_daily_np_array = rdata_daily_np_array_longitude -# rdata_daily_np_array_T = rdata_daily_np_array.transpose(1,0,2,3) # transform the dailt data from (7, 365, 41, 41) to (365, 7, 41, 41) -# overall_mean = np.nanmean(rdata_daily_np_array_T[:, :, :, :]) -# for i in range(rdata_daily_np_array_T.shape[0]): -# for j in range(rdata_daily_np_array_T.shape[1]): -# for k in range(rdata_daily_np_array_T.shape[2]): -# for l in range(rdata_daily_np_array_T.shape[3]): -# if np.isnan(rdata_daily_np_array_T[i, j, k, l]): -# #print("NAN data in ", i, j, k, l) -# rdata_daily_np_array_T[i, j, k, l] = overall_mean -# rdata_daily_np_array_T = rdata_daily_np_array_T.transpose(0,2,3,1) -# rdata_daily_np_array_T_R = rdata_daily_np_array_T.reshape((rdata_daily_np_array_T.shape[0], -1)) # transform the dailt data from (365, 7, 41, 41) to (365, 11767) -# min_max_scaler = preprocessing.MinMaxScaler() # calling the function -# rdata_daily_np_array_T_R_nor = min_max_scaler.fit_transform(rdata_daily_np_array_T_R) # now normalize the data, otherwise the loss will be very big -# #rdata_daily_np_array_T_R_nor = np.float32(rdata_daily_np_array_T_R_nor) # convert the data type to float32, otherwise the loass will be out-of-limit -# rdata_daily_np_array_T_R_nor_R = rdata_daily_np_array_T_R_nor.reshape((rdata_daily_np_array_T_R_nor.shape[0], 1, rdata_daily_np_array.shape[2], rdata_daily_np_array.shape[3], rdata_daily_np_array.shape[0])) -# # return rdata_daily_np_array_T_R_nor, rdata_daily_np_array_T_R_nor_R - -# from sklearn import preprocessing -# from sklearn.preprocessing import MinMaxScaler - -# def datatransformation(data_path, variables): -# ''' The parameters accepted by this function are as follows: -# 1. "data_path" is the path of the netCDF4 dataset file. (data_path = '/content/drive/MyDrive/ERA5_meteo_sfc_2021_daily.nc') -# 2. "variables" is an array of the variable names of the netCDF4 dataset those we want to read. (variables = ['sst', 'sp']) -# If the "variables" array is empty the function will read the whole dataset. - -# Return value: -# The function will return the normalized values of the selected variables as a 2D NumPy array of size (365 x ___) and a 4D array as (365, 41, 41, ___). 
-# '''
-
-# rdata_daily = xr.open_dataset(data_path) # data_path = '/content/drive/MyDrive/ERA5_Dataset.nc'
-# if(len(variables)==0):
-# rdata_daily_np_array = np.array(rdata_daily.to_array()) # the shape of the dailt data is (7, 365, 41, 41)
-# else:
-# rdata_daily_np_array = np.array(rdata_daily[variables].to_array())
-# rdata_daily_np_array_R = rdata_daily_np_array.reshape((rdata_daily_np_array.shape[0], -1)) #(7, 613565)
-# for i in range (rdata_daily_np_array_R.shape[0]):
-# tmp = rdata_daily_np_array_R[i]
-# tmp[np.isnan(tmp)]=np.nanmean(tmp)
-# rdata_daily_np_array_R[i] = tmp
-# min_max_scaler = MinMaxScaler() # calling the function
-# rdata_daily_np_array_nor = min_max_scaler.fit_transform(rdata_daily_np_array_R.T).T
-# rdata_daily_np_array_nor_4D = rdata_daily_np_array_nor.reshape(rdata_daily_np_array.shape) # (7, 613565) to (7, 365, 41, 41)
-# rdata_daily_np_array_nor_4D_T = rdata_daily_np_array_nor_4D.transpose(1,2,3,0) # (7, 365, 41, 41) to (365, 41, 41, 7)
-# rdata_daily_np_array_nor_4D_T_R = rdata_daily_np_array_nor_4D_T.reshape((rdata_daily_np_array_nor_4D_T.shape[0], -1)) #(365, 11767)
-# data_2d = rdata_daily_np_array_nor_4D_T_R
-# data_4d = rdata_daily_np_array_nor_4D_T
-# return data_2d, data_4d
-
-
-
from sklearn.metrics import silhouette_samples, silhouette_score
def silhouette_score1(X, labels, *, metric="cosine", sample_size=None, random_state=None, **kwds):
    return np.mean(silhouette_samples(X, labels, metric="cosine", **kwds))
@@ -230,14 +170,9 @@ def parea(data):

"""# **Evaluation Metrics**

-**Silhouette Score**
+**Davies bouldin**
"""

-def silhouette_score1(X, labels, *, metric="cosine", sample_size=None, random_state=None, **kwds):
-    return np.mean(silhouette_samples(X, labels, metric=metric, **kwds))
-
-"""**Davies bouldin**"""
-
def davies_bouldin_score(X, labels):
    return print("Davies-Bouldin score is ", davies_bouldin_score(X, labels))

From c83a38ac0bbd7e1d44b2d2681a713e5254ca9d3d Mon Sep 17 00:00:00 2001
From: Frankie0609 <33159972+Frankie0609@users.noreply.github.com>
Date: Fri, 15 Dec 2023 12:43:20 -0500
Subject: [PATCH 03/26] Updated readme

---
 HESC/.Rhistory | 0
 HESC/README_hesc.md | 163 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 163 insertions(+)
 create mode 100644 HESC/.Rhistory
 create mode 100644 HESC/README_hesc.md

diff --git a/HESC/.Rhistory b/HESC/.Rhistory
new file mode 100644
index 0000000..e69de29
diff --git a/HESC/README_hesc.md b/HESC/README_hesc.md
new file mode 100644
index 0000000..0eccf16
--- /dev/null
+++ b/HESC/README_hesc.md
@@ -0,0 +1,163 @@
+# multivariate-weather-data-clustering for HESC branch
+
+## Download
+
+There are four ways to download and manage the MWDC package:
+
+1 - Use [GitHub Desktop](https://desktop.github.com/) (Recommended)
+
+2 - Use the command line:
+
+```bash
+ git clone https://github.com/big-data-lab-umbc/multivariate-weather-data-clustering.git
+```
+
+\*Because the repository is private, the command line method is not recommended.
+
+3 - Download the `.zip` file and use it.
+
+4 - On Google Colab, use the command below.
+
+```bash
+!git clone https://{clasic_access_token}@github.com/big-data-lab-umbc/multivariate-weather-data-clustering.git
+```
+
+\*\* This is how to generate a [clasic_access_token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token#creating-a-personal-access-token-classic).
+
+## Installation
+
+#### 1.
On PC
+
+To install the package you need to create an environment using [pip](https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/) or [conda](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html).
+
+##### Conda environment setup
+```bash
+conda create -n mwdc pandas numpy xarray netCDF4 matplotlib scikit-learn scipy dask
+conda activate mwdc
+```
+
+After that, just clone this repository and install it using the `setup.py` file inside it.
+
+```bash
+ cd multivariate-weather-data-clustering
+ python setup.py install
+```
+
+Note: If you are using macOS, you should use `python3 setup.py install` instead.
+
+#### 2. On Google Colab
+
+After cloning the repository, just run the command below to install it.
+
+```bash
+ %cd multivariate-weather-data-clustering
+ !python setup.py install
+```
+
+## Usage
+
+To use the functions you just need to import them from MWDC. Modules can be imported either separately or all together.
+
+```python
+from mwdc import *
+
+## or ##
+
+from mwdc.preprocessing import preprocessing
+from mwdc.evaluation import st_evaluation
+from mwdc.visualization import visualization
+
+```
+
+Example:
+
+```python
+trans_data = preprocessing.datatransformation(data)
+```
+
+## Modules Documentation
+
+### preprocessing
+
+| Functions | Description |
+| :--------------------- | :------------------------------------------------------------------------------- |
+| `transformddaily()` | Transformation function for Daily Data |
+| `transformdmock()` | Transformation function for Mock Data |
+| `transformqm()` | Variable for Quarter Map |
+| `datatransformation()` | Description in the Note below\* |
+| `datanormalization()` | Input in this case will be the transformed pandas dataframe |
+| `null_fill()` | Function to fill in NaN values across variables |
+| `pca1()` | data is the data to be input, n is the number of components |
+| `pcacomponents()` | Shows the proper number of components for PCA by computing cumulative variance |
+| `data_preprocessing()` | Transforms the xarray input data into a 2D NumPy array. |
+
+\*Note: This function is used to transform the xarray dataset into a pandas dataframe where the dimension "time" becomes the index of the DataFrame and
+pairs of the dimensions "latitude" and "longitude" become the columns for each variable.
+
+### clustering
+
+#### - DBscan
+
+| Functions | Description |
+| :------------------------------- | :-------------------------------------------------------------- |
+| `dbscanreal(x, eps1=0.5, min=5)` | eps1 for epsilon, min for minimum samples, x is for data input |
+
+#### - Agglomerative Clustering
+
+| Functions | Description |
+| :------------------------------- | :-------------------------------------------------------------- |
+| `st_agglomerative(data, n, K, p=7, affinity, linkage)` | n=PCA components, K=number of clusters, p=truncate_mode. |
+
+#### - Kmeans
+
+| Functions | Description |
+| :----------------------------------------------------------------------------- | :---------- |
+| `Kmeans(n_cluster).fit(xarray_data, PCA=(boolean), pass_trans_data=(boolean))` | \* |
+| `Kmeans(n_cluster).evaluate(z, PCA=(boolean), pass_trans_data=(boolean))` | \*\* |
+
+\* This function fits the K-means model to the data that is passed to it.
+ Parameters that this function will accept are as follows:
+
+1. xarray_data = string of the name of the original xarray file
+2. PCA (bool) = whether or not PCA has to be applied. Default value is True.
+3.
pass_trans_data (bool) = whether saved data has to be passed. If False, the data will be transformed on the fly. Default value is True.
+
+\*\* This function evaluates and assigns data points to clusters.
+Parameters that this function will accept are as follows:
+
+1. z = string of the name of the original xarray file.
+2. PCA (bool) = whether or not PCA has to be applied. Default value is True.
+3. pass_trans_data (bool) = whether saved data has to be passed. If False, the data will be transformed on the fly. Default value is True.
+
+#### - evaluation
+
+| Functions | Params |
+| :--------------------------- | :---------------------------------------------------------------------------------------------------- |
+| `st_rmse()` | input,formed_clusters |
+| `st_corr()` | input,formed_clusters |
+| `st_calinski()` | input,formed_clusters |
+| `davies_bouldin()` | input, formed_clusters |
+| `compute_silhouette_score()` | X, labels,transformation=False, \*, metric="euclidean", sample_size=None, random_state=None, \*\*kwds |
+
+#### - visualization
+
+| Functions | Params |
+| :------------------- | :------------------------------------ |
+| `visualization()` | data_file,cluster_filename,coast_file |
+| `make_Csv_cluster()` | label,name |
+
+\* Parameters that `visualization()` will accept are as follows:
+
+1. data_file is the .nc file.
+ \- Example: data_file = 'path/data.nc'. It is the raw, unprocessed data.
+2. cluster_filename is the csv file which contains clusterid and time_step.
+ \- Example: cluster_filename = 'path/clusters.csv' # This file records which cluster each date belongs to.
+3. coast_file = This file contains the data for what the coastline should look like in the result.
+ \- Example: 'path/coast.txt'.
+
+####
+
+\* Parameters that `make_Csv_cluster()` will accept are as follows:
+
+1. label contains the clusterids.
+2. name is the file name that will be generated, e.g. 'test.csv'.

From da996ed2d71048bb2b7678b013e65998ded501e2 Mon Sep 17 00:00:00 2001
From: Frankie0609 <33159972+Frankie0609@users.noreply.github.com>
Date: Fri, 16 Feb 2024 13:46:42 -0500
Subject: [PATCH 04/26] Updated agglomerative clustering

Updated
---
 mwdc/clustering/st_agglomerative.py | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/mwdc/clustering/st_agglomerative.py b/mwdc/clustering/st_agglomerative.py
index 7fc5560..b90c19b 100644
--- a/mwdc/clustering/st_agglomerative.py
+++ b/mwdc/clustering/st_agglomerative.py
@@ -4,7 +4,7 @@
Automatically generated by Colaboratory.

Original file is located at
-    https://colab.research.google.com/drive/1rffzeREHHxYtKe1WDhVz8nvwRHArX_Ob
+    https://colab.research.google.com/drive/10GIlSmNz4WCLYnDaaP65_I4Uxswerz8h
"""

import pandas as pd
@@ -50,14 +50,14 @@ def st_agglomerative(input_path, input, variables,n, K, affinity, linkage, p, tr
    transformation: Boolean that accepts only "True" or "False"

    Output:
-
+
    formed_clusters: 1-D array of cluster labels classifying each data point along the
    time dimension to a cluster label

    A dataframe showing each cluster label and the corresponding cluster size.
A dendrogram showing the steps in clustering - + ''' data = xr.open_dataset(input_path, decode_times=False) @@ -73,11 +73,11 @@ def st_agglomerative(input_path, input, variables,n, K, affinity, linkage, p, tr norm_data = pca1(norm_data,n) else: - + if dim_reduction==False: print("") else: - + if transformation==False: #High dimension reduction @@ -86,7 +86,7 @@ def st_agglomerative(input_path, input, variables,n, K, affinity, linkage, p, tr def plot_dendrogram(model, **kwargs): # Create linkage matrix and then plot the dendrogram - + # create the counts of samples under each node counts = np.zeros(model.children_.shape[0]) n_samples = len(model.labels_) @@ -100,9 +100,9 @@ def plot_dendrogram(model, **kwargs): counts[i] = current_count linkage_matrix = np.column_stack([model.children_, model.distances_,counts]).astype(float) - + # Plot the corresponding dendrogram - + dendrogram(linkage_matrix, **kwargs) #List of algorithms @@ -111,7 +111,7 @@ def plot_dendrogram(model, **kwargs): ('Average Linkage', 'average'), ('Complete Linkage', 'complete'), ('Ward Linkage', 'ward')) - + #distance metrics affinity_metrics = ['cosine', 'euclidean', 'manhattan'] @@ -119,7 +119,7 @@ def plot_dendrogram(model, **kwargs): for alg_name, alg in clustering_algorithms: if alg == 'ward' and metric != 'euclidean': continue model = AgglomerativeClustering(n_clusters=K, affinity=metric, linkage=alg, compute_distances=True) - + #model.fit(data) y_model = model.fit(norm_data) labels = y_model.labels_ @@ -129,11 +129,11 @@ def plot_dendrogram(model, **kwargs): df1['Cluster'].value_counts() print(labels) print("") - + #var = list(data.variables)[3:] - rmse = st_rmse(input_path, var, labels, transformation=True) + rmse = st_rmse_df(input_path, var, labels, transformation=True) print("This is the RMSE evaluation results:") print("") display(rmse) @@ -155,7 +155,7 @@ def plot_dendrogram(model, **kwargs): calinski_harabasz = calinski_harabasz_score(df1, labels) # It is also known as the Variance Ratio Criterion print("") - print("For n_clusters =", K,"The average calinski harabasz score is :", calinski_harabasz) #Higher value of CH index means the clusters are dense and well separated, + print("For n_clusters =", K,"The average calinski harabasz score is :", calinski_harabasz) #Higher value of CH index means the clusters are dense and well separated, #although there is no “acceptable” cut-off value. print("") print("") @@ -164,7 +164,7 @@ def plot_dendrogram(model, **kwargs): # graph size plt.figure(1, figsize = (18 ,12)) - + # plot the top 7 levels of the dendrogram # No more than p levels of the dendrogram tree are displayed. A “level” includes all nodes with p merges from the last merge. plot_dendrogram(model, truncate_mode='level',p = 7, get_leaves=True, orientation='top', labels=None) From bf253eb8d77ca1d7c3dd102970a758b119671d7f Mon Sep 17 00:00:00 2001 From: Francis_Nji <33159972+FrancisNji@users.noreply.github.com> Date: Fri, 22 Mar 2024 19:07:16 -0400 Subject: [PATCH 05/26] Update README.md I have removed this line for anonymity for paper submission: !git clone https://{clasic_access_token}@github.com/big-data-lab-umbc/multivariate-weather-data-clustering.git --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0eccf16..0cb995d 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ There are three ways to Download and Manage the MWDC package: 4 - On Google Colab use the command below. 
```bash -!git clone https://{clasic_access_token}@github.com/big-data-lab-umbc/multivariate-weather-data-clustering.git + ``` \*\* This is how to generat [clasic_access_token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token#creating-a-personal-access-token-classic). From a39e444163876851b6975f75d28ef92c027f3572 Mon Sep 17 00:00:00 2001 From: Francis_Nji <33159972+FrancisNji@users.noreply.github.com> Date: Fri, 22 Mar 2024 19:08:27 -0400 Subject: [PATCH 06/26] Update README.md For anonymity due to paper submission: git clone https://github.com/big-data-lab-umbc/multivariate-weather-data-clustering.git --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0cb995d..c61f9bf 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ There are three ways to Download and Manage the MWDC package: 2 - Use command line: ```bash - git clone https://github.com/big-data-lab-umbc/multivariate-weather-data-clustering.git + ``` \*Because the repository is private the command line method is not Recomended. From 7b664918676185fb2daefcac6f5973a17b32d119 Mon Sep 17 00:00:00 2001 From: OmarFaruqueUMBC <113269969+OmarFaruqueUMBC@users.noreply.github.com> Date: Sat, 23 Mar 2024 01:53:12 -0400 Subject: [PATCH 07/26] Update dsc_model_2.py --- HESC/base_models/dsc_model_2.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/HESC/base_models/dsc_model_2.py b/HESC/base_models/dsc_model_2.py index c97339a..bd9f175 100644 --- a/HESC/base_models/dsc_model_2.py +++ b/HESC/base_models/dsc_model_2.py @@ -216,7 +216,7 @@ def compile(self, optimizer='sgd', loss='kld'): self.model.compile(optimizer=optimizer, loss=['mse', 'kld']) def fit(self, x, y=None, maxiter=2e4, batch_size=256, tol=1e-3, - update_interval=140, save_dir='/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/DSC_2/saved'): + update_interval=140, save_dir='/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/DSC_2/saved'): print('Update interval', update_interval) save_interval = 500 @@ -312,7 +312,7 @@ def data_preprocessing(data_path): rdata_daily_np_array_T_R_nor_R = rdata_daily_np_array_T_R_nor.reshape((rdata_daily_np_array_T_R_nor.shape[0], 1, rdata_daily_np_array.shape[2], rdata_daily_np_array.shape[3], rdata_daily_np_array.shape[0])) return rdata_daily_np_array_T_R_nor, rdata_daily_np_array_T_R_nor_R -path = '/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/' +path = '/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/' data_nor_eval, data_clustering = data_preprocessing('/content/drive/MyDrive/Data/ERA5_meteo_sfc_2021_daily.nc') @@ -331,7 +331,7 @@ def main(): maxiter = 2e4 # Maximum number of times the model traning will iterate update_interval = 50 # After each interval the clustering weights will be modified tol = 0.0000001 # If there is a cluster change more than this tollerance the model training will run - save_dir = '/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/DSC_2/saved' # The trained model will be stored here + save_dir = '/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/DSC_2/saved' # The trained model will be stored here # load dataset x = data_clustering # Input dataset of the transformed daily data @@ -631,14 +631,14 @@ def to_binary_matrix(self, y_clusters): 
unique_labels = np.unique(np.concatenate(clustering_models)) print(sim_matrixx) -np.save('/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/DSC_ens_co_occurrence_matrix.npy', sim_matrixx) +np.save('/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/DSC_ens_co_occurrence_matrix.npy', sim_matrixx) #print(norm_sim_matrix) import numpy as geek data_nor_eval = data_nor -#sim_matrixx = geek.load('/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/DEC_co_occurrence_matrix.npy') +#sim_matrixx = geek.load('/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/DEC_co_occurrence_matrix.npy') from sklearn.cluster import SpectralClustering spec_clt = SpectralClustering(n_clusters=7, affinity='precomputed', @@ -647,7 +647,7 @@ def to_binary_matrix(self, y_clusters): -# result = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_Ensemble1_032.pkl", "rb")) +# result = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_Ensemble1_032.pkl", "rb")) """# **Evaluation Metrics**""" From 87c59d2f536ba181e006c000ed0b6ef4fb4f3d76 Mon Sep 17 00:00:00 2001 From: OmarFaruqueUMBC <113269969+OmarFaruqueUMBC@users.noreply.github.com> Date: Sat, 23 Mar 2024 01:54:37 -0400 Subject: [PATCH 08/26] Update dsc_model_2.py --- HESC/base_models/dsc_model_2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HESC/base_models/dsc_model_2.py b/HESC/base_models/dsc_model_2.py index bd9f175..8ebc89d 100644 --- a/HESC/base_models/dsc_model_2.py +++ b/HESC/base_models/dsc_model_2.py @@ -217,7 +217,7 @@ def compile(self, optimizer='sgd', loss='kld'): def fit(self, x, y=None, maxiter=2e4, batch_size=256, tol=1e-3, update_interval=140, save_dir='/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/DSC_2/saved'): - + print('Update interval', update_interval) save_interval = 500 print('Save interval', save_interval) From 8cdb955efc06cd62be1aa61269b5bc565dcb82ea Mon Sep 17 00:00:00 2001 From: OmarFaruqueUMBC <113269969+OmarFaruqueUMBC@users.noreply.github.com> Date: Sat, 23 Mar 2024 01:55:36 -0400 Subject: [PATCH 09/26] Update dtc_clustering_model.py --- HESC/base_models/dtc_clustering_model.py | 40 ++++++++++++------------ 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/HESC/base_models/dtc_clustering_model.py b/HESC/base_models/dtc_clustering_model.py index 5d4f9b7..562a975 100644 --- a/HESC/base_models/dtc_clustering_model.py +++ b/HESC/base_models/dtc_clustering_model.py @@ -511,7 +511,7 @@ def pretrain(self, X, optimizer='adam', epochs=10, batch_size=64, - save_dir='/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/DTC-Clustering-Model', + save_dir='/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/DTC-Clustering-Model', verbose=1): """ Pre-train the autoencoder using only MSE reconstruction loss @@ -543,7 +543,7 @@ def fit(self, X_train, y_train=None, tol=0.001, patience=5, finetune_heatmap_at_epoch=8, - save_dir='/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/DTC-Clustering-Model'): + save_dir='/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/DTC-Clustering-Model'): """ Training procedure # Arguments @@ -748,7 +748,7 @@ def 
data_preprocessing(data_path): rdata_daily_np_array_T_R_nor = np.float32(rdata_daily_np_array_T_R_nor) # convert the data type to float32, otherwise the loass will be out-of-limit return rdata_daily_np_array_T_R_nor -path = '/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/' +path = '/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/' data_path = ('/content/drive/MyDrive/Data/ERA5_meteo_sfc_2021_daily.nc') nor_data = data_preprocessing(data_path) @@ -788,7 +788,7 @@ class Config(object): finetune_heatmap_at_epoch = 8 #, type=int, help='epoch where heatmap finetuning starts') initial_heatmap_loss_weight = 0.1 #, type=float, help='initial weight of heatmap loss vs clustering loss') final_heatmap_loss_weight = 0.9 #, type=float, help='final weight of heatmap loss vs clustering loss (heatmap finetuning)') - save_dir = '/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/DTC-Clustering-Model' + save_dir = '/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/DTC-Clustering-Model' @@ -1150,14 +1150,14 @@ def to_binary_matrix(self, y_clusters): unique_labels = np.unique(np.concatenate(clustering_models)) print(sim_matrixx) -np.save('/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/DSC_ens_co_occurrence_matrix.npy', sim_matrixx) +np.save('/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/DSC_ens_co_occurrence_matrix.npy', sim_matrixx) #print(norm_sim_matrix) import numpy as geek data_nor_eval = data_nor -#sim_matrixx = geek.load('/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/DEC_co_occurrence_matrix.npy') +#sim_matrixx = geek.load('/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/DEC_co_occurrence_matrix.npy') from sklearn.cluster import SpectralClustering spec_clt = SpectralClustering(n_clusters=7, affinity='precomputed', @@ -1166,7 +1166,7 @@ def to_binary_matrix(self, y_clusters): pickle.dump(final_labels, open(path + 'DSC_fin_ens_' + str(silh) + '.pkl', "wb")) -# result = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_Ensemble1_032.pkl", "rb")) +# result = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_Ensemble1_032.pkl", "rb")) """# **Evaluation Metrics**""" @@ -1251,12 +1251,12 @@ def best_clustering(n): -# result = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_295.pkl", "rb")) -# result_1 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_295.pkl", "rb")) -# result_2 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_3367.pkl", "rb")) -# result_3 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_3124.pkl", "rb")) -# result_4 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_3226.pkl", "rb")) -# result_5 = 
pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_02014.pkl", "rb")) +# result = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_295.pkl", "rb")) +# result_1 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_295.pkl", "rb")) +# result_2 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_3367.pkl", "rb")) +# result_3 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_3124.pkl", "rb")) +# result_4 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_3226.pkl", "rb")) +# result_5 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_02014.pkl", "rb")) clusters_list = [ result_1, result_2, result_3, result_4,result_5] @@ -1297,14 +1297,14 @@ def to_binary_matrix(self, y_clusters): unique_labels = np.unique(np.concatenate(clustering_models)) print(sim_matrixx) -np.save('/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/DTC_ens_co_occurrence_matrix.npy', sim_matrixx) +np.save('/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/DTC_ens_co_occurrence_matrix.npy', sim_matrixx) #print(norm_sim_matrix) import numpy as geek data_nor_eval = data_nor -#sim_matrixx = geek.load('/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/DEC_co_occurrence_matrix.npy') +#sim_matrixx = geek.load('/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/DEC_co_occurrence_matrix.npy') from sklearn.cluster import SpectralClustering spec_clt = SpectralClustering(n_clusters=7, affinity='precomputed', @@ -1363,9 +1363,9 @@ def best_clustering5(n): -pickle.dump(final_labels, open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_hom_ens_034.pkl", "wb")) +pickle.dump(final_labels, open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_hom_ens_034.pkl", "wb")) -#cluster_result = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/test_clustering_results30.pkl", "rb")) +#cluster_result = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/test_clustering_results30.pkl", "rb")) @@ -1588,7 +1588,7 @@ def get_siz(self, x): df1['clusterid'] = clusters #df1["cluster"] = cluster.labels_ df1['clusterid'].value_counts() -df1.to_csv("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/DEC_clustering.csv") +df1.to_csv("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/DEC_clustering.csv") df1 df1.groupby('clusterid').count() @@ -1603,9 +1603,9 @@ def get_siz(self, x): import pickle -pickle.dump(clusters, open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_ensemble2_028.pkl", "wb")) +pickle.dump(clusters, open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_ensemble2_028.pkl", 
"wb")) -#cluster_result = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_ensemble2_028.pkl", "rb")) +#cluster_result = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_ensemble2_028.pkl", "rb")) """**Evaluations**""" From eadc32f9ca5aada33c64f4d47bf5fd78ca0b9f66 Mon Sep 17 00:00:00 2001 From: OmarFaruqueUMBC <113269969+OmarFaruqueUMBC@users.noreply.github.com> Date: Sat, 23 Mar 2024 01:56:08 -0400 Subject: [PATCH 10/26] Update kmeans_ensemble.py --- HESC/base_models/kmeans_ensemble.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/HESC/base_models/kmeans_ensemble.py b/HESC/base_models/kmeans_ensemble.py index c6b8fd6..0a68566 100644 --- a/HESC/base_models/kmeans_ensemble.py +++ b/HESC/base_models/kmeans_ensemble.py @@ -117,7 +117,7 @@ def data_preprocessing(data_path): rdata_daily_np_array_T_R_nor_R = rdata_daily_np_array_T_R_nor.reshape((rdata_daily_np_array_T_R_nor.shape[0], rdata_daily_np_array.shape[2], rdata_daily_np_array.shape[3], rdata_daily_np_array.shape[0])) return rdata_daily_np_array_T_R_nor, rdata_daily_np_array_T_R_nor_R -path = '/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/' +path = '/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/' data_nor_eval, data_clustering = data_preprocessing('/content/drive/MyDrive/Data/ERA5_meteo_sfc_2021_daily.nc') @@ -317,7 +317,7 @@ def nor_get_clusters_and_centers(input,formed_clusters): print(silh) print(u,indices) -#pickle.dump(result_3, open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/KMeans_0.3125.pkl", "wb")) +#pickle.dump(result_3, open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/KMeans_0.3125.pkl", "wb")) silhouette, result_4 = best_clustering(20) @@ -327,7 +327,7 @@ def nor_get_clusters_and_centers(input,formed_clusters): print(silh) print(u,indices) -#pickle.dump(result_4, open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/KMeans_0.3389.pkl", "wb")) +#pickle.dump(result_4, open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/KMeans_0.3389.pkl", "wb")) silhouette, result_5 = best_clustering(20) @@ -378,14 +378,14 @@ def to_binary_matrix(self, y_clusters): unique_labels = np.unique(np.concatenate(clustering_models)) print(sim_matrixx) -np.save('/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/KMeans_co_occurrence_matrix.npy', sim_matrixx) +np.save('/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/KMeans_co_occurrence_matrix.npy', sim_matrixx) #print(norm_sim_matrix) import numpy as geek data_nor = data_nor_eval -#sim_matrixx = geek.load('/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/DEC_co_occurrence_matrix.npy') +#sim_matrixx = geek.load('/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/DEC_co_occurrence_matrix.npy') from sklearn.cluster import SpectralClustering spec_clt = SpectralClustering(n_clusters=7, affinity='precomputed', @@ -398,9 +398,9 @@ def to_binary_matrix(self, y_clusters): -#pickle.dump(final_labels, 
open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_Ensemble1_032.pkl", "wb")) +#pickle.dump(final_labels, open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_Ensemble1_032.pkl", "wb")) -result = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_Ensemble1_032.pkl", "rb")) +result = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_Ensemble1_032.pkl", "rb")) data_nor_eval = data_nor @@ -661,7 +661,7 @@ def to_binary_matrix(self, y_clusters): # print(km_sim_matrixx) -#np.save('/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/KMeans_ens_co_occurrence_matrix.npy', sim_matrixx) +#np.save('/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/KMeans_ens_co_occurrence_matrix.npy', sim_matrixx) #print(norm_sim_matrix) def best_clustering5(n): @@ -1008,7 +1008,7 @@ def get_siz(self, x): df1['clusterid'] = clusters #df1["cluster"] = cluster.labels_ df1['clusterid'].value_counts() -df1.to_csv("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/K-Means.csv") +df1.to_csv("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/K-Means.csv") df1 df1.groupby('clusterid').count() @@ -1023,9 +1023,9 @@ def get_siz(self, x): import pickle -#pickle.dump(clusters, open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/KMeans_Ensemble1_029.pkl", "wb")) +#pickle.dump(clusters, open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/KMeans_Ensemble1_029.pkl", "wb")) -#cluster_result = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/KMeans_Ensemble1_029.pkl", "rb")) +#cluster_result = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/KMeans_Ensemble1_029.pkl", "rb")) base_clustering = silhouette_score1(data_nor_eval, clusters) @@ -1080,4 +1080,4 @@ def get_siz(self, x): # /content/DAC_models/DAC_model_final_1700598391.ckpt.data-00000-of-00001 -# /content/DAC_models/DAC_model_final_1700598391.ckpt.index \ No newline at end of file +# /content/DAC_models/DAC_model_final_1700598391.ckpt.index From 2f4482d6be02e194b851198b6d8df625a5a3ae95 Mon Sep 17 00:00:00 2001 From: OmarFaruqueUMBC <113269969+OmarFaruqueUMBC@users.noreply.github.com> Date: Sat, 23 Mar 2024 01:58:29 -0400 Subject: [PATCH 11/26] Update model_selection.py --- HESC/base_models/model_selection.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/HESC/base_models/model_selection.py b/HESC/base_models/model_selection.py index e0b8593..9292e1d 100644 --- a/HESC/base_models/model_selection.py +++ b/HESC/base_models/model_selection.py @@ -3,8 +3,7 @@ Automatically generated by Colaboratory. 
-Original file is located at - https://colab.research.google.com/drive/1-eUuh7ZpQUHuUOmero5lyMZ8s2b4uYuA + # **Model Selection** @@ -991,4 +990,4 @@ def best_clustering(n): print("Variance is ", avg_var(data_nor, result_gmm)) -print("Inter-cluster distance ", avg_inter_dist(data_nor, result_gmm)) \ No newline at end of file +print("Inter-cluster distance ", avg_inter_dist(data_nor, result_gmm)) From b006e8c8d51985deb60c368b0acf604ec77514cd Mon Sep 17 00:00:00 2001 From: OmarFaruqueUMBC <113269969+OmarFaruqueUMBC@users.noreply.github.com> Date: Sat, 23 Mar 2024 01:58:58 -0400 Subject: [PATCH 12/26] Update kmeans_ensemble.py --- HESC/base_models/kmeans_ensemble.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/HESC/base_models/kmeans_ensemble.py b/HESC/base_models/kmeans_ensemble.py index 0a68566..6bc2b0e 100644 --- a/HESC/base_models/kmeans_ensemble.py +++ b/HESC/base_models/kmeans_ensemble.py @@ -1,10 +1,6 @@ # -*- coding: utf-8 -*- """KMeans_ensemble.ipynb -Automatically generated by Colaboratory. - -Original file is located at - https://colab.research.google.com/drive/1q4Opc27RF8qYi_IUzcNL-BKXNLnfIipH [Source](https://www.kaggle.com/code/thedevastator/how-to-ensemble-clustering-algorithms-updated) """ From d64e3ae9f47fd3df9f02e1233c4e8b11cfdd01d2 Mon Sep 17 00:00:00 2001 From: OmarFaruqueUMBC <113269969+OmarFaruqueUMBC@users.noreply.github.com> Date: Sat, 23 Mar 2024 02:01:16 -0400 Subject: [PATCH 13/26] Update 41_dac_multivariate_data_11_18_23.py --- .../41_dac_multivariate_data_11_18_23.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/HESC/base_models/41_dac_multivariate_data_11_18_23.py b/HESC/base_models/41_dac_multivariate_data_11_18_23.py index ed7d12e..d1d6092 100644 --- a/HESC/base_models/41_dac_multivariate_data_11_18_23.py +++ b/HESC/base_models/41_dac_multivariate_data_11_18_23.py @@ -1,11 +1,3 @@ -# -*- coding: utf-8 -*- -"""41_DAC_Multivariate_Data_11-18-23.ipynb - -Automatically generated by Colaboratory. 
- -Original file is located at - https://colab.research.google.com/drive/1rWucbhqY1LHqAUw8avzhNP9hm7d_gaJ4 -""" @@ -126,7 +118,7 @@ def data_preprocessing(data_path): rdata_daily_np_array_T_R_nor_R = rdata_daily_np_array_T_R_nor.reshape((rdata_daily_np_array_T_R_nor.shape[0], rdata_daily_np_array.shape[2], rdata_daily_np_array.shape[3], rdata_daily_np_array.shape[0])) return rdata_daily_np_array_T_R_nor, rdata_daily_np_array_T_R_nor_R -path = '/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/' +path = '/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/' data_nor_eval, data_clustering = data_preprocessing('/content/drive/MyDrive/Data/ERA5_meteo_sfc_2021_daily.nc') @@ -316,7 +308,7 @@ def train(): model_name = 'DAC_model_final_' + str(round(time()))+ '.ckpt' #save_path = saver.save(sess, 'DAC_models/' + model_name) - save_path = saver.save(sess, '/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/DAC/res/' + model_name) + save_path = saver.save(sess, '/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/DAC/res/' + model_name) print("Model saved in file: %s" % save_path) print("Total epochs: %d" % epoch) @@ -579,7 +571,7 @@ def to_binary_matrix(self, y_clusters): print(sim_matrixx) print(sim_matrixx.shape) print(unique_labels) -#np.save('/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/fin_ens_co_occurrence_matrix.npy', sim_matrixx) +#np.save('/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/fin_ens_co_occurrence_matrix.npy', sim_matrixx) #print(norm_sim_matrix) # To normalize a matrix From 9184b455ce7c380d2a0b2e179ad0bc77c03a254a Mon Sep 17 00:00:00 2001 From: OmarFaruqueUMBC <113269969+OmarFaruqueUMBC@users.noreply.github.com> Date: Sat, 23 Mar 2024 02:01:29 -0400 Subject: [PATCH 14/26] Update dec_clustering_model.py --- HESC/base_models/dec_clustering_model.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/HESC/base_models/dec_clustering_model.py b/HESC/base_models/dec_clustering_model.py index d5bf6bc..cf7ccfe 100644 --- a/HESC/base_models/dec_clustering_model.py +++ b/HESC/base_models/dec_clustering_model.py @@ -1,11 +1,4 @@ -# -*- coding: utf-8 -*- -"""DEC-Clustering-Model.ipynb -Automatically generated by Colaboratory. - -Original file is located at - https://colab.research.google.com/drive/1cUcuKqCzQNeei3W-uAW5TDa_foLb14_f -""" From 4e9e5caba52beff45c3785deb0de0a3b43c45c76 Mon Sep 17 00:00:00 2001 From: OmarFaruqueUMBC <113269969+OmarFaruqueUMBC@users.noreply.github.com> Date: Sat, 23 Mar 2024 02:01:40 -0400 Subject: [PATCH 15/26] Update dsc_model_2.py --- HESC/base_models/dsc_model_2.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/HESC/base_models/dsc_model_2.py b/HESC/base_models/dsc_model_2.py index 8ebc89d..6967352 100644 --- a/HESC/base_models/dsc_model_2.py +++ b/HESC/base_models/dsc_model_2.py @@ -1,11 +1,4 @@ -# -*- coding: utf-8 -*- -"""DSC_Model_2.ipynb -Automatically generated by Colaboratory. 
- -Original file is located at - https://colab.research.google.com/drive/1jtLQxGTtiRrhZ5gxwCR6dWYmppXZ8ARW -""" from google.colab import drive drive.mount('/content/drive') From 2b899ad7bc1a55140198b85a25a541f3227f20bd Mon Sep 17 00:00:00 2001 From: OmarFaruqueUMBC <113269969+OmarFaruqueUMBC@users.noreply.github.com> Date: Sat, 23 Mar 2024 02:01:53 -0400 Subject: [PATCH 16/26] Update dtc_clustering_model.py --- HESC/base_models/dtc_clustering_model.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/HESC/base_models/dtc_clustering_model.py b/HESC/base_models/dtc_clustering_model.py index 562a975..73409f6 100644 --- a/HESC/base_models/dtc_clustering_model.py +++ b/HESC/base_models/dtc_clustering_model.py @@ -1,11 +1,3 @@ -# -*- coding: utf-8 -*- -"""DTC-Clustering-Model.ipynb - -Automatically generated by Colaboratory. - -Original file is located at - https://colab.research.google.com/drive/1KsUhZSXy92wUviCSKDas0mTogDkkhYb3 -""" from google.colab import drive drive.mount('/content/drive') From 704b90dab18b472652ccb10c34bd1a819105cbcb Mon Sep 17 00:00:00 2001 From: OmarFaruqueUMBC <113269969+OmarFaruqueUMBC@users.noreply.github.com> Date: Sat, 23 Mar 2024 02:03:55 -0400 Subject: [PATCH 17/26] Update cluster_ensembles.py --- HESC/baseline_models/cluster_ensembles.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/HESC/baseline_models/cluster_ensembles.py b/HESC/baseline_models/cluster_ensembles.py index 8a0d6fd..bc24b54 100644 --- a/HESC/baseline_models/cluster_ensembles.py +++ b/HESC/baseline_models/cluster_ensembles.py @@ -1,11 +1,4 @@ -# -*- coding: utf-8 -*- -"""Cluster_Ensembles.ipynb -Automatically generated by Colaboratory. - -Original file is located at - https://colab.research.google.com/drive/1rAsFJZG3p6lrII47rE2w243oFKizap_c -""" @@ -18,7 +11,7 @@ !pip install "dask[dataframe]" !pip install netCDF4 -!git clone https://""@github.com/big-data-lab-umbc/multivariate-weather-data-clustering.git +!git clone https://""@github.com/ # Commented out IPython magic to ensure Python compatibility. # %cd multivariate-weather-data-clustering From 8207c27a5782732e266d1a67a09e0c10c9967aa9 Mon Sep 17 00:00:00 2001 From: OmarFaruqueUMBC <113269969+OmarFaruqueUMBC@users.noreply.github.com> Date: Sat, 23 Mar 2024 02:05:14 -0400 Subject: [PATCH 18/26] Update ensemble_sc.py --- HESC/baseline_models/ensemble_sc.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/HESC/baseline_models/ensemble_sc.py b/HESC/baseline_models/ensemble_sc.py index 3420bf8..a17be15 100644 --- a/HESC/baseline_models/ensemble_sc.py +++ b/HESC/baseline_models/ensemble_sc.py @@ -1,10 +1,6 @@ # -*- coding: utf-8 -*- """Ensemble_SC.ipynb -Automatically generated by Colaboratory. - -Original file is located at - https://colab.research.google.com/drive/1IUxgFKCFBNwc-EN7mivbl9gGyXnyaTGE Paper: Ensemble Learning for Spectral Clustering Implementation of the paper “Li, H., Ye, X., Imakura, A. and Sakurai, T., 2020, November. Ensemble learning for spectral clustering. In 2020 IEEE International Conference on Data Mining (ICDM) (pp. 1094-1099). IEEE” In Python. 
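The `ensemble_sc.py` baseline above follows Li et al. (ICDM 2020). For orientation, here is a minimal, hedged sketch of the co-association idea these patches repeatedly build (a `sim_matrixx` consumed by `SpectralClustering(affinity='precomputed')`); every name below is illustrative and not the repository's actual API:

```python
# Illustrative only: a minimal co-association ensemble sketch, not the code
# from ensemble_sc.py. All function and variable names here are made up.
import numpy as np
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.datasets import make_blobs

def ensemble_spectral(X, n_clusters=7, n_members=10, seed=0):
    rng = np.random.RandomState(seed)
    n = X.shape[0]
    co = np.zeros((n, n))
    for _ in range(n_members):
        # Each ensemble member is a cheap base clustering with its own seed.
        labels = KMeans(n_clusters=n_clusters, n_init=5,
                        random_state=rng.randint(1 << 30)).fit_predict(X)
        co += (labels[:, None] == labels[None, :]).astype(float)
    co /= n_members  # co[i, j] = fraction of members that co-cluster i and j
    # Consensus step: spectral clustering on the precomputed similarity,
    # mirroring the SpectralClustering(affinity='precomputed') calls above.
    return SpectralClustering(n_clusters=n_clusters, affinity="precomputed",
                              random_state=seed).fit_predict(co)

X, _ = make_blobs(n_samples=200, centers=7, random_state=0)
print(np.bincount(ensemble_spectral(X)))
```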
From fb445eb8a120d92e659f51b57b6da9b05fe60a60 Mon Sep 17 00:00:00 2001 From: OmarFaruqueUMBC <113269969+OmarFaruqueUMBC@users.noreply.github.com> Date: Sat, 23 Mar 2024 02:06:16 -0400 Subject: [PATCH 19/26] Update parea_multi_view_ensemble_clustering.py --- .../parea_multi_view_ensemble_clustering.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/HESC/baseline_models/parea_multi_view_ensemble_clustering.py b/HESC/baseline_models/parea_multi_view_ensemble_clustering.py index 5a5f921..7592e67 100644 --- a/HESC/baseline_models/parea_multi_view_ensemble_clustering.py +++ b/HESC/baseline_models/parea_multi_view_ensemble_clustering.py @@ -1,10 +1,6 @@ # -*- coding: utf-8 -*- """Parea_multi_view_ensemble_clustering.ipynb -Automatically generated by Colaboratory. - -Original file is located at - https://colab.research.google.com/drive/1IrTNUG_vH6uIUgOTDzGhNWtmIwhIlSAr Paper: Parea: multi-view ensemble clustering for cancer subtype discovery @@ -25,7 +21,7 @@ !pip install "dask[dataframe]" !pip install netCDF4 -!git clone https://""@github.com/big-data-lab-umbc/multivariate-weather-data-clustering.git +!git clone https://""@github.com/ # Commented out IPython magic to ensure Python compatibility. # %cd multivariate-weather-data-clustering @@ -359,4 +355,4 @@ def best_clustering(n): from sklearn.metrics import calinski_harabasz_score ch = calinski_harabasz_score(data_nor, result_parea) -print("Davies-Bouldin score is ", ch) \ No newline at end of file +print("Davies-Bouldin score is ", ch) From b4cd1340584fc300956bbd3fcb6efa79115dd4a6 Mon Sep 17 00:00:00 2001 From: OmarFaruqueUMBC <113269969+OmarFaruqueUMBC@users.noreply.github.com> Date: Sat, 23 Mar 2024 02:07:47 -0400 Subject: [PATCH 20/26] Update all_models.py --- HESC/all_models.py | 72 ++++++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 38 deletions(-) diff --git a/HESC/all_models.py b/HESC/all_models.py index 454d7c1..b9b6d8a 100644 --- a/HESC/all_models.py +++ b/HESC/all_models.py @@ -1,10 +1,6 @@ # -*- coding: utf-8 -*- """all_models.ipynb -Automatically generated by Colaboratory. 
- -Original file is located at - https://colab.research.google.com/drive/1Vw2_E0x8GozGufXJghJ28Hcg8KsUOrS7 """ from google.colab import drive @@ -230,7 +226,7 @@ def avg_inter_dist(norm_data, clustering_results): data var = list(data.variables)[3:] -path = '/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/' +path = '/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/' data_nor, data_clustering = data_preprocessing('/content/drive/MyDrive/Data/ERA5_meteo_sfc_2021_daily.nc') @@ -238,11 +234,11 @@ def avg_inter_dist(norm_data, clustering_results): data_nor_nor = data_nor -DSC_cluster_result = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_Ensemble1_032.pkl", "rb")) +DSC_cluster_result = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_Ensemble1_032.pkl", "rb")) -DSC_cluster_result1 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_hom_ens_0.34805576652439557.pkl", "rb")) +DSC_cluster_result1 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_hom_ens_0.34805576652439557.pkl", "rb")) -DSC_cluster_result2 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_hom_ens_0.35190532062602214.pkl", "rb")) +DSC_cluster_result2 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_hom_ens_0.35190532062602214.pkl", "rb")) DSC_cluster_result @@ -271,13 +267,13 @@ def avg_inter_dist(norm_data, clustering_results): print("Inter-cluster distance ", avg_inter_dist(data_nor, DSC_cluster_result)) -DEC_Cluster_results1 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_Ensemble1_034.pkl", "rb")) +DEC_Cluster_results1 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_Ensemble1_034.pkl", "rb")) -DEC_Cluster_results2 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_hom_ens_0.30096328.pkl", "rb")) +DEC_Cluster_results2 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_hom_ens_0.30096328.pkl", "rb")) -DEC_Cluster_results3 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_hom_ens_0.30640745.pkl", "rb")) +DEC_Cluster_results3 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_hom_ens_0.30640745.pkl", "rb")) -DEC_Cluster_results4 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_hom_ens_03315.pkl", "rb")) +DEC_Cluster_results4 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_hom_ens_03315.pkl", "rb")) DEC_Cluster_results1 @@ -287,15 +283,15 @@ def avg_inter_dist(norm_data, clustering_results): print(silh) print(u,indices) -DTC_Cluster_results1 = 
pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_Ensemble1_034.pkl", "rb")) +DTC_Cluster_results1 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_Ensemble1_034.pkl", "rb")) -DTC_Cluster_results2 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_3367.pkl", "rb")) +DTC_Cluster_results2 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_3367.pkl", "rb")) -DTC_Cluster_results3 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_hom_ens_034.pkl", "rb")) +DTC_Cluster_results3 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_hom_ens_034.pkl", "rb")) -DTC_Cluster_results4 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_3226.pkl", "rb")) +DTC_Cluster_results4 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_3226.pkl", "rb")) -DTC_Cluster_results5 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_3124.pkl", "rb")) +DTC_Cluster_results5 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_3124.pkl", "rb")) DTC_Cluster_results1 @@ -305,11 +301,11 @@ def avg_inter_dist(norm_data, clustering_results): print(silh) print(u,indices) -KMeans_Cluster_results1 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/KMeans_0.3389.pkl", "rb")) +KMeans_Cluster_results1 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/KMeans_0.3389.pkl", "rb")) -KMeans_Cluster_results2 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/KMeans_0.313.pkl", "rb")) +KMeans_Cluster_results2 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/KMeans_0.313.pkl", "rb")) -KMeans_Cluster_results3 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/KMeans_0.3125.pkl", "rb")) +KMeans_Cluster_results3 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/KMeans_0.3125.pkl", "rb")) KMeans_Cluster_results1 @@ -378,7 +374,7 @@ def to_binary_matrix(self, y_clusters): print(sim_matrixx) print(sim_matrixx.shape) print(unique_labels) -#np.save('/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/fin_ens_co_occurrence_matrix.npy', sim_matrixx) +#np.save('/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/fin_ens_co_occurrence_matrix.npy', sim_matrixx) #print(norm_sim_matrix) # To normalize a matrix @@ -434,11 +430,11 @@ def to_binary_matrix(self, y_clusters): from sklearn.metrics import pairwise_distances -dsc_ens = 
pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_fin_ens_0.32257442534935293.pkl", "rb")) -dec_ens = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_fin_ens_0.3135124.pkl", "rb")) -km_ens = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/kmeans_fin_ens_0.32412578566580286.pkl", "rb")) +dsc_ens = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_fin_ens_0.32257442534935293.pkl", "rb")) +dec_ens = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_fin_ens_0.3135124.pkl", "rb")) +km_ens = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/kmeans_fin_ens_0.32412578566580286.pkl", "rb")) -#dtc_ens = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_fin_ens_0.3135124.pkl", "rb")) +#dtc_ens = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_fin_ens_0.3135124.pkl", "rb")) @@ -469,7 +465,7 @@ def to_binary_matrix(self, y_clusters): print(sim_matrixx) print(sim_matrixx.shape) print(unique_labels) -np.save('/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/fin_ens_co_occurrence_matrix.npy', sim_matrixx) +np.save('/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/fin_ens_co_occurrence_matrix.npy', sim_matrixx) #print(norm_sim_matrix) @@ -511,7 +507,7 @@ def to_binary_matrix(self, y_clusters): from sklearn.decomposition import NMF -sim_matrixx = np.load('/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/fin_ens_co_occurrence_matrix.npy') +sim_matrixx = np.load('/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Non-negative Matrix Factorization/fin_ens_co_occurrence_matrix.npy') #sim_matrixx @@ -722,7 +718,7 @@ def hbgf(base_clusters, nclass): print(silh) print(u,indices) -#path = '/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/' +#path = '/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/' pickle.dump(label_hbgf, open(path + 'HBGF_ens_' + str(silh) + '.pkl', "wb")) silh = silhouette_score1(data_nor, label_hbgf) @@ -793,7 +789,7 @@ def best_clustering(n): """# **HESC_hbgp**""" -hbgp = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/HBGF_enss_0.30932982840030593.pkl", "rb")) +hbgp = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/HBGF_enss_0.30932982840030593.pkl", "rb")) silh = silhouette_score1(data_nor_eval, hbgp) u,indices = np.unique(hbgp,return_counts = True) # sc=0.3412 st 64 @@ -815,9 +811,9 @@ def best_clustering(n): -co_occ = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/co-occ_ens_0.35889223651046626.pkl", "rb")) +co_occ = 
pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/co-occ_ens_0.35889223651046626.pkl", "rb")) -nnmf = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/nmf_ens0.35258519039733466.pkl", "rb")) +nnmf = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/nmf_ens0.35258519039733466.pkl", "rb")) NUM_alg = 100 occurrence_threshold = 0.2 @@ -839,7 +835,7 @@ def best_clustering(n): print(final_matrix.shape) print(unique_labels) -#final_labels1 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/co-occ_ens_0.35936684974088645.pkl", "rb")) +#final_labels1 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/co-occ_ens_0.35936684974088645.pkl", "rb")) ch_index1s2 = calinski_harabasz_score(data_nor, nnmf) print(ch_index1s2) @@ -892,14 +888,14 @@ def best_clustering(n): -final_labels1 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/co-occ_ens_0.35936684974088645.pkl", "rb")) +final_labels1 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/co-occ_ens_0.35936684974088645.pkl", "rb")) ch_index12 = calinski_harabasz_score(data_nor, final_labels1) print(ch_index12) """# **DTC**""" -final_labels2 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_22.pkl", "rb")) +final_labels2 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_22.pkl", "rb")) ch_index2 = calinski_harabasz_score(data_nor, final_labels2) @@ -907,16 +903,16 @@ def best_clustering(n): """# **DSC**""" -final_labels3 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_hom_ens_0.35190532062602214.pkl", "rb")) -#/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_hom_ens_0.32763868041010935.pkl +final_labels3 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_hom_ens_0.35190532062602214.pkl", "rb")) +#/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_hom_ens_0.32763868041010935.pkl ch_index3 = calinski_harabasz_score(data_nor, final_labels3) print(ch_index3) """# **DEC**""" -final_labels4 = pickle.load(open("/content/drive/MyDrive/Jianwu-Wang-Francis-Nji/Papers-by-Francis/Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_hom_ens_03315.pkl", "rb")) +final_labels4 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_hom_ens_03315.pkl", "rb")) ch_index4 = calinski_harabasz_score(data_nor, final_labels4) -print(ch_index4) \ No newline at end of file +print(ch_index4) From 001eafb5b4aa9fec5dad180752039c2839c3263e Mon Sep 17 00:00:00 2001 From: Francis_Nji <33159972+FrancisNji@users.noreply.github.com> Date: Sat, 23 Mar 2024 02:18:08 -0400 Subject: [PATCH 21/26] Update setup.py .................. 
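The Calinski-Harabasz comparisons above (nnmf, co-occ, DTC, DSC, DEC) all follow the same load-score-print pattern, with the NMF consensus producing labels like nmf_ens*.pkl. A self-contained sketch of both steps is below; the synthetic partitions, the cluster count of 4, and every name are illustrative stand-ins, not the repository's data or configuration:

import numpy as np
from sklearn.decomposition import NMF
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score

rng = np.random.default_rng(0)
truth = rng.integers(0, 4, 365)                      # hidden stand-in partition
X = rng.normal(size=(365, 20)) + 3 * truth[:, None]  # separable stand-in for data_nor
# five noisy copies of the truth play the role of base clusterings
base = [np.where(rng.random(365) < 0.2, rng.integers(0, 4, 365), truth)
        for _ in range(5)]

# co-occurrence matrix: fraction of base partitions that co-cluster each pair
S = np.mean([(p[:, None] == p[None, :]).astype(float) for p in base], axis=0)

# factorize the non-negative matrix; each row's dominant factor is its consensus label
W = NMF(n_components=4, init="nndsvda", max_iter=500, random_state=0).fit_transform(S)
consensus = W.argmax(axis=1)

print("CH:", calinski_harabasz_score(X, consensus))
print("DB:", davies_bouldin_score(X, consensus))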
--- setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index e0feabe..641b837 100644 --- a/setup.py +++ b/setup.py @@ -13,12 +13,12 @@ setup( name="mwdc", version="1.5.0", - author="Jianwu Wang, Francis Nji, Omar Faruque, Rohan Salvi, Mostafa Cham", + author=".......................", author_email="", - url="https://github.com/big-data-lab-umbc/multivariate-weather-data-clustering.git", + url=".................................................", install_requires=list(install_requires), packages=find_packages( exclude=("example*", "archive*", "Benchmark*")), long_description=long_description, -) \ No newline at end of file +) From 638cd9fc453d0102a0bee3fd36afa9652ad03498 Mon Sep 17 00:00:00 2001 From: Francis_Nji <33159972+FrancisNji@users.noreply.github.com> Date: Sat, 23 Mar 2024 02:32:53 -0400 Subject: [PATCH 22/26] Update image_generation.py --- archive/image_generation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archive/image_generation.py b/archive/image_generation.py index d4fcbee..93411dd 100644 --- a/archive/image_generation.py +++ b/archive/image_generation.py @@ -77,7 +77,7 @@ def image_saving(data_variable, saving_path): if __name__ == '__main__': - data = xr.open_dataset("/Users/jianwu/Data/ECRP_ERA5/ERA5_sample_hourly_20200201-20200331.nc") + data = xr.open_dataset("/Users/////Data/ECRP_ERA5/ERA5_sample_hourly_20200201-20200331.nc") print(data.data_vars) image_saving(data['v10'], "/Users/jianwu/Data/ECRP_ERA5/") #for data_key in data.data_vars: From 86450dac9881f2114d1b26240f2d430b1c0b577b Mon Sep 17 00:00:00 2001 From: Francis_Nji <33159972+FrancisNji@users.noreply.github.com> Date: Sat, 23 Mar 2024 02:34:33 -0400 Subject: [PATCH 23/26] Update feature_extraction_pca_clustering.py ....... --- archive/feature_extraction_pca_clustering.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/archive/feature_extraction_pca_clustering.py b/archive/feature_extraction_pca_clustering.py index f7a1521..5933b93 100644 --- a/archive/feature_extraction_pca_clustering.py +++ b/archive/feature_extraction_pca_clustering.py @@ -48,7 +48,7 @@ def image_feature(path, image_size): if __name__ == '__main__': var_name = "u10" img_path = "/Volumes/GoogleDrive/.shortcut-targets-by-id/1vfQuEpjPQbXwHTxqAw34ALMoA45PJ7KQ/ECRP_Data_Science/Zheng/new_data_images/" + var_name - work_dir = "/Users/jianwu/Data/ECRP_ERA5/version-2/csv" + work_dir = "/Users//////Data/ECRP_ERA5/version-2/csv" # the size of the new figure is 41x41 img_features, img_names = image_feature(img_path, (41, 41)) @@ -87,4 +87,4 @@ def image_feature(path, image_size): pca_clustering_path = work_dir + "/" + var_name + "_pca_clusters.csv" image_cluster.to_csv(pca_clustering_path, index=False) - print("done!") \ No newline at end of file + print("done!") From 73f00b8c6d1c822fb183a245041ca51714f78eaa Mon Sep 17 00:00:00 2001 From: Francis_Nji <33159972+FrancisNji@users.noreply.github.com> Date: Sat, 23 Mar 2024 02:35:32 -0400 Subject: [PATCH 24/26] Update image_generation.py ... 
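The PCA script patched above (feature_extraction_pca_clustering.py) follows a standard recipe: flatten each 41x41 image into a feature row, reduce with PCA, then run k-means and write the assignments out. A runnable sketch on random stand-in features follows; the sizes mirror the script, but nothing here reads the real images:

import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
features = rng.random((100, 41 * 41))   # one flattened 41x41 image per row

# keep enough components to explain 95% of the variance
pca = PCA(n_components=0.95, svd_solver="full")
reduced = pca.fit_transform(features)

labels = KMeans(n_clusters=5, n_init=10, random_state=0).fit_predict(reduced)
print(reduced.shape, np.bincount(labels))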
--- archive/image_generation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/archive/image_generation.py b/archive/image_generation.py index 93411dd..1827709 100644 --- a/archive/image_generation.py +++ b/archive/image_generation.py @@ -79,7 +79,7 @@ def image_saving(data_variable, saving_path): if __name__ == '__main__': data = xr.open_dataset("/Users/////Data/ECRP_ERA5/ERA5_sample_hourly_20200201-20200331.nc") print(data.data_vars) - image_saving(data['v10'], "/Users/jianwu/Data/ECRP_ERA5/") + image_saving(data['v10'], "/Users////u/Data/ECRP_ERA5/") #for data_key in data.data_vars: # image_saving(data[data_key], "/Users/jianwu/Data/ECRP_ERA5/") From d6f3ced44fb4009931539b4347c613969e06b7a5 Mon Sep 17 00:00:00 2001 From: Francis_Nji <33159972+FrancisNji@users.noreply.github.com> Date: Sun, 28 Apr 2024 13:36:38 -0400 Subject: [PATCH 25/26] Update all_models.py Cleaned --- HESC/all_models.py | 146 ++++++++++----------------------------------- 1 file changed, 30 insertions(+), 116 deletions(-) diff --git a/HESC/all_models.py b/HESC/all_models.py index b9b6d8a..0d51816 100644 --- a/HESC/all_models.py +++ b/HESC/all_models.py @@ -11,18 +11,27 @@ # Install dask.dataframe !pip install "dask[dataframe]" !pip install netCDF4 +!pip install PyMetis +!pip install kahypar -import pandas as pd +import os +import warnings +from typing import Optional import numpy as np -import time +import pandas as pd +import xarray as xr +import pymetis +import kahypar +from scipy import sparse +from sklearn.metrics import pairwise_distances, normalized_mutual_info_score +from sklearn.utils.extmath import safe_sparse_dot + +from sklearn.metrics import silhouette_score, pairwise_distances, davies_bouldin_score +from sklearn.cluster import KMeans from matplotlib import pyplot as plt from matplotlib.pyplot import cm -#import netCDF4 -# from netCDF4 import Dataset -# import netCDF4 as nc import random import netCDF4 as nc -import xarray as xr import datetime import datetime as dt from netCDF4 import date2num,num2date @@ -40,36 +49,18 @@ from scipy.sparse import csr_matrix from sklearn.mixture import GaussianMixture from sklearn.metrics import adjusted_rand_score - import seaborn as sns -import xarray as xr -#from mwdc.clustering.st_agglomerative import st_agglomerative - -import warnings -warnings.filterwarnings("ignore") - -# import netCDF4 -# from netCDF4 import Dataset -from sklearn.preprocessing import StandardScaler from scipy.cluster.hierarchy import dendrogram, linkage import scipy.cluster.hierarchy as sch from sklearn.cluster import AgglomerativeClustering -# from mwdc.visualization.clusterplotting import clusterPlot2D -# from mwdc.visualization.visualization import visualization2 -# from mwdc.preprocessing.preprocessing import data_preprocessing -# from mwdc.evaluation.st_evaluation import st_rmse_df, st_corr, st_rmse_np -# from mwdc.clustering.st_agglomerative import st_agglomerative - import sys import pickle import matplotlib as mpl import matplotlib.colors as colors -import os -import xarray as xr -import warnings + warnings.filterwarnings("ignore") -!pip install netCDF4 + from sklearn.metrics import silhouette_samples, silhouette_score @@ -80,6 +71,8 @@ def silhouette_score1(X, labels, *, metric="cosine", sample_size=None, random_st from sklearn import preprocessing from sklearn.preprocessing import MinMaxScaler +warnings.filterwarnings("ignore") + def data_preprocessing(data_path): rdata_daily = xr.open_dataset(data_path) # data_path = '/content/drive/MyDrive/ERA5_Dataset.nc' 
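# to_array() stacks the dataset's variables along a new leading "variable"
# dimension, so seven surface fields on a 365-day, 41x41 grid become the
# (7, 365, 41, 41) array noted below; a quick shape check, with made-up
# variable names rather than this file's:
#   ds = xr.Dataset({v: (("time", "y", "x"), np.zeros((365, 41, 41)))
#                    for v in ["t2m", "sst", "sp", "u10", "v10", "tp", "d2m"]})
#   assert np.array(ds.to_array()).shape == (7, 365, 41, 41)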
rdata_daily_np_array = np.array(rdata_daily.to_array()) # the shape of the dailt data is (7, 365, 41, 41) @@ -213,17 +206,8 @@ def avg_inter_dist(norm_data, clustering_results): """# **Implementation**""" -#path2 = ('/content/drive/MyDrive/Data/mock_v4.nc') path2 = ('/content/drive/MyDrive/Data/ERA5_meteo_sfc_2021_daily.nc') -#path2 = ('/content/drive/MyDrive/Data/ERA5_meteo_sfc_2021_daily.nc') -#path2 = ('/content/drive/MyDrive/Data/ERA5_meteo_sfc_2021_daily_smalldomain.nc') -#path2 = ('/content/drive/MyDrive/Data/ERA5_meteo_sfc_2021_hourly.nc') -#path2 = ('/content/drive/MyDrive/Data/ERA5_meteo_sfc_2021_hourly_smalldomain.nc') data = xr.open_dataset(path2, decode_times=False)#To view the date as integers of 0, 1, 2,.... -#data = xr.open_dataset(path2)# decode_times=False) #To view the date as integers of 0, 1, 2,.... -#data5 = xr.open_dataset(path2) # To view time in datetime format -var = list(data.variables)[3:] -data var = list(data.variables)[3:] path = '/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/' @@ -234,39 +218,18 @@ def avg_inter_dist(norm_data, clustering_results): data_nor_nor = data_nor +################################################################### + +##DSC homogeneous clustering results DSC_cluster_result = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_Ensemble1_032.pkl", "rb")) DSC_cluster_result1 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_hom_ens_0.34805576652439557.pkl", "rb")) DSC_cluster_result2 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DSC_hom_ens_0.35190532062602214.pkl", "rb")) -DSC_cluster_result - -silh = silhouette_score1(data_nor, DSC_cluster_result) -u,indices = np.unique(DSC_cluster_result,return_counts = True) # sc=0.3412 st 64 -# u,indices -print(silh) -print(u,indices) - -data_nor_eval = data_nor - -result = DSC_cluster_result - -from sklearn.metrics import davies_bouldin_score - -db = davies_bouldin_score(data_nor, DSC_cluster_result) -print("Davies-Bouldin score is ", db) - -from sklearn.metrics import calinski_harabasz_score -ch = calinski_harabasz_score(data_nor, DSC_cluster_result) -print("Davies-Bouldin score is ", ch) - -print("RMSE score is ", total_rmse('/content/drive/MyDrive/Data/ERA5_meteo_sfc_2021_daily.nc', DSC_cluster_result)) - -print("Variance is ", avg_var(data_nor, DSC_cluster_result)) - -print("Inter-cluster distance ", avg_inter_dist(data_nor, DSC_cluster_result)) +################################################################### +##DEC homogeneous clustering results DEC_Cluster_results1 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_Ensemble1_034.pkl", "rb")) DEC_Cluster_results2 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_hom_ens_0.30096328.pkl", "rb")) @@ -275,14 +238,9 @@ def avg_inter_dist(norm_data, clustering_results): DEC_Cluster_results4 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_hom_ens_03315.pkl", "rb")) -DEC_Cluster_results1 - -silh = silhouette_score1(data_nor, DEC_Cluster_results1) -u,indices = np.unique(DEC_Cluster_results1,return_counts = True) # sc=0.3412 st 64 -# u,indices -print(silh) -print(u,indices) +################################################################### +##DTC homogeneous clustering results DTC_Cluster_results1 = 
pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DEC_Ensemble1_034.pkl", "rb")) DTC_Cluster_results2 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_3367.pkl", "rb")) @@ -293,43 +251,18 @@ def avg_inter_dist(norm_data, clustering_results): DTC_Cluster_results5 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/DTC_3124.pkl", "rb")) -DTC_Cluster_results1 - -silh = silhouette_score1(data_nor, DTC_Cluster_results1) -u,indices = np.unique(DTC_Cluster_results1,return_counts = True) # sc=0.3412 st 64 -# u,indices -print(silh) -print(u,indices) +################################################################### +##KMeans homogeneous clustering results KMeans_Cluster_results1 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/KMeans_0.3389.pkl", "rb")) KMeans_Cluster_results2 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/KMeans_0.313.pkl", "rb")) KMeans_Cluster_results3 = pickle.load(open("/content/drive/MyDrive///Ensemble_Clustering/final/ensemble_alg/Final_ensemble/KMeans_0.3125.pkl", "rb")) -KMeans_Cluster_results1 - -silh = silhouette_score1(data_nor, KMeans_Cluster_results1) -u,indices = np.unique(KMeans_Cluster_results1,return_counts = True) # sc=0.3412 st 64 -# u,indices -print(silh) -print(u,indices) - -from sklearn.metrics import davies_bouldin_score - -db = davies_bouldin_score(data_nor, KMeans_Cluster_results1) -print("Davies-Bouldin score is ", db) - -from sklearn.metrics import calinski_harabasz_score -ch = calinski_harabasz_score(data_nor, KMeans_Cluster_results1) -print("Davies-Bouldin score is ", ch) - -print("RMSE score is ", total_rmse('/content/drive/MyDrive/Data/ERA5_meteo_sfc_2021_daily.nc', KMeans_Cluster_results1)) - -print("Variance is ", avg_var(data_nor, KMeans_Cluster_results1)) - -print("Inter-cluster distance ", avg_inter_dist(data_nor, KMeans_Cluster_results1)) +################################################################### +##Main Heterogeneous Ensemble """# **HESC_performance**""" class ClusterSimilarityMatrix(): @@ -610,25 +543,6 @@ def best_clustering5(n): """# **Hybrid Bipartite Graph Formulation (HBGF)**""" -!pip install PyMetis - -!pip install kahypar - -import os -import warnings -from typing import Optional -import numpy as np -import pandas as pd -import xarray as xr -import pymetis -import kahypar -from scipy import sparse -from sklearn.metrics import pairwise_distances, normalized_mutual_info_score -from sklearn.utils.extmath import safe_sparse_dot - -from sklearn.metrics import silhouette_score, pairwise_distances, davies_bouldin_score -from sklearn.cluster import KMeans - def create_hypergraph(base_clusters): """Create the incidence matrix of base clusters' hypergraph From 4e9376d90d8f172e35e2f96ef87fa5cfacf81300 Mon Sep 17 00:00:00 2001 From: Francis_Nji <33159972+FrancisNji@users.noreply.github.com> Date: Sun, 28 Apr 2024 13:38:35 -0400 Subject: [PATCH 26/26] Update all_models.py --- HESC/all_models.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/HESC/all_models.py b/HESC/all_models.py index 0d51816..abf724e 100644 --- a/HESC/all_models.py +++ b/HESC/all_models.py @@ -61,15 +61,11 @@ warnings.filterwarnings("ignore") - -from sklearn.metrics import silhouette_samples, silhouette_score - def silhouette_score1(X, labels, *, metric="cosine", sample_size=None, random_state=None, **kwds): return 
np.mean(silhouette_samples(X, labels, metric="cosine", **kwds)) ## This function will will pre-process our daily data for DEC model as numpy array from sklearn import preprocessing -from sklearn.preprocessing import MinMaxScaler warnings.filterwarnings("ignore")
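After the cleanups above, the series' recurring co-occurrence step and the cosine silhouette_score1 check can be exercised end to end in a few lines. Everything below is a synthetic stand-in; average-linkage on 1 - S is one reasonable consensus step, not necessarily the one each pickle above was produced with:

import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage
from sklearn.metrics import silhouette_samples

rng = np.random.default_rng(0)
truth = rng.integers(0, 4, 365)
X = rng.normal(size=(365, 20)) + 3 * truth[:, None]   # separable stand-in data
base = [np.where(rng.random(365) < 0.2, rng.integers(0, 4, 365), truth)
        for _ in range(5)]                             # noisy base partitions

# S[i, j] = fraction of base partitions that put samples i and j together
S = np.mean([(p[:, None] == p[None, :]).astype(float) for p in base], axis=0)

# treat 1 - S as a dissimilarity and agglomerate to 4 consensus clusters
Z = linkage((1 - S)[np.triu_indices(365, k=1)], method="average")
consensus = fcluster(Z, t=4, criterion="maxclust")

# same cosine silhouette as silhouette_score1 above
print(np.mean(silhouette_samples(X, consensus, metric="cosine")))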
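The HBGF section retained in all_models.py builds a related object: create_hypergraph stacks every base cluster as a hyperedge over the samples. The sketch below shows that incidence construction; k-means on the incidence rows stands in for the pymetis/kahypar partitioning, so treat it as an illustration of the data structure, not of the repo's partitioner:

import numpy as np
from scipy import sparse
from sklearn.cluster import KMeans

def incidence(base_clusters):
    # one column per base cluster, one row per sample; entry 1 marks membership
    blocks = []
    for labels in base_clusters:
        labels = np.asarray(labels)
        rows = np.arange(labels.size)
        data = np.ones(labels.size)
        blocks.append(sparse.csr_matrix((data, (rows, labels)),
                                        shape=(labels.size, labels.max() + 1)))
    return sparse.hstack(blocks).tocsr()

rng = np.random.default_rng(0)
base = [rng.integers(0, 4, 200) for _ in range(3)]
A = incidence(base)          # (200, 12) for three 4-cluster partitions
labels = KMeans(n_clusters=4, n_init=10, random_state=0).fit_predict(A.toarray())
print(A.shape, np.bincount(labels))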