diff --git a/src/core/cuda/include/CudaAngleForce.cuh b/src/core/cuda/include/CudaAngleForce.cuh index 2782255f..7659a103 100644 --- a/src/core/cuda/include/CudaAngleForce.cuh +++ b/src/core/cuda/include/CudaAngleForce.cuh @@ -1,9 +1,6 @@ -#ifndef CUDA_ANGLE_FORCE_H -#define CUDA_ANGLE_FORCE_H +#pragma once #include "system.h" -__global__ void calc_angle_forces_kernel(int start, int end, angle_t *angles, coord_t *coords, cangle_t *cangles, dvel_t *dvelocities, double *energy_sum); +void init_angle_force_kernel_data(); double calc_angle_forces_host(int start, int end); - void cleanup_angle_force(); -#endif \ No newline at end of file diff --git a/src/core/cuda/include/CudaBondForce.cuh b/src/core/cuda/include/CudaBondForce.cuh index 61f2d2b8..0e7a9ba2 100644 --- a/src/core/cuda/include/CudaBondForce.cuh +++ b/src/core/cuda/include/CudaBondForce.cuh @@ -1,9 +1,6 @@ -#ifndef CUDA_BOND_FORCE_H -#define CUDA_BOND_FORCE_H +#pragma once #include "system.h" -__global__ void calc_bond_forces_kernel(int start, int end, bond_t* bonds, coord_t *coords, cbond_t *cbonds, dvel_t *dvelocities, double *energy_sum); +void init_bond_force_kernel_data(); double calc_bond_forces_host(int start, int end); - void cleanup_bond_force(); -#endif \ No newline at end of file diff --git a/src/core/cuda/include/CudaContext.cuh b/src/core/cuda/include/CudaContext.cuh new file mode 100644 index 00000000..3c2883fa --- /dev/null +++ b/src/core/cuda/include/CudaContext.cuh @@ -0,0 +1,140 @@ +#pragma once + +#include + +#include "system.h" +#include "utils.h" + +class CudaContext { + public: + /* + Common data + */ + coord_t* d_coords = nullptr; + dvel_t* d_dvelocities = nullptr; + vel_t* d_velocities = nullptr; + + /* + Used in CudaAngleForce.cu + */ + angle_t* d_angles = nullptr; + cangle_t* d_cangles = nullptr; + + /* + Used in CudaBondForce.cu + */ + bond_t* d_bonds = nullptr; + cbond_t* d_cbonds = nullptr; + + /* + Used in CudaImproper2Force.cu + */ + improper_t* d_impropers = nullptr; + cimproper_t* d_cimpropers = nullptr; + + /* + Used in CudaLeapfrog.cu + */ + atype_t* d_atypes = nullptr; + catype_t* d_catypes = nullptr; + + /* + Used in CudaShakeConstraints.cu + */ + int* d_mol_n_shakes = nullptr; + shake_bond_t* d_shake_bonds = nullptr; + double* d_winv = nullptr; + coord_t* d_xcoords = nullptr; + + /* + Used in CudaNonbondedQQForce.cu + */ + q_atom_t* d_q_atoms = nullptr; + q_charge_t* d_q_charges = nullptr; + int* d_LJ_matrix = nullptr; + bool* d_excluded = nullptr; + q_elscale_t* d_q_elscales = nullptr; + q_catype_t* d_q_catypes = nullptr; + q_atype_t* d_q_atypes = nullptr; + E_nonbonded_t* d_EQ_nonbond_qq = nullptr; + double* d_lambdas = nullptr; + + /* + Used in CudaPolxWaterForce.cu + */ + shell_t* d_wshells = nullptr; + /* + Used in CudaPshellForce.cu + */ + bool* d_shell = nullptr; + coord_t* d_coords_top = nullptr; + + /* + Used in CudaRestrangForce.cu + */ + restrang_t* d_restrangs = nullptr; + E_restraint_t* d_EQ_restraint = nullptr; + restrdis_t* d_restrdists = nullptr; + + /* + Used in CudaRestrposForce.cu + */ + restrpos_t* d_restrspos = nullptr; + + /* + Used in CudaRestrseqForce.cu + */ + restrseq_t* d_restrseqs = nullptr; + bool* d_heavy = nullptr; + + /* + Used in CudaRestrwallForce.cu + */ + restrwall_t* d_restrwalls = nullptr; + + /* + Used in CudaTorsionForce.cu + */ + torsion_t* d_torsions = nullptr; + ctorsion_t* d_ctorsions = nullptr; + + /* + Used in CudaNonbondedPPForce.cu + */ + ccharge_t* d_ccharges; + charge_t* d_charges; + p_atom_t* d_p_atoms; + + static CudaContext& instance() { + 
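// Annotation: the function-local static below is a Meyers singleton.
// The instance is constructed on first use (thread-safe since C++11),
// and its destructor, which calls free() on every device buffer, runs
// at program exit.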
static CudaContext ctx; + return ctx; + } + void init(); + + template + void sync_array_to_device(T* dst, const T* src, int count); + + template + void sync_array_to_host(T* dst, const T* src, int count); + + void sync_all_to_device(); + void sync_all_to_host(); + void reset_energies(); + + private: + CudaContext() = default; + + void free(); + + ~CudaContext() { free(); } + CudaContext(const CudaContext&) = delete; + CudaContext& operator=(const CudaContext&) = delete; +}; +template +void CudaContext::sync_array_to_device(T* dst, const T* src, int count) { + check_cuda(cudaMemcpy(dst, src, count * sizeof(T), cudaMemcpyHostToDevice)); +} +template +void CudaContext::sync_array_to_host(T* dst, const T* src, int count) { + check_cuda(cudaMemcpy(dst, src, count * sizeof(T), cudaMemcpyDeviceToHost)); +} \ No newline at end of file diff --git a/src/core/cuda/include/CudaImproper2Force.cuh b/src/core/cuda/include/CudaImproper2Force.cuh index 97097f84..232fbf6b 100644 --- a/src/core/cuda/include/CudaImproper2Force.cuh +++ b/src/core/cuda/include/CudaImproper2Force.cuh @@ -1,11 +1,6 @@ -#ifndef CUDA_IMPROPER2_FORCE -#define CUDA_IMPROPER2_FORCE_H +#pragma once #include "system.h" -__global__ void calc_improper2_forces_kernel(int start, int end, improper_t* impropers, cimproper_t* cimpropers, coord_t* coords, dvel_t* dvelocities, double* energy_sum); - +void init_improper2_force_kernel_data(); double calc_improper2_forces_host(int start, int end); - void cleanup_improper2_force(); - -#endif \ No newline at end of file diff --git a/src/core/cuda/include/CudaLeapfrog.cuh b/src/core/cuda/include/CudaLeapfrog.cuh index ed2d47b0..c3ad57a7 100644 --- a/src/core/cuda/include/CudaLeapfrog.cuh +++ b/src/core/cuda/include/CudaLeapfrog.cuh @@ -1,8 +1,6 @@ -#ifndef CUDA_LEAPFROG_H -#define CUDA_LEAPFROG_H +#pragma once #include "system.h" +void init_leapfrog_kernel_data(); void calc_leapfrog_host(); void cleanup_leapfrog(); - -#endif \ No newline at end of file diff --git a/src/core/cuda/include/CudaNonbondedPPForce.cuh b/src/core/cuda/include/CudaNonbondedPPForce.cuh new file mode 100644 index 00000000..ea8cd22d --- /dev/null +++ b/src/core/cuda/include/CudaNonbondedPPForce.cuh @@ -0,0 +1,8 @@ +#pragma once +#include "system.h" + +void init_nonbonded_pp_force_kernel_data(); + +void calc_nonbonded_pp_forces_host_v2(); + +void cleanup_nonbonded_pp_force(); \ No newline at end of file diff --git a/src/core/cuda/include/CudaNonbondedPWForce.cuh b/src/core/cuda/include/CudaNonbondedPWForce.cuh new file mode 100644 index 00000000..dc0cb7c4 --- /dev/null +++ b/src/core/cuda/include/CudaNonbondedPWForce.cuh @@ -0,0 +1,7 @@ +#pragma once +#include "system.h" + +void init_nonbonded_pw_force_kernel_data(); +void calc_nonbonded_pw_forces_host_v2(); + +void cleanup_nonbonded_pw_force(); \ No newline at end of file diff --git a/src/core/cuda/include/CudaNonbondedQPForce.cuh b/src/core/cuda/include/CudaNonbondedQPForce.cuh new file mode 100644 index 00000000..d48acdd6 --- /dev/null +++ b/src/core/cuda/include/CudaNonbondedQPForce.cuh @@ -0,0 +1,6 @@ +#pragma once + +void init_nonbonded_qp_force_kernel_data(); +void calc_nonbonded_qp_forces_host_v2(); + +void cleanup_nonbonded_qp_force(); \ No newline at end of file diff --git a/src/core/cuda/include/CudaNonbondedQQForce.cuh b/src/core/cuda/include/CudaNonbondedQQForce.cuh index b21dbc4b..aba8a911 100644 --- a/src/core/cuda/include/CudaNonbondedQQForce.cuh +++ b/src/core/cuda/include/CudaNonbondedQQForce.cuh @@ -1,8 +1,7 @@ -#ifndef CUDA_NONBONDED_QQ_FORCE_H -#define 
CUDA_NONBONDED_QQ_FORCE_H +#pragma once #include "system.h" +void init_nonbonded_qq_force_kernel_data(); void calc_nonbonded_qq_forces_host(); -void cleanup_nonbonded_qq_force(); -#endif \ No newline at end of file +void cleanup_nonbonded_qq_force(); diff --git a/src/core/cuda/include/CudaNonbondedQWForce.cuh b/src/core/cuda/include/CudaNonbondedQWForce.cuh new file mode 100644 index 00000000..14979b6f --- /dev/null +++ b/src/core/cuda/include/CudaNonbondedQWForce.cuh @@ -0,0 +1,7 @@ +#pragma once +#include "system.h" +void init_nonbonded_qw_force_kernel_data(); + +void calc_nonbonded_qw_forces_host_v2(); + +void cleanup_nonbonded_qw_force(); \ No newline at end of file diff --git a/src/core/cuda/include/CudaNonbondedWWForce.cuh b/src/core/cuda/include/CudaNonbondedWWForce.cuh new file mode 100644 index 00000000..1aaa3f6a --- /dev/null +++ b/src/core/cuda/include/CudaNonbondedWWForce.cuh @@ -0,0 +1,7 @@ +#pragma once +#include "system.h" +void init_nonbonded_ww_force_kernel_data(); + +void calc_nonbonded_ww_forces_host_v2(); + +void cleanup_nonbonded_ww_force(); \ No newline at end of file diff --git a/src/core/cuda/include/CudaPolxWaterForce.cuh b/src/core/cuda/include/CudaPolxWaterForce.cuh index 706585b5..5b699f35 100644 --- a/src/core/cuda/include/CudaPolxWaterForce.cuh +++ b/src/core/cuda/include/CudaPolxWaterForce.cuh @@ -1,10 +1,8 @@ -#ifndef __CUDA_POLX_WATER_FORCE_CUH__ -#define __CUDA_POLX_WATER_FORCE_CUH__ - +#pragma once #include "system.h" +void init_polx_water_force_kernel_data(); + void calc_polx_water_forces_host(int iteration); void cleanup_polx_water_force(); - -#endif \ No newline at end of file diff --git a/src/core/cuda/include/CudaPshellForce.cuh b/src/core/cuda/include/CudaPshellForce.cuh index 0e2c7099..c86bfb4f 100644 --- a/src/core/cuda/include/CudaPshellForce.cuh +++ b/src/core/cuda/include/CudaPshellForce.cuh @@ -1,17 +1,7 @@ -#ifndef CUDA_PSHELL_FORCE_CUH -#define CUDA_PSHELL_FORCE_CUH +#pragma once #include "system.h" -__global__ void calc_pshell_force_kernel( - int n_atoms_solute, - bool* shell, - bool* excluded, - coord_t* coords, - coord_t* coords_top, - double* ufix_energy, - double* ushell_energy, - dvel_t* dvelocities -); +void init_pshell_force_kernel_data(); + void calc_pshell_forces_host(); -void cleanup_pshell_force(); -#endif // CUDA_PSHELL_FORCE_CUH \ No newline at end of file +void cleanup_pshell_force(); diff --git a/src/core/cuda/include/CudaRadixWaterForce.cuh b/src/core/cuda/include/CudaRadixWaterForce.cuh index 891a8861..c13330e3 100644 --- a/src/core/cuda/include/CudaRadixWaterForce.cuh +++ b/src/core/cuda/include/CudaRadixWaterForce.cuh @@ -1,22 +1,6 @@ -#ifndef CUDA_RADIX_WATER_FORCE_CU -#define CUDA_RADIX_WATER_FORCE_CU +#pragma once #include "system.h" - -__global__ void calc_radix_water_forces_kernel( - coord_t* coords, - double shift, - int n_atoms_solute, - int n_atoms, - topo_t topo, - md_t md, - double Dwmz, - double awmz, - dvel_t* dvelocities, - double* energy); - +void init_radix_water_force_kernel_data(); void calc_radix_water_forces_host(); - void cleanup_radix_water_force(); - -#endif \ No newline at end of file diff --git a/src/core/cuda/include/CudaRestrangForce.cuh b/src/core/cuda/include/CudaRestrangForce.cuh index f44f9b0c..c5c5e98d 100644 --- a/src/core/cuda/include/CudaRestrangForce.cuh +++ b/src/core/cuda/include/CudaRestrangForce.cuh @@ -1,8 +1,7 @@ -#ifndef CUDA_RESTRANG_FORCE_CUH -#define CUDA_RESTRANG_FORCE_CUH +#pragma once #include "system.h" +void init_restrang_force_kernel_data(); void 
calc_restrang_force_host(); -void cleanup_restrang_force(); -#endif // CUDA_RESTRANG_FORCE_CUH \ No newline at end of file +void cleanup_restrang_force(); diff --git a/src/core/cuda/include/CudaRestrdisForce.cuh b/src/core/cuda/include/CudaRestrdisForce.cuh index 04abe85f..09fd9350 100644 --- a/src/core/cuda/include/CudaRestrdisForce.cuh +++ b/src/core/cuda/include/CudaRestrdisForce.cuh @@ -1,10 +1,7 @@ -#ifndef CUDA_RESTRDIS_FORCE_CUH -#define CUDA_RESTRDIS_FORCE_CUH +#pragma once #include "system.h" -__global__ void calc_restrdis_forces_kernel(); +void init_restrdis_force_kernel_data(); void calc_restrdis_forces_host(); void cleanup_restrdis_force(); - -#endif \ No newline at end of file diff --git a/src/core/cuda/include/CudaRestrposForce.cuh b/src/core/cuda/include/CudaRestrposForce.cuh index 693ab5bb..ddbd29d1 100644 --- a/src/core/cuda/include/CudaRestrposForce.cuh +++ b/src/core/cuda/include/CudaRestrposForce.cuh @@ -1,8 +1,7 @@ -#ifndef CUDA_RESTRPOS_FORCE_CUH -#define CUDA_RESTRPOS_FORCE_CUH +#pragma once #include "system.h" +void init_restrpos_force_kernel_data(); void calc_restrpos_forces_host(); -void cleanup_restrpos_force(); -#endif // CUDA_RESTRPOS_FORCE_CUH \ No newline at end of file +void cleanup_restrpos_force(); diff --git a/src/core/cuda/include/CudaRestrseqForce.cuh b/src/core/cuda/include/CudaRestrseqForce.cuh index ae013466..efbe60bc 100644 --- a/src/core/cuda/include/CudaRestrseqForce.cuh +++ b/src/core/cuda/include/CudaRestrseqForce.cuh @@ -1,11 +1,6 @@ -#ifndef CUDA_RESTRSEQ_FORCE_CUH -#define CUDA_RESTRSEQ_FORCE_CUH +#pragma once #include "system.h" -__global__ void calc_restrseq_forces_kernel(); +void init_restrseq_force_kernel_data(); void calc_restrseq_forces_host(); - void cleanup_restrseq_force(); - - -#endif // CUDA_RESTRSEQ_FORCE_CUH \ No newline at end of file diff --git a/src/core/cuda/include/CudaRestrwallForce.cuh b/src/core/cuda/include/CudaRestrwallForce.cuh index c5fa7fe5..c78a4759 100644 --- a/src/core/cuda/include/CudaRestrwallForce.cuh +++ b/src/core/cuda/include/CudaRestrwallForce.cuh @@ -1,8 +1,6 @@ -#ifndef CUDA_RESTRWALL_FORCE_CUH -#define CUDA_RESTRWALL_FORCE_CUH +#pragma once #include "system.h" +void init_restrwall_force_kernel_data(); void calc_restrwall_forces_host(); void cleanup_restrwall_force(); - -#endif \ No newline at end of file diff --git a/src/core/cuda/include/CudaShakeConstraints.cuh b/src/core/cuda/include/CudaShakeConstraints.cuh index c4f85736..e05739a8 100644 --- a/src/core/cuda/include/CudaShakeConstraints.cuh +++ b/src/core/cuda/include/CudaShakeConstraints.cuh @@ -1,22 +1,7 @@ -#ifndef CUDA_SHAKE_CONSTRAINTS_CUH -#define CUDA_SHAKE_CONSTRAINTS_CUH +#pragma once #include "system.h" -__global__ void calc_shake_constraints_kernel( - int n_molecules, - int *mol_n_shakes, - shake_bond_t *shake_bonds, - coord_t *coords, - coord_t *xcoords, - double *winv, - int *total_iterations, - int *mol_shake_offset -); +void init_shake_constraints_kernel_data(); int calc_shake_constraints_host(); - void cleanup_shake_constraints(); - - - -#endif \ No newline at end of file diff --git a/src/core/cuda/include/CudaTemperature.cuh b/src/core/cuda/include/CudaTemperature.cuh index 89ffb6df..ad7b8a9d 100644 --- a/src/core/cuda/include/CudaTemperature.cuh +++ b/src/core/cuda/include/CudaTemperature.cuh @@ -1,11 +1,7 @@ -#ifndef CUDA_TEMPERATURE_CUH -#define CUDA_TEMPERATURE_CUH +#pragma once #include "system.h" -__global__ void calc_temperature_kernel(int n_atoms, int n_atoms_solute, atype_t* atypes, catype_t *catypes, vel_t *velocities, 
bool* excluded, double boltz, double ekinmax, -double *Temp_solute, double *Tfree_solute, double *Texcl_solute, double *Temp_solvent, double *Tfree_solvent, double *Texcl_solvent); +void init_temperature_kernel_data(); void calc_temperature_host(); void cleanup_temperature(); - -#endif \ No newline at end of file diff --git a/src/core/cuda/include/CudaTorsionForce.cuh b/src/core/cuda/include/CudaTorsionForce.cuh index 9e51543a..b2ec94b8 100644 --- a/src/core/cuda/include/CudaTorsionForce.cuh +++ b/src/core/cuda/include/CudaTorsionForce.cuh @@ -1,11 +1,7 @@ -#ifndef CUDA_TORSION_FORCE_H -#define CUDA_TORSION_FORCE_H +#pragma once #include "system.h" -__global__ void calc_torsion_forces_kernel(int start, int end, torsion_t* torsions, ctorsion_t* ctorsions, coord_t* coords, dvel_t* dvelocities, double* energy_sum); - +void init_torsion_force_kernel_data(); double calc_torsion_forces_host(int start, int end); void cleanup_torsion_force(); - -#endif \ No newline at end of file diff --git a/src/core/cuda/src/CudaAngleForce.cu b/src/core/cuda/src/CudaAngleForce.cu index f9467b97..452d64cf 100644 --- a/src/core/cuda/src/CudaAngleForce.cu +++ b/src/core/cuda/src/CudaAngleForce.cu @@ -1,17 +1,13 @@ #include "cuda/include/CudaAngleForce.cuh" -#include "utils.h" +#include "cuda/include/CudaContext.cuh" #include "cuda/include/CudaUtility.cuh" +#include "utils.h" namespace CudaAngleForce { bool is_initialized = false; -angle_t* d_angles = nullptr; -coord_t* d_coords = nullptr; -cangle_t* d_cangles = nullptr; -dvel_t* d_dvelocities = nullptr; -double* d_energy_sum = nullptr; +double* d_energy_sum; } // namespace CudaAngleForce - __global__ void calc_angle_forces_kernel(int start, int end, angle_t* angles, coord_t* coords, cangle_t* cangles, dvel_t* dvelocities, double* energy_sum) { int idx = blockIdx.x * blockDim.x + threadIdx.x + start; if (idx >= end) return; @@ -76,30 +72,19 @@ __global__ void calc_angle_forces_kernel(int start, int end, angle_t* angles, co } double calc_angle_forces_host(int start, int end) { - using namespace CudaAngleForce; int N = end - start; if (N <= 0) return 0.0; + using namespace CudaAngleForce; int blockSize = 256; int numBlocks = (N + blockSize - 1) / blockSize; - if (!is_initialized) { - is_initialized = true; - check_cudaMalloc((void**)&d_angles, n_angles * sizeof(angle_t)); - check_cudaMalloc((void**)&d_coords, n_atoms * sizeof(coord_t)); - check_cudaMalloc((void**)&d_cangles, n_cangles * sizeof(cangle_t)); - check_cudaMalloc((void**)&d_dvelocities, n_atoms * sizeof(dvel_t)); - check_cudaMalloc((void**)&d_energy_sum, sizeof(double)); - - - - // This data is constant, copy once - cudaMemcpy(d_angles, angles, n_angles * sizeof(angle_t), cudaMemcpyHostToDevice); - cudaMemcpy(d_cangles, cangles, n_cangles * sizeof(cangle_t), cudaMemcpyHostToDevice); - } - - // copy data to device - cudaMemcpy(d_coords, coords, n_atoms * sizeof(coord_t), cudaMemcpyHostToDevice); - cudaMemcpy(d_dvelocities, dvelocities, n_atoms * sizeof(dvel_t), cudaMemcpyHostToDevice); + CudaContext& ctx = CudaContext::instance(); + auto d_angles = ctx.d_angles; + auto d_coords = ctx.d_coords; + auto d_cangles = ctx.d_cangles; + auto d_dvelocities = ctx.d_dvelocities; + // todo: now have to do that, after moving all to CudaContext, can remove it + // ctx.sync_all_to_device(); double h_energy_sum = 0.0; cudaMemcpy(d_energy_sum, &h_energy_sum, sizeof(double), cudaMemcpyHostToDevice); @@ -108,20 +93,25 @@ double calc_angle_forces_host(int start, int end) { calc_angle_forces_kernel<<>>(start, end, d_angles, 
d_coords, d_cangles, d_dvelocities, d_energy_sum); cudaDeviceSynchronize(); + // todo: Now have to do that, after moving all to CudaContext, can remove it // copy results back to host cudaMemcpy(&h_energy_sum, d_energy_sum, sizeof(double), cudaMemcpyDeviceToHost); - cudaMemcpy(dvelocities, d_dvelocities, n_atoms * sizeof(dvel_t), cudaMemcpyDeviceToHost); + cudaMemcpy(dvelocities, ctx.d_dvelocities, n_atoms * sizeof(dvel_t), cudaMemcpyDeviceToHost); return h_energy_sum; } +void init_angle_force_kernel_data() { + using namespace CudaAngleForce; + if (!is_initialized) { + check_cudaMalloc((void**)&d_energy_sum, sizeof(double)); + is_initialized = true; + } +} + void cleanup_angle_force() { using namespace CudaAngleForce; if (is_initialized) { - cudaFree(d_angles); - cudaFree(d_coords); - cudaFree(d_cangles); - cudaFree(d_dvelocities); cudaFree(d_energy_sum); is_initialized = false; } -} \ No newline at end of file +} diff --git a/src/core/cuda/src/CudaBondForce.cu b/src/core/cuda/src/CudaBondForce.cu index b4119c38..47869e76 100644 --- a/src/core/cuda/src/CudaBondForce.cu +++ b/src/core/cuda/src/CudaBondForce.cu @@ -1,15 +1,10 @@ #include "cuda/include/CudaBondForce.cuh" +#include "cuda/include/CudaContext.cuh" #include "utils.h" - namespace CudaBondForce { bool is_initialized = false; -bond_t* d_bonds = nullptr; -coord_t* d_coords = nullptr; -cbond_t* d_cbonds = nullptr; -dvel_t* d_dvelocities = nullptr; -double* d_energy_sum = nullptr; +double* d_energy_sum; } // namespace CudaBondForce - __global__ void calc_bond_forces_kernel(int start, int end, bond_t* bonds, coord_t* coords, cbond_t* cbonds, dvel_t* dvelocities, double* energy_sum) { int idx = blockIdx.x * blockDim.x + threadIdx.x + start; if (idx >= end) return; @@ -39,42 +34,41 @@ __global__ void calc_bond_forces_kernel(int start, int end, bond_t* bonds, coord } double calc_bond_forces_host(int start, int end) { - using namespace CudaBondForce; int N = end - start; if (N <= 0) return 0.0; + using namespace CudaBondForce; int blockSize = 256; int numBlocks = (N + blockSize - 1) / blockSize; - if (!is_initialized) { - check_cudaMalloc((void**)&d_bonds, sizeof(bond_t) * n_bonds); - check_cudaMalloc((void**)&d_coords, sizeof(coord_t) * n_atoms); - check_cudaMalloc((void**)&d_cbonds, sizeof(cbond_t) * n_cbonds); - check_cudaMalloc((void**)&d_dvelocities, sizeof(dvel_t) * n_atoms); - check_cudaMalloc((void**)&d_energy_sum, sizeof(double)); - is_initialized = true; - } + double energy = 0.0; + cudaMemcpy(d_energy_sum, &energy, sizeof(double), cudaMemcpyHostToDevice); + + CudaContext& context = CudaContext::instance(); + bond_t* d_bonds = context.d_bonds; + coord_t* d_coords = context.d_coords; + cbond_t* d_cbonds = context.d_cbonds; + dvel_t* d_dvelocities = context.d_dvelocities; - cudaMemcpy(d_bonds, bonds, sizeof(bond_t) * n_bonds, cudaMemcpyHostToDevice); - cudaMemcpy(d_coords, coords, sizeof(coord_t) * n_atoms, cudaMemcpyHostToDevice); - cudaMemcpy(d_cbonds, cbonds, sizeof(cbond_t) * n_cbonds, cudaMemcpyHostToDevice); - cudaMemcpy(d_dvelocities, dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyHostToDevice); - double zero = 0.0; - cudaMemcpy(d_energy_sum, &zero, sizeof(double), cudaMemcpyHostToDevice); calc_bond_forces_kernel<<>>(start, end, d_bonds, d_coords, d_cbonds, d_dvelocities, d_energy_sum); cudaDeviceSynchronize(); - cudaMemcpy(&zero, d_energy_sum, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(&energy, d_energy_sum, sizeof(double), cudaMemcpyDeviceToHost); cudaMemcpy(dvelocities, d_dvelocities, sizeof(dvel_t) * n_atoms, 
cudaMemcpyDeviceToHost); - return zero; + + return energy; +} + +void init_bond_force_kernel_data() { + using namespace CudaBondForce; + if (!is_initialized) { + check_cudaMalloc((void**)&d_energy_sum, sizeof(double)); + is_initialized = true; + } } void cleanup_bond_force() { using namespace CudaBondForce; if (is_initialized) { - cudaFree(d_bonds); - cudaFree(d_coords); - cudaFree(d_cbonds); - cudaFree(d_dvelocities); cudaFree(d_energy_sum); is_initialized = false; } -} \ No newline at end of file +} diff --git a/src/core/cuda/src/CudaContext.cu b/src/core/cuda/src/CudaContext.cu new file mode 100644 index 00000000..19f22165 --- /dev/null +++ b/src/core/cuda/src/CudaContext.cu @@ -0,0 +1,217 @@ +#include + +#include "cuda/include/CudaContext.cuh" + +void CudaContext::init() { + check_cudaMalloc((void**)&d_coords, sizeof(coord_t) * n_atoms); + check_cudaMalloc((void**)&d_dvelocities, sizeof(dvel_t) * n_atoms); + check_cudaMalloc((void**)&d_velocities, sizeof(vel_t) * n_atoms); + check_cudaMalloc((void**)&d_angles, sizeof(angle_t) * n_angles); + check_cudaMalloc((void**)&d_cangles, sizeof(cangle_t) * n_cangles); + check_cudaMalloc((void**)&d_bonds, sizeof(bond_t) * n_bonds); + check_cudaMalloc((void**)&d_cbonds, sizeof(cbond_t) * n_cbonds); + check_cudaMalloc((void**)&d_impropers, sizeof(improper_t) * n_impropers); + check_cudaMalloc((void**)&d_cimpropers, sizeof(cimproper_t) * n_cimpropers); + + check_cudaMalloc((void**)&d_mol_n_shakes, sizeof(int) * n_molecules); + check_cudaMalloc((void**)&d_shake_bonds, sizeof(shake_bond_t) * n_shake_constraints); + check_cudaMalloc((void**)&d_winv, sizeof(double) * n_atoms); + check_cudaMalloc((void**)&d_xcoords, sizeof(coord_t) * n_atoms); + + check_cudaMalloc((void**)&d_atypes, sizeof(atype_t) * n_atypes); + check_cudaMalloc((void**)&d_catypes, sizeof(catype_t) * n_catypes); + + + check_cudaMalloc((void**)&d_q_atoms, sizeof(q_atom_t) * n_qatoms); + check_cudaMalloc((void**)&d_q_charges, sizeof(q_charge_t) * n_qatoms * n_lambdas); + check_cudaMalloc((void**)&d_LJ_matrix, sizeof(int) * n_atoms_solute * n_atoms_solute); + check_cudaMalloc((void**)&d_excluded, sizeof(bool) * n_atoms); + check_cudaMalloc((void**)&d_q_elscales, sizeof(q_elscale_t) * n_qelscales); + check_cudaMalloc((void**)&d_q_catypes, sizeof(q_catype_t) * n_qcatypes); + check_cudaMalloc((void**)&d_q_atypes, sizeof(q_atype_t) * n_qatoms * n_lambdas); + check_cudaMalloc((void**)&d_EQ_nonbond_qq, sizeof(E_nonbonded_t) * n_lambdas); + check_cudaMalloc((void**)&d_lambdas, sizeof(double) * n_lambdas); + + check_cudaMalloc((void**)&d_wshells, n_shells * sizeof(shell_t)); + check_cudaMalloc((void**)&d_shell, sizeof(bool) * n_atoms); + check_cudaMalloc((void**)&d_coords_top, sizeof(coord_t) * n_atoms); + + check_cudaMalloc((void**)&d_restrangs, sizeof(restrang_t) * n_restrangs); + check_cudaMalloc((void**)&d_EQ_restraint, sizeof(E_restraint_t) * n_lambdas); + check_cudaMalloc((void**)&d_restrdists, sizeof(restrdis_t) * n_restrdists); + + check_cudaMalloc((void**)&d_restrspos, sizeof(restrpos_t) * n_restrspos); + + check_cudaMalloc((void**)&d_restrseqs, sizeof(restrseq_t) * n_restrseqs); + check_cudaMalloc((void**)&d_heavy, sizeof(bool) * n_atoms); + + check_cudaMalloc((void**)&d_restrwalls, sizeof(restrwall_t) * n_restrwalls); + + check_cudaMalloc((void**)&d_torsions, sizeof(torsion_t) * n_torsions); + check_cudaMalloc((void**)&d_ctorsions, sizeof(ctorsion_t) * n_ctorsions); + + check_cudaMalloc((void**)&d_ccharges, sizeof(ccharge_t) * n_ccharges); + check_cudaMalloc((void**)&d_charges, 
sizeof(charge_t) * n_charges); + check_cudaMalloc((void**)&d_p_atoms, sizeof(p_atom_t) * n_patoms); + + sync_all_to_device(); +} + +void CudaContext::sync_all_to_device() { + sync_array_to_device(d_coords, coords, n_atoms); + sync_array_to_device(d_dvelocities, dvelocities, n_atoms); + sync_array_to_device(d_velocities, velocities, n_atoms); + sync_array_to_device(d_angles, angles, n_angles); + sync_array_to_device(d_cangles, cangles, n_cangles); + sync_array_to_device(d_bonds, bonds, n_bonds); + sync_array_to_device(d_cbonds, cbonds, n_cbonds); + sync_array_to_device(d_impropers, impropers, n_impropers); + sync_array_to_device(d_cimpropers, cimpropers, n_cimpropers); + + sync_array_to_device(d_mol_n_shakes, mol_n_shakes, n_molecules); + sync_array_to_device(d_shake_bonds, shake_bonds, n_shake_constraints); + sync_array_to_device(d_winv, winv, n_atoms); + sync_array_to_device(d_xcoords, xcoords, n_atoms); + + sync_array_to_device(d_atypes, atypes, n_atypes); + sync_array_to_device(d_catypes, catypes, n_catypes); + + sync_array_to_device(d_q_atoms, q_atoms, n_qatoms); + sync_array_to_device(d_q_charges, q_charges, n_qatoms * n_lambdas); + sync_array_to_device(d_LJ_matrix, LJ_matrix, n_atoms_solute * n_atoms_solute); + sync_array_to_device(d_excluded, excluded, n_atoms); + sync_array_to_device(d_q_elscales, q_elscales, n_qelscales); + sync_array_to_device(d_q_catypes, q_catypes, n_qcatypes); + sync_array_to_device(d_q_atypes, q_atypes, n_qatoms * n_lambdas); + sync_array_to_device(d_EQ_nonbond_qq, EQ_nonbond_qq, n_lambdas); + sync_array_to_device(d_lambdas, lambdas, n_lambdas); + sync_array_to_device(d_wshells, wshells, n_shells); + + sync_array_to_device(d_shell, shell, n_atoms); + sync_array_to_device(d_coords_top, coords_top, n_atoms); + + sync_array_to_device(d_restrangs, restrangs, n_restrangs); + sync_array_to_device(d_EQ_restraint, EQ_restraint, n_lambdas); + sync_array_to_device(d_restrdists, restrdists, n_restrdists); + + sync_array_to_device(d_restrspos, restrspos, n_restrspos); + + sync_array_to_device(d_restrseqs, restrseqs, n_restrseqs); + sync_array_to_device(d_heavy, heavy, n_atoms); + sync_array_to_device(d_restrwalls, restrwalls, n_restrwalls); + + sync_array_to_device(d_torsions, torsions, n_torsions); + sync_array_to_device(d_ctorsions, ctorsions, n_ctorsions); + + sync_array_to_device(d_ccharges, ccharges, n_ccharges); + sync_array_to_device(d_charges, charges, n_charges); + sync_array_to_device(d_p_atoms, p_atoms, n_patoms); +} + +void CudaContext::sync_all_to_host() { + sync_array_to_host(coords, d_coords, n_atoms); + sync_array_to_host(dvelocities, d_dvelocities, n_atoms); + sync_array_to_host(velocities, d_velocities, n_atoms); + sync_array_to_host(angles, d_angles, n_angles); + sync_array_to_host(cangles, d_cangles, n_cangles); + sync_array_to_host(bonds, d_bonds, n_bonds); + sync_array_to_host(cbonds, d_cbonds, n_cbonds); + sync_array_to_host(impropers, d_impropers, n_impropers); + sync_array_to_host(cimpropers, d_cimpropers, n_cimpropers); + + sync_array_to_host(mol_n_shakes, d_mol_n_shakes, n_molecules); + sync_array_to_host(shake_bonds, d_shake_bonds, n_shake_constraints); + sync_array_to_host(winv, d_winv, n_atoms); + sync_array_to_host(xcoords, d_xcoords, n_atoms); + + sync_array_to_host(atypes, d_atypes, n_atypes); + sync_array_to_host(catypes, d_catypes, n_catypes); + + sync_array_to_host(q_atoms, d_q_atoms, n_qatoms); + sync_array_to_host(q_charges, d_q_charges, n_qatoms * n_lambdas); + sync_array_to_host(LJ_matrix, d_LJ_matrix, n_atoms_solute * 
n_atoms_solute); + sync_array_to_host(excluded, d_excluded, n_atoms); + sync_array_to_host(q_elscales, d_q_elscales, n_qelscales); + sync_array_to_host(q_catypes, d_q_catypes, n_qcatypes); + sync_array_to_host(q_atypes, d_q_atypes, n_qatoms * n_lambdas); + sync_array_to_host(EQ_nonbond_qq, d_EQ_nonbond_qq, n_lambdas); + sync_array_to_host(lambdas, d_lambdas, n_lambdas); + sync_array_to_host(wshells, d_wshells, n_shells); + sync_array_to_host(shell, d_shell, n_atoms); + sync_array_to_host(coords_top, d_coords_top, n_atoms); + + sync_array_to_host(restrangs, d_restrangs, n_restrangs); + sync_array_to_host(EQ_restraint, d_EQ_restraint, n_lambdas); + sync_array_to_host(restrdists, d_restrdists, n_restrdists); + + sync_array_to_host(restrspos, d_restrspos, n_restrspos); + + sync_array_to_host(restrseqs, d_restrseqs, n_restrseqs); + sync_array_to_host(heavy, d_heavy, n_atoms); + sync_array_to_host(restrwalls, d_restrwalls, n_restrwalls); + + sync_array_to_host(torsions, d_torsions, n_torsions); + sync_array_to_host(ctorsions, d_ctorsions, n_ctorsions); + + sync_array_to_host(ccharges, d_ccharges, n_ccharges); + sync_array_to_host(charges, d_charges, n_charges); + sync_array_to_host(p_atoms, d_p_atoms, n_patoms); +} + +void CudaContext::free() { + cudaFree(d_coords); + cudaFree(d_dvelocities); + cudaFree(d_velocities); + cudaFree(d_angles); + cudaFree(d_cangles); + cudaFree(d_bonds); + cudaFree(d_cbonds); + cudaFree(d_impropers); + cudaFree(d_cimpropers); + + cudaFree(d_atypes); + cudaFree(d_catypes); + + cudaFree(d_mol_n_shakes); + cudaFree(d_shake_bonds); + cudaFree(d_winv); + cudaFree(d_xcoords); + + cudaFree(d_q_atoms); + cudaFree(d_q_charges); + cudaFree(d_LJ_matrix); + cudaFree(d_excluded); + cudaFree(d_q_elscales); + cudaFree(d_q_catypes); + cudaFree(d_q_atypes); + cudaFree(d_EQ_nonbond_qq); + cudaFree(d_lambdas); + + cudaFree(d_wshells); + cudaFree(d_shell); + cudaFree(d_coords_top); + + cudaFree(d_restrangs); + cudaFree(d_EQ_restraint); + cudaFree(d_restrdists); + + cudaFree(d_restrspos); + + cudaFree(d_restrseqs); + cudaFree(d_heavy); + + cudaFree(d_restrwalls); + + cudaFree(d_torsions); + cudaFree(d_ctorsions); + + cudaFree(d_ccharges); + cudaFree(d_charges); + cudaFree(d_p_atoms); +} + +void CudaContext::reset_energies() { + cudaMemset(d_dvelocities, 0, sizeof(dvel_t) * n_atoms); + cudaMemset(d_EQ_nonbond_qq, 0, sizeof(E_nonbonded_t) * n_lambdas); + cudaMemset(d_EQ_restraint, 0, sizeof(E_restraint_t) * n_lambdas); +} diff --git a/src/core/cuda/src/CudaImproper2Force.cu b/src/core/cuda/src/CudaImproper2Force.cu index 4d5e8670..21f1e1a5 100644 --- a/src/core/cuda/src/CudaImproper2Force.cu +++ b/src/core/cuda/src/CudaImproper2Force.cu @@ -1,14 +1,11 @@ +#include "cuda/include/CudaContext.cuh" #include "cuda/include/CudaImproper2Force.cuh" #include "cuda/include/CudaUtility.cuh" #include "utils.h" namespace CudaImproper2Force { bool is_initialized = false; -improper_t* d_impropers = nullptr; -cimproper_t* d_cimpropers = nullptr; -coord_t* d_coords = nullptr; -dvel_t* d_dvelocities = nullptr; -double* d_energy_sum = nullptr; +double* d_energy_sum; } // namespace CudaImproper2Force __global__ void calc_improper2_forces_kernel(int start, int end, improper_t* impropers, cimproper_t* cimpropers, coord_t* coords, dvel_t* dvelocities, double* energy_sum) { @@ -125,45 +122,44 @@ __global__ void calc_improper2_forces_kernel(int start, int end, improper_t* imp atomicAdd(&dvelocities[aki].z, dv * dpk.z); atomicAdd(&dvelocities[ali].x, dv * dpl.x); atomicAdd(&dvelocities[ali].y, dv * dpl.y); - 
atomicAdd(&dvelocities[ali].z, dv * dpl.z); + atomicAdd(&dvelocities[ali].z, dv * dpl.z); } double calc_improper2_forces_host(int start, int end) { - using namespace CudaImproper2Force; int N = end - start; if (N <= 0) return 0.0; + using namespace CudaImproper2Force; int blockSize = 256; int numBlocks = (N + blockSize - 1) / blockSize; - if (!is_initialized) { - check_cudaMalloc((void**)&d_impropers, sizeof(improper_t) * n_impropers); - check_cudaMalloc((void**)&d_cimpropers, sizeof(cimproper_t) * n_cimpropers); - check_cudaMalloc((void**)&d_coords, sizeof(coord_t) * n_atoms); - check_cudaMalloc((void**)&d_dvelocities, sizeof(dvel_t) * n_atoms); - check_cudaMalloc((void**)&d_energy_sum, sizeof(double)); - is_initialized = true; - } - cudaMemcpy(d_impropers, impropers, sizeof(improper_t) * n_impropers, cudaMemcpyHostToDevice); - cudaMemcpy(d_cimpropers, cimpropers, sizeof(cimproper_t) * n_cimpropers, cudaMemcpyHostToDevice); - cudaMemcpy(d_coords, coords, sizeof(coord_t) * n_atoms, cudaMemcpyHostToDevice); - cudaMemcpy(d_dvelocities, dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyHostToDevice); - double zero = 0.0; - cudaMemcpy(d_energy_sum, &zero, sizeof(double), cudaMemcpyHostToDevice); + double energy = 0.0; + cudaMemcpy(d_energy_sum, &energy, sizeof(double), cudaMemcpyHostToDevice); + + CudaContext& context = CudaContext::instance(); + // context.sync_all_to_device(); + coord_t* d_coords = context.d_coords; + dvel_t* d_dvelocities = context.d_dvelocities; + improper_t* d_impropers = context.d_impropers; + cimproper_t* d_cimpropers = context.d_cimpropers; + calc_improper2_forces_kernel<<>>(start, end, d_impropers, d_cimpropers, d_coords, d_dvelocities, d_energy_sum); cudaDeviceSynchronize(); - double energy = 0.0; cudaMemcpy(&energy, d_energy_sum, sizeof(double), cudaMemcpyDeviceToHost); cudaMemcpy(dvelocities, d_dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyDeviceToHost); return energy; } +void init_improper2_force_kernel_data() { + using namespace CudaImproper2Force; + if (!is_initialized) { + check_cudaMalloc((void**)&d_energy_sum, sizeof(double)); + is_initialized = true; + } +} + void cleanup_improper2_force() { using namespace CudaImproper2Force; if (is_initialized) { - cudaFree(d_impropers); - cudaFree(d_cimpropers); - cudaFree(d_coords); - cudaFree(d_dvelocities); cudaFree(d_energy_sum); is_initialized = false; } diff --git a/src/core/cuda/src/CudaLeapfrog.cu b/src/core/cuda/src/CudaLeapfrog.cu index 5d0af953..d246b069 100644 --- a/src/core/cuda/src/CudaLeapfrog.cu +++ b/src/core/cuda/src/CudaLeapfrog.cu @@ -1,19 +1,13 @@ #include +#include "cuda/include/CudaContext.cuh" #include "cuda/include/CudaLeapfrog.cuh" #include "cuda/include/CudaShakeConstraints.cuh" #include "utils.h" namespace CudaLeapfrog { -bool is_initialized = false; -atype_t* d_atypes = nullptr; -catype_t* d_catypes = nullptr; -vel_t* d_velocities = nullptr; -dvel_t* d_dvelocities = nullptr; -coord_t* d_coords = nullptr; -coord_t* d_xcoords = nullptr; -} // namespace CudaLeapfrog +} __global__ void calc_leapfrog_kernel( atype_t* atypes, @@ -52,23 +46,13 @@ __global__ void calc_leapfrog_kernel( } void calc_leapfrog_host() { - using namespace CudaLeapfrog; - if (!is_initialized) { - check_cudaMalloc((void**)&d_atypes, sizeof(atype_t) * n_atypes); - check_cudaMalloc((void**)&d_catypes, sizeof(catype_t) * n_catypes); - check_cudaMalloc((void**)&d_velocities, sizeof(vel_t) * n_atoms); - check_cudaMalloc((void**)&d_dvelocities, sizeof(dvel_t) * n_atoms); - check_cudaMalloc((void**)&d_coords, sizeof(coord_t) * 
n_atoms); - check_cudaMalloc((void**)&d_xcoords, sizeof(coord_t) * n_atoms); - - cudaMemcpy(d_atypes, atypes, sizeof(atype_t) * n_atypes, cudaMemcpyHostToDevice); - cudaMemcpy(d_catypes, catypes, sizeof(catype_t) * n_catypes, cudaMemcpyHostToDevice); - - is_initialized = true; - } - cudaMemcpy(d_velocities, velocities, sizeof(vel_t) * n_atoms, cudaMemcpyHostToDevice); - cudaMemcpy(d_dvelocities, dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyHostToDevice); - cudaMemcpy(d_coords, coords, sizeof(coord_t) * n_atoms, cudaMemcpyHostToDevice); + CudaContext& ctx = CudaContext::instance(); + auto d_atypes = ctx.d_atypes; + auto d_catypes = ctx.d_catypes; + auto d_velocities = ctx.d_velocities; + auto d_dvelocities = ctx.d_dvelocities; + auto d_coords = ctx.d_coords; + auto d_xcoords = ctx.d_xcoords; int blockSize = 256; int numBlocks = (n_atoms + blockSize - 1) / blockSize; @@ -102,15 +86,7 @@ void calc_leapfrog_host() { } } } -void cleanup_leapfrog() { - using namespace CudaLeapfrog; - if (is_initialized) { - cudaFree(d_atypes); - cudaFree(d_catypes); - cudaFree(d_velocities); - cudaFree(d_dvelocities); - cudaFree(d_coords); - cudaFree(d_xcoords); - is_initialized = false; - } -} + +void init_leapfrog_kernel_data() {} + +void cleanup_leapfrog() {} \ No newline at end of file diff --git a/src/core/cuda/src/CudaNonbondedPPForce.cu b/src/core/cuda/src/CudaNonbondedPPForce.cu new file mode 100644 index 00000000..d32db32b --- /dev/null +++ b/src/core/cuda/src/CudaNonbondedPPForce.cu @@ -0,0 +1,311 @@ +#include "cuda/include/CudaContext.cuh" +#include "cuda/include/CudaNonbondedPPForce.cuh" +#include "utils.h" + +namespace CudaNonbondedPPForce { +bool is_initialized = false; +dvel_t* PP_MAT; +double *D_PP_Evdw, *D_PP_Ecoul; +double *D_PP_evdw_TOT, *D_PP_ecoul_TOT; + +__device__ void calc_pp_dvel_matrix_incr(int row, int pi, int column, int pj, + coord_t* Xs, coord_t* Ys, int* LJs, bool* excluded_s, double* Evdw, double* Ecoul, dvel_t* patom_a, dvel_t* patom_b, + ccharge_t* D_ccharges, charge_t* D_charges, catype_t* D_catypes, atype_t* D_atypes, p_atom_t* D_patoms, topo_t D_topo) { + bool bond14, bond23; + double scaling; + coord_t da; + double r2a, ra, r6a; + double Vela, V_a, V_b; + double dva; + double crg_i, crg_j; + double ai_aii, aj_aii, ai_bii, aj_bii; + catype_t ai_type, aj_type; + + bond23 = LJs[row * BLOCK_SIZE + column] == 3; + bond14 = LJs[row * BLOCK_SIZE + column] == 1; + + if (bond23) return; + if (excluded_s[row] || excluded_s[BLOCK_SIZE + column]) return; + + scaling = bond14 ? D_topo.el14_scale : 1; + + crg_i = D_ccharges[D_charges[pi].code - 1].charge; + crg_j = D_ccharges[D_charges[pj].code - 1].charge; + + ai_type = D_catypes[D_atypes[pi].code - 1]; + aj_type = D_catypes[D_atypes[pj].code - 1]; + + da.x = Ys[column].x - Xs[row].x; + da.y = Ys[column].y - Xs[row].y; + da.z = Ys[column].z - Xs[row].z; + r2a = 1 / (pow(da.x, 2) + pow(da.y, 2) + pow(da.z, 2)); + ra = sqrt(r2a); + r6a = r2a * r2a * r2a; + + Vela = scaling * D_topo.coulomb_constant * crg_i * crg_j * ra; + + ai_aii = bond14 ? ai_type.aii_1_4 : ai_type.aii_normal; + aj_aii = bond14 ? aj_type.aii_1_4 : aj_type.aii_normal; + ai_bii = bond14 ? ai_type.bii_1_4 : ai_type.bii_normal; + aj_bii = bond14 ? 
aj_type.bii_1_4 : aj_type.bii_normal; + + V_a = r6a * r6a * ai_aii * aj_aii; + V_b = r6a * ai_bii * aj_bii; + dva = r2a * (-Vela - 12 * V_a + 6 * V_b); + + patom_a->x = -dva * da.x; + patom_a->y = -dva * da.y; + patom_a->z = -dva * da.z; + + patom_b->x = dva * da.x; + patom_b->y = dva * da.y; + patom_b->z = dva * da.z; + + *Ecoul += Vela; + *Evdw += (V_a - V_b); +} + +__global__ void calc_pp_dvel_matrix(int n_patoms, int n_atoms_solute, + coord_t* X, double* Evdw, double* Ecoul, dvel_t* PP_MAT, + ccharge_t* D_ccharges, charge_t* D_charges, catype_t* D_catypes, atype_t* D_atypes, p_atom_t* D_patoms, int* D_LJ_matrix, bool* D_excluded, topo_t D_topo) { + // Block index + int bx = blockIdx.x; + int by = blockIdx.y; + + // Thread index + int tx = threadIdx.x; + int ty = threadIdx.y; + + int aStart = BLOCK_SIZE * by; + int bStart = BLOCK_SIZE * bx; + + __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE]; + + Ecoul_S[ty][tx] = 0; + Evdw_S[ty][tx] = 0; + + if (aStart + ty >= n_patoms) return; + if (bStart + tx >= n_patoms) return; + + __shared__ coord_t Xs[BLOCK_SIZE]; + __shared__ coord_t Ys[BLOCK_SIZE]; + __shared__ int LJs[BLOCK_SIZE * BLOCK_SIZE]; + __shared__ bool excluded_s[2 * BLOCK_SIZE]; + + int pi = D_patoms[aStart + ty].a - 1; + int pj = D_patoms[bStart + tx].a - 1; + + if (tx == 0) { + Xs[ty] = X[pi]; + excluded_s[ty] = D_excluded[pi]; + } + + if (ty == 0) { + Ys[tx] = X[pj]; + excluded_s[BLOCK_SIZE + tx] = D_excluded[pj]; + } + LJs[ty * BLOCK_SIZE + tx] = D_LJ_matrix[pi * n_atoms_solute + pj]; + + __syncthreads(); + + if (bx < by || (bx == by && tx < ty)) return; + + dvel_t patom_a, patom_b; + memset(&patom_a, 0, sizeof(dvel_t)); + memset(&patom_b, 0, sizeof(dvel_t)); + + int row = by * BLOCK_SIZE + ty; + int column = bx * BLOCK_SIZE + tx; + + if (bx != by || tx != ty) { + double evdw = 0, ecoul = 0; + calc_pp_dvel_matrix_incr(ty, pi, tx, pj, Xs, Ys, LJs, excluded_s, &evdw, &ecoul, &patom_a, + &patom_b, D_ccharges, D_charges, D_catypes, D_atypes, D_patoms, D_topo); + Evdw_S[ty][tx] = evdw; + Ecoul_S[ty][tx] = ecoul; + } + + PP_MAT[row * n_patoms + column] = patom_a; + PP_MAT[column * n_patoms + row] = patom_b; + + __syncthreads(); + + int rowlen = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE; + + if (tx == 0 && ty == 0) { + double tot_Evdw = 0; + double tot_Ecoul = 0; + for (int i = 0; i < BLOCK_SIZE; i++) { + for (int j = 0; j < BLOCK_SIZE; j++) { + tot_Evdw += Evdw_S[i][j]; + tot_Ecoul += Ecoul_S[i][j]; + } + } + Evdw[rowlen * by + bx] = tot_Evdw; + Ecoul[rowlen * by + bx] = tot_Ecoul; + + // printf("bx = %d by = %d tot_Evdw = %f tot_Ecoul = %f\n", bx, by, Evdw[rowlen * by + bx], Ecoul[rowlen * by + bx]); + } + + __syncthreads(); +} + +__global__ void calc_pp_dvel_vector(int n_patoms, dvel_t* DV_X, dvel_t* PP_MAT, p_atom_t* D_patoms) { + int row = blockIdx.x * blockDim.x + threadIdx.x; + if (row >= n_patoms) return; + + dvel_t dP; + dP.x = 0; + dP.y = 0; + dP.z = 0; + + for (int i = 0; i < n_patoms; i++) { + if (i != row) { + dP.x += PP_MAT[i + n_patoms * row].x; + dP.y += PP_MAT[i + n_patoms * row].y; + dP.z += PP_MAT[i + n_patoms * row].z; + } + } + + int p = D_patoms[row].a - 1; + + DV_X[p].x += dP.x; + DV_X[p].y += dP.y; + DV_X[p].z += dP.z; + + __syncthreads(); +} + +__global__ void calc_energy_sum(int rows, int columns, double* Evdw_TOT, double* Ecoul_TOT, double* Evdw, double* Ecoul, bool upper_diagonal) { + int tx = threadIdx.x; + int ty = threadIdx.y; + + __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ double 
Evdw_S[BLOCK_SIZE][BLOCK_SIZE]; + + // TODO: better way to distribute upper diagonal over threads? Seems like threads in left bottom corner have less work + double coul_TOT = 0; + double vdw_TOT = 0; + for (int i = ty; i < rows; i += BLOCK_SIZE) { + for (int j = tx; j < columns; j += BLOCK_SIZE) { + if (i <= j || !upper_diagonal) { + coul_TOT += Ecoul[i * columns + j]; + vdw_TOT += Evdw[i * columns + j]; + } + } + } + Ecoul_S[ty][tx] = coul_TOT; + Evdw_S[ty][tx] = vdw_TOT; + + __syncthreads(); + + if (tx == 0 && ty == 0) { + double Evdw_temp = 0; + double Ecoul_temp = 0; + + for (int i = 0; i < BLOCK_SIZE; i++) { + for (int j = 0; j < BLOCK_SIZE; j++) { + Evdw_temp += Evdw_S[i][j]; + Ecoul_temp += Ecoul_S[i][j]; + } + } + + *Evdw_TOT = Evdw_temp; + *Ecoul_TOT = Ecoul_temp; + } +} +} // namespace CudaNonbondedPPForce +void calc_nonbonded_pp_forces_host_v2() { + using namespace CudaNonbondedPPForce; + + int mem_size_DV_X = n_atoms_solute * sizeof(dvel_t); + + int n_blocks = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE; + + CudaContext& ctx = CudaContext::instance(); + auto X = ctx.d_coords; + auto DV_X = ctx.d_dvelocities; + auto D_ccharges = ctx.d_ccharges; + auto D_charges = ctx.d_charges; + auto D_catypes = ctx.d_catypes; + auto D_atypes = ctx.d_atypes; + auto D_patoms = ctx.d_p_atoms; + auto D_LJ_matrix = ctx.d_LJ_matrix; + auto D_excluded = ctx.d_excluded; + + dim3 threads, grid; + + threads = dim3(BLOCK_SIZE, BLOCK_SIZE); + grid = dim3((n_patoms + BLOCK_SIZE - 1) / threads.x, (n_patoms + BLOCK_SIZE - 1) / threads.y); + + calc_pp_dvel_matrix<<>>(n_patoms, n_atoms_solute, X, D_PP_Evdw, D_PP_Ecoul, PP_MAT, D_ccharges, D_charges, D_catypes, D_atypes, D_patoms, D_LJ_matrix, D_excluded, topo); + calc_pp_dvel_vector<<<((n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_patoms, DV_X, PP_MAT, D_patoms); + + cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost); + +#ifdef DEBUG + cudaMemcpy(h_PP_MAT, PP_MAT, mem_size_PP_MAT, cudaMemcpyDeviceToHost); +#endif + +#ifdef DEBUG + for (int i = 0; i < n_patoms; i++) { + for (int j = 0; j < n_patoms; j++) { + if (h_PP_MAT[i * n_patoms + j].x > 100) + printf("PP_MAT[%d][%d] = %f %f %f\n", i, j, h_PP_MAT[i * n_patoms + j].x, h_PP_MAT[i * n_patoms + j].y, h_PP_MAT[i * n_patoms + j].z); + } + } +#endif + cudaMemset(D_PP_evdw_TOT, 0, sizeof(double)); + cudaMemset(D_PP_ecoul_TOT, 0, sizeof(double)); + + calc_energy_sum<<<1, threads>>>(n_blocks, n_blocks, D_PP_evdw_TOT, D_PP_ecoul_TOT, D_PP_Evdw, D_PP_Ecoul, true); + double PP_evdw_TOT, PP_ecoul_TOT; + cudaMemcpy(&PP_evdw_TOT, D_PP_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(&PP_ecoul_TOT, D_PP_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost); + + E_nonbond_pp.Uvdw += PP_evdw_TOT; + E_nonbond_pp.Ucoul += PP_ecoul_TOT; +} + +void init_nonbonded_pp_force_kernel_data() { + using namespace CudaNonbondedPPForce; + + if (!is_initialized) { + int n_blocks = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE; + + int mem_size_PP_MAT = n_patoms * n_patoms * sizeof(dvel_t); + + int mem_size_PP_Evdw = n_blocks * n_blocks * sizeof(double); + int mem_size_PP_Ecoul = n_blocks * n_blocks * sizeof(double); +#ifdef DEBUG + printf("Allocating PP_MAT\n"); +#endif + check_cudaMalloc((void**)&PP_MAT, mem_size_PP_MAT); +#ifdef DEBUG + printf("Allocating D_PP_Evdw\n"); +#endif + check_cudaMalloc((void**)&D_PP_Evdw, mem_size_PP_Evdw); +#ifdef DEBUG + printf("Allocating D_PP_Ecoul\n"); +#endif + check_cudaMalloc((void**)&D_PP_Ecoul, mem_size_PP_Ecoul); + + check_cudaMalloc((void**)&D_PP_evdw_TOT, 
sizeof(double)); + check_cudaMalloc((void**)&D_PP_ecoul_TOT, sizeof(double)); + + is_initialized = true; + } +} + +void cleanup_nonbonded_pp_force() { + using namespace CudaNonbondedPPForce; + if (is_initialized) { + cudaFree(PP_MAT); + cudaFree(D_PP_Evdw); + cudaFree(D_PP_Ecoul); + cudaFree(D_PP_evdw_TOT); + cudaFree(D_PP_ecoul_TOT); + is_initialized = false; + } +} \ No newline at end of file diff --git a/src/core/cuda/src/CudaNonbondedPWForce.cu b/src/core/cuda/src/CudaNonbondedPWForce.cu new file mode 100644 index 00000000..dec352db --- /dev/null +++ b/src/core/cuda/src/CudaNonbondedPWForce.cu @@ -0,0 +1,380 @@ +#include "cuda/include/CudaContext.cuh" +#include "cuda/include/CudaNonbondedPWForce.cuh" + +namespace CudaNonbondedPWForce { +// Declare any necessary static variables or device pointers here +bool is_initialized = false; +struct calc_pw_t { + dvel_t P; + dvel_t W; +}; + +calc_pw_t *PW_MAT, *h_PW_MAT; +double *D_PW_Evdw, *D_PW_Ecoul; +double *D_PW_evdw_TOT, *D_PW_ecoul_TOT, PW_evdw_TOT, PW_ecoul_TOT; + +__global__ void calc_energy_sum(int rows, int columns, double* Evdw_TOT, double* Ecoul_TOT, double* Evdw, double* Ecoul, bool upper_diagonal) { + int tx = threadIdx.x; + int ty = threadIdx.y; + + __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE]; + + // TODO: better way to distribute upper diagonal over threads? Seems like threads in left bottom corner have less work + double coul_TOT = 0; + double vdw_TOT = 0; + for (int i = ty; i < rows; i += BLOCK_SIZE) { + for (int j = tx; j < columns; j += BLOCK_SIZE) { + if (i <= j || !upper_diagonal) { + coul_TOT += Ecoul[i * columns + j]; + vdw_TOT += Evdw[i * columns + j]; + } + } + } + Ecoul_S[ty][tx] = coul_TOT; + Evdw_S[ty][tx] = vdw_TOT; + + __syncthreads(); + + if (tx == 0 && ty == 0) { + double Evdw_temp = 0; + double Ecoul_temp = 0; + + for (int i = 0; i < BLOCK_SIZE; i++) { + for (int j = 0; j < BLOCK_SIZE; j++) { + Evdw_temp += Evdw_S[i][j]; + Ecoul_temp += Ecoul_S[i][j]; + } + } + + *Evdw_TOT = Evdw_temp; + *Ecoul_TOT = Ecoul_temp; + } +} + +__device__ void calc_pw_dvel_matrix_incr(int row, int pi, int column, int j, int n_atoms_solute, + coord_t* Xs, coord_t* Ws, bool* excluded_s, double* Evdw, double* Ecoul, calc_pw_t* pw, + ccharge_t* D_ccharges, charge_t* D_charges, catype_t* D_catypes, atype_t* D_atypes, p_atom_t* D_patoms, topo_t D_topo) { + coord_t da; + double r2a, ra, r6a; + double Vela, V_a, V_b; + double dva; + double qi, qj; + double ai_aii, aj_aii, ai_bii, aj_bii; + catype_t ai_type, aj_type; + atype_t i_type, j_type; + + if (excluded_s[row]) return; + qi = D_ccharges[D_charges[pi].code - 1].charge; + qj = D_ccharges[D_charges[n_atoms_solute + j].code - 1].charge; // TODO: FIX THIS!!! WILL NOT WORK WITH QATOMS!!!!! 
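// Annotation: the lookup above assumes solvent atom j sits at global
// index n_atoms_solute + j, i.e. that solvent charge codes form one
// contiguous block behind the solute atoms; the TODO flags that this
// indexing breaks once Q-atom charge handling enters the picture.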
+ + // if (pi < 100 && j < 100){ + // printf("qi = %f qj = %f\n", qi, qj); + // } + + ai_type = D_catypes[D_atypes[pi].code - 1]; + aj_type = D_catypes[D_atypes[n_atoms_solute + j].code - 1]; + + da.x = Ws[column].x - Xs[row].x; + da.y = Ws[column].y - Xs[row].y; + da.z = Ws[column].z - Xs[row].z; + r2a = 1 / (pow(da.x, 2) + pow(da.y, 2) + pow(da.z, 2)); + ra = sqrt(r2a); + r6a = r2a * r2a * r2a; + + Vela = D_topo.coulomb_constant * qi * qj * ra; + + ai_aii = ai_type.aii_normal; + aj_aii = aj_type.aii_normal; + ai_bii = ai_type.bii_normal; + aj_bii = aj_type.bii_normal; + + V_a = r6a * r6a * ai_aii * aj_aii; + V_b = r6a * ai_bii * aj_bii; + dva = r2a * (-Vela - 12 * V_a + 6 * V_b); + + pw->P.x -= dva * da.x; + pw->P.y -= dva * da.y; + pw->P.z -= dva * da.z; + + pw->W.x += dva * da.x; + pw->W.y += dva * da.y; + pw->W.z += dva * da.z; + + *Ecoul += Vela; + *Evdw += (V_a - V_b); + + // if (pi == 522 && j == 175) { + // printf("Vela = %f V_a = %f V_b = %f P = %f %f %f ai_aii = %f aj_aii = %f\n", Vela, V_a, V_b, pw->P.x, pw->P.y, pw->P.z, ai_aii, aj_aii); + // } + + // if (pi < 100 && j < 100) printf("Evdw = %f Ecoul = %f\n", *Evdw, *Ecoul); +} + +__global__ void calc_pw_dvel_vector_row(int n_patoms, int n_waters, dvel_t* DV_X, dvel_t* DV_W, calc_pw_t* PW_MAT, p_atom_t* D_patoms) { + int row = blockIdx.x * blockDim.x + threadIdx.x; + if (row >= n_patoms) return; + + dvel_t dP; + + dP.x = 0; + dP.y = 0; + dP.z = 0; + + for (int i = 0; i < 3 * n_waters; i++) { + dP.x += PW_MAT[i + 3 * n_waters * row].P.x; + dP.y += PW_MAT[i + 3 * n_waters * row].P.y; + dP.z += PW_MAT[i + 3 * n_waters * row].P.z; + } + + int p = D_patoms[row].a - 1; + + DV_X[p].x += dP.x; + DV_X[p].y += dP.y; + DV_X[p].z += dP.z; + + __syncthreads(); +} + +__global__ void calc_pw_dvel_vector_column(int n_patoms, int n_waters, dvel_t* DV_X, dvel_t* DV_W, calc_pw_t* PW_MAT) { + int column = blockIdx.x * blockDim.x + threadIdx.x; + if (column >= 3 * n_waters) return; + + dvel_t dW; + + dW.x = 0; + dW.y = 0; + dW.z = 0; + + for (int i = 0; i < n_patoms; i++) { + dW.x += PW_MAT[column + 3 * n_waters * i].W.x; + dW.y += PW_MAT[column + 3 * n_waters * i].W.y; + dW.z += PW_MAT[column + 3 * n_waters * i].W.z; + } + + DV_W[column].x += dW.x; + DV_W[column].y += dW.y; + DV_W[column].z += dW.z; + + __syncthreads(); +} + +__global__ void calc_pw_dvel_matrix(int n_patoms, int n_atoms_solute, int n_waters, + coord_t* X, coord_t* W, double* Evdw, double* Ecoul, calc_pw_t* PW_MAT, + ccharge_t* D_ccharges, charge_t* D_charges, catype_t* D_catypes, atype_t* D_atypes, p_atom_t* D_patoms, bool* D_excluded, topo_t D_topo) { + // Block index + int bx = blockIdx.x; + int by = blockIdx.y; + + // Thread index + int tx = threadIdx.x; + int ty = threadIdx.y; + + __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE]; + + Ecoul_S[ty][tx] = 0; + Evdw_S[ty][tx] = 0; + + // if (tx == 0 && ty == 0) printf("bx = %d by = %d\n", bx, by); + + // if (bx == 0 && by == 0) printf("bx = %d by = %d tx = %d ty = %d\n", bx, by, tx, ty); + + int aStart = BLOCK_SIZE * by; + int bStart = BLOCK_SIZE * bx; + + if (aStart + ty >= n_patoms) return; + if (bStart + tx >= 3 * n_waters) return; + + // if (bx == 8 && by == 1) printf("bx = %d by = %d tx = %d ty = %d\n", bx, by, tx, ty); + + __shared__ coord_t Xs[BLOCK_SIZE]; + __shared__ coord_t Ws[BLOCK_SIZE]; + __shared__ bool excluded_s[BLOCK_SIZE]; + + int pi = D_patoms[aStart + ty].a - 1; + + Xs[ty] = X[pi]; + Ws[tx] = W[bStart + tx]; + + if (tx == 0) { + excluded_s[ty] = 
D_excluded[pi]; + } + + __syncthreads(); + + calc_pw_t pw; + memset(&pw, 0, sizeof(calc_pw_t)); + + int row = by * BLOCK_SIZE + ty; + int column = bx * BLOCK_SIZE + tx; + + // if (row == 0 && column == 1) { + // printf("Xs[0] = %f\n", Xs[0]); + // printf("Ys[0] = %f\n", Ys[0]); + // printf("Xs[1] = %f\n", Xs[1]); + // printf("Ys[1] = %f\n", Ys[1]); + // printf("Xs[2] = %f\n", Xs[2]); + // printf("Ys[2] = %f\n", Ys[2]); + // printf("Xs[3] = %f\n", Xs[3]); + // printf("Ys[3] = %f\n", Ys[3]); + // printf("Xs[4] = %f\n", Xs[4]); + // printf("Ys[4] = %f\n", Ys[4]); + + // printf("Ys[%d] = %f Xs[%d] = %f\n", 3 * ty, Ys[3 * ty], 3 * tx, Xs[3 * tx]); + // } + + // if (bx == 8 && by == 1) printf("bx = %d by = %d tx = %d ty = %d\n", bx, by, tx, ty); + // __device__ void calc_pw_dvel_matrix_incr(int row, int pi, int column, int j, int n_patoms, + // coord_t *Ps, coord_t *Xs, double *Evdw, double *Ecoul, calc_pw_t *pw, + // ccharge_t *D_ccharges, charge_t *D_charges, catype_t *D_catypes, atype_t *D_atypes, p_atom_t *D_patoms) + double evdw = 0, ecoul = 0; + calc_pw_dvel_matrix_incr(ty, pi, tx, bStart + tx, n_atoms_solute, Xs, Ws, excluded_s, &evdw, &ecoul, &pw, D_ccharges, D_charges, D_catypes, D_atypes, D_patoms, D_topo); + Evdw_S[ty][tx] = evdw; + Ecoul_S[ty][tx] = ecoul; + + // if (row == 0 && column == 1) { + // printf("water_a = %f %f %f water_b = %f %f %f\n", water_a[0].x, water_a[0].y, water_a[0].z, water_b[0].x, water_b[0].y, water_b[0].z); + // } + + // if (bx == 8 && by == 1) printf("n_qatoms = %d\n", n_qatoms); + // if (bx == 8 && by == 1) printf("qi = %d j = %d charge[%d] = %f\n", row, column, row + n_qatoms, D_qcharges[row + n_qatoms * 1].q); + + PW_MAT[column + 3 * n_waters * row] = pw; + + __syncthreads(); + + int rowlen = (3 * n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE; + + if (tx == 0 && ty == 0) { + double tot_Evdw = 0; + double tot_Ecoul = 0; + for (int i = 0; i < BLOCK_SIZE; i++) { + for (int j = 0; j < BLOCK_SIZE; j++) { + tot_Evdw += Evdw_S[i][j]; + tot_Ecoul += Ecoul_S[i][j]; + } + } + Evdw[rowlen * by + bx] = tot_Evdw; + Ecoul[rowlen * by + bx] = tot_Ecoul; + } + + __syncthreads(); +} + +} // namespace CudaNonbondedPWForce + +void calc_nonbonded_pw_forces_host_v2() { + using namespace CudaNonbondedPWForce; + int mem_size_W = 3 * n_waters * sizeof(coord_t); + int mem_size_DV_W = 3 * n_waters * sizeof(dvel_t); + int mem_size_DV_X = n_atoms_solute * sizeof(dvel_t); + int mem_size_PW_MAT = 3 * n_waters * n_patoms * sizeof(calc_pw_t); + + int n_blocks_p = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE; + int n_blocks_w = (3 * n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE; + + int mem_size_PW_Evdw = n_blocks_p * n_blocks_w * sizeof(double); + int mem_size_PW_Ecoul = n_blocks_p * n_blocks_w * sizeof(double); + + CudaContext& ctx = CudaContext::instance(); + auto X = ctx.d_coords; + auto DV_X = ctx.d_dvelocities; + auto D_ccharges = ctx.d_ccharges; + auto D_charges = ctx.d_charges; + auto D_catypes = ctx.d_catypes; + auto D_atypes = ctx.d_atypes; + auto D_patoms = ctx.d_p_atoms; + auto D_excluded = ctx.d_excluded; + + dim3 threads, grid; + + threads = dim3(BLOCK_SIZE, BLOCK_SIZE); + grid = dim3((3 * n_waters + BLOCK_SIZE - 1) / threads.x, (n_patoms + BLOCK_SIZE - 1) / threads.y); + + calc_pw_dvel_matrix<<>>(n_patoms, n_atoms_solute, n_waters, X, X + n_atoms_solute, D_PW_Evdw, D_PW_Ecoul, PW_MAT, + D_ccharges, D_charges, D_catypes, D_atypes, D_patoms, D_excluded, topo); + calc_pw_dvel_vector_column<<<((3 * n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_patoms, n_waters, DV_X, DV_X + 
n_atoms_solute, PW_MAT); + calc_pw_dvel_vector_row<<<((n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_patoms, n_waters, DV_X, DV_X + n_atoms_solute, PW_MAT, D_patoms); + +#ifdef DEBUG + cudaMemcpy(h_PW_MAT, PW_MAT, mem_size_PW_MAT, cudaMemcpyDeviceToHost); +#endif + +// for (int i = 0; i < n_waters; i++) { +// printf("X[%d] = %f %f %f\n", i, coords[i].x, coords[i].y, coords[i].z); +// } + +// printf("n_patoms = %d n_watoms = %d\n", n_patoms, 3 * n_waters); +#ifdef DEBUG + for (int i = 0; i < n_patoms; i++) { + for (int j = 0; j < 3 * n_waters; j++) { + if (h_PW_MAT[3 * i * n_waters + j].P.x > 100) + printf("PW_MAT[%d][%d].P = %f %f %f\n", i, j, h_PW_MAT[3 * i * n_waters + j].P.x, h_PW_MAT[3 * i * n_waters + j].P.y, h_PW_MAT[3 * i * n_waters + j].P.z); + } + } +#endif + + cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost); + cudaMemcpy(&dvelocities[n_atoms_solute], DV_X + n_atoms_solute, mem_size_DV_W, cudaMemcpyDeviceToHost); + + calc_energy_sum<<<1, threads>>>(n_blocks_p, n_blocks_w, D_PW_evdw_TOT, D_PW_ecoul_TOT, D_PW_Evdw, D_PW_Ecoul, false); + + cudaMemcpy(&PW_evdw_TOT, D_PW_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(&PW_ecoul_TOT, D_PW_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost); + + E_nonbond_pw.Uvdw += PW_evdw_TOT; + E_nonbond_pw.Ucoul += PW_ecoul_TOT; + + // for (int i = 0; i < n_atoms; i++) { + // printf("dvelocities[%d] = %f %f %f\n", i, dvelocities[i].x, dvelocities[i].y, dvelocities[i].z); + // } +} + +void init_nonbonded_pw_force_kernel_data() { + using namespace CudaNonbondedPWForce; + if (!is_initialized) { + int mem_size_PW_MAT = 3 * n_waters * n_patoms * sizeof(calc_pw_t); + + int n_blocks_p = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE; + int n_blocks_w = (3 * n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE; + int mem_size_PW_Evdw = n_blocks_p * n_blocks_w * sizeof(double); + int mem_size_PW_Ecoul = n_blocks_p * n_blocks_w * sizeof(double); +#ifdef DEBUG + printf("Allocating PW_MAT\n"); +#endif + check_cudaMalloc((void**)&PW_MAT, mem_size_PW_MAT); + +#ifdef DEBUG + printf("Allocating D_PW_Evdw\n"); +#endif + check_cudaMalloc((void**)&D_PW_Evdw, mem_size_PW_Evdw); +#ifdef DEBUG + printf("Allocating D_PW_Ecoul\n"); +#endif + check_cudaMalloc((void**)&D_PW_Ecoul, mem_size_PW_Ecoul); + + check_cudaMalloc((void**)&D_PW_evdw_TOT, sizeof(double)); + check_cudaMalloc((void**)&D_PW_ecoul_TOT, sizeof(double)); + +#ifdef DEBUG + printf("All GPU solvent memory allocated\n"); +#endif + + h_PW_MAT = (calc_pw_t*)malloc(mem_size_PW_MAT); + is_initialized = true; + } +} + +void cleanup_nonbonded_pw_force() { + using namespace CudaNonbondedPWForce; + if (is_initialized) { + cudaFree(PW_MAT); + cudaFree(D_PW_Evdw); + cudaFree(D_PW_Ecoul); + cudaFree(D_PW_evdw_TOT); + cudaFree(D_PW_ecoul_TOT); + free(h_PW_MAT); + is_initialized = false; + } +} \ No newline at end of file diff --git a/src/core/cuda/src/CudaNonbondedQPForce.cu b/src/core/cuda/src/CudaNonbondedQPForce.cu new file mode 100644 index 00000000..4eb53908 --- /dev/null +++ b/src/core/cuda/src/CudaNonbondedQPForce.cu @@ -0,0 +1,365 @@ +#include + +#include "cuda/include/CudaContext.cuh" +#include "cuda/include/CudaNonbondedQPForce.cuh" +#include "system.h" + +namespace CudaNonbondedQPForce { +struct calc_qw_t { + dvel_t Q; + dvel_t O; + dvel_t H1; + dvel_t H2; +}; + +struct calc_qp_t { + dvel_t Q; + dvel_t P; +}; +bool is_initialized = false; +double *D_QP_Evdw, *D_QP_Ecoul, *h_QP_Evdw, *h_QP_Ecoul; +calc_qp_t *QP_MAT, *h_QP_MAT; +double *D_QP_evdw_TOT, *D_QP_ecoul_TOT, QP_evdw_TOT, 
diff --git a/src/core/cuda/src/CudaNonbondedQPForce.cu b/src/core/cuda/src/CudaNonbondedQPForce.cu new file mode 100644 index 00000000..4eb53908 --- /dev/null +++ b/src/core/cuda/src/CudaNonbondedQPForce.cu @@ -0,0 +1,365 @@ +#include + +#include "cuda/include/CudaContext.cuh" +#include "cuda/include/CudaNonbondedQPForce.cuh" +#include "system.h" + +namespace CudaNonbondedQPForce { +struct calc_qp_t { + dvel_t Q; + dvel_t P; +}; +bool is_initialized = false; +double *D_QP_Evdw, *D_QP_Ecoul, *h_QP_Evdw, *h_QP_Ecoul; +calc_qp_t *QP_MAT, *h_QP_MAT; +double *D_QP_evdw_TOT, *D_QP_ecoul_TOT, QP_evdw_TOT, QP_ecoul_TOT; + +// General +__global__ void calc_energy_sum(int rows, int columns, double* Evdw_TOT, double* Ecoul_TOT, double* Evdw, double* Ecoul, bool upper_diagonal) { + int tx = threadIdx.x; + int ty = threadIdx.y; + + __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE]; + + // TODO: better way to distribute upper diagonal over threads? Seems like threads in left bottom corner have less work + double coul_TOT = 0; + double vdw_TOT = 0; + for (int i = ty; i < rows; i += BLOCK_SIZE) { + for (int j = tx; j < columns; j += BLOCK_SIZE) { + if (i <= j || !upper_diagonal) { + coul_TOT += Ecoul[i * columns + j]; + vdw_TOT += Evdw[i * columns + j]; + } + } + } + Ecoul_S[ty][tx] = coul_TOT; + Evdw_S[ty][tx] = vdw_TOT; + + __syncthreads(); + + if (tx == 0 && ty == 0) { + double Evdw_temp = 0; + double Ecoul_temp = 0; + + for (int i = 0; i < BLOCK_SIZE; i++) { + for (int j = 0; j < BLOCK_SIZE; j++) { + Evdw_temp += Evdw_S[i][j]; + Ecoul_temp += Ecoul_S[i][j]; + } + } + + *Evdw_TOT = Evdw_temp; + *Ecoul_TOT = Ecoul_temp; + } +} +__global__ void calc_qp_dvel_vector_row(int n_qatoms, int n_patoms, dvel_t* DV_X, calc_qp_t* QP_MAT, q_atom_t* D_qatoms) { + int row = blockIdx.x * blockDim.x + threadIdx.x; + if (row >= n_qatoms) return; + + dvel_t dQ; + + dQ.x = 0; + dQ.y = 0; + dQ.z = 0; + + for (int i = 0; i < n_patoms; i++) { + dQ.x += QP_MAT[i + n_patoms * row].Q.x; + dQ.y += QP_MAT[i + n_patoms * row].Q.y; + dQ.z += QP_MAT[i + n_patoms * row].Q.z; + } + + int q = D_qatoms[row].a - 1; + + DV_X[q].x += dQ.x; + DV_X[q].y += dQ.y; + DV_X[q].z += dQ.z; + + __syncthreads(); +} + +__global__ void calc_qp_dvel_vector_column(int n_qatoms, int n_patoms, dvel_t* DV_X, calc_qp_t* QP_MAT, p_atom_t* D_patoms) { + int column = blockIdx.x * blockDim.x + threadIdx.x; + if (column >= n_patoms) return; + + dvel_t dP; + + dP.x = 0; + dP.y = 0; + dP.z = 0; + + for (int i = 0; i < n_qatoms; i++) { + dP.x += QP_MAT[column + n_patoms * i].P.x; + dP.y += QP_MAT[column + n_patoms * i].P.y; + dP.z += QP_MAT[column + n_patoms * i].P.z; + } + + int p = D_patoms[column].a - 1; + + DV_X[p].x += dP.x; + DV_X[p].y += dP.y; + DV_X[p].z += dP.z; + + __syncthreads(); +} + +__device__ void calc_qp_dvel_matrix_incr(int row, int qi, int column, int pj, int n_lambdas, int n_qatoms, + coord_t* Qs, coord_t* Ps, int* LJs, bool* excluded_s, double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE], double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE], calc_qp_t* qp, + q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, p_atom_t* D_patoms, q_atom_t* D_qatoms, double* D_lambdas, + catype_t* D_catypes, atype_t* D_atypes, ccharge_t* D_ccharges, charge_t* D_charges, topo_t D_topo) { + coord_t da; + double r2, r6, r; + double ai_aii, aj_aii, ai_bii, aj_bii; + q_catype_t qi_type; + catype_t aj_type; + bool bond23, bond14; + double scaling, Vel, V_a, V_b, dv; + + int j = D_patoms[pj].a - 1; + + bond23 = LJs[row * BLOCK_SIZE + column] == 3; + bond14 = LJs[row * BLOCK_SIZE + column] == 1; + + if (bond23) return; + if (excluded_s[row] || excluded_s[BLOCK_SIZE + column]) return; + + scaling = bond14 ?
D_topo.el14_scale : 1; + + da.x = Qs[row].x - Ps[column].x; + da.y = Qs[row].y - Ps[column].y; + da.z = Qs[row].z - Ps[column].z; + + r2 = pow(da.x, 2) + pow(da.y, 2) + pow(da.z, 2); + + r6 = r2 * r2 * r2; + r2 = 1 / r2; + r = sqrt(r2); + + for (int state = 0; state < n_lambdas; state++) { + qi_type = D_qcatypes[D_qatypes[qi + n_qatoms * state].code - 1]; + aj_type = D_catypes[D_atypes[j].code - 1]; + + ai_aii = bond14 ? qi_type.Ai_14 : qi_type.Ai; + aj_aii = bond14 ? aj_type.aii_1_4 : aj_type.aii_normal; + ai_bii = bond14 ? qi_type.Bi_14 : qi_type.Bi; + aj_bii = bond14 ? aj_type.bii_1_4 : aj_type.bii_normal; + + Vel = D_topo.coulomb_constant * scaling * D_qcharges[qi + n_qatoms * state].q * D_ccharges[D_charges[j].code - 1].charge * r; + V_a = ai_aii * aj_aii / (r6 * r6); + V_b = ai_bii * aj_bii / r6; + dv = r2 * (-Vel - (12 * V_a - 6 * V_b)) * D_lambdas[state]; + + // if (state == 0 && qi == 0 && pj == 1) { + // printf("crg_q = %f crg_j = %f r = %f\n", D_qcharges[qi + n_qatoms * state].q, D_ccharges[D_charges[pj].code - 1].charge, r); + // printf("ai_aii = %f aj_aii = %f ai_bii = %f aj_bii = %f\n", ai_aii, aj_aii, ai_bii, aj_bii); + // } + + // Update forces + qp->Q.x += dv * da.x; + qp->Q.y += dv * da.y; + qp->Q.z += dv * da.z; + qp->P.x -= dv * da.x; + qp->P.y -= dv * da.y; + qp->P.z -= dv * da.z; + + // Update Q totals + Ecoul_S[row][state * BLOCK_SIZE + column] += Vel; + Evdw_S[row][state * BLOCK_SIZE + column] += (V_a - V_b); + } +} + +__global__ void calc_qp_dvel_matrix(int n_qatoms, int n_patoms, int n_lambdas, int n_atoms_solute, + coord_t* X, double* Evdw, double* Ecoul, calc_qp_t* QP_MAT, + q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, p_atom_t* D_patoms, q_atom_t* D_qatoms, double* D_lambdas, + int* D_LJ_matrix, bool* D_excluded, catype_t* D_catypes, atype_t* D_atypes, ccharge_t* D_ccharges, charge_t* D_charges, topo_t D_topo) { + // Block index + int bx = blockIdx.x; + int by = blockIdx.y; + + // Thread index + int tx = threadIdx.x; + int ty = threadIdx.y; + + // TODO implement >2 states on GPU + __shared__ double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE]; + __shared__ double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE]; + + Ecoul_S[ty][tx] = 0; + Evdw_S[ty][tx] = 0; + Ecoul_S[ty][tx + BLOCK_SIZE] = 0; + Evdw_S[ty][tx + BLOCK_SIZE] = 0; + + int aStart = BLOCK_SIZE * by; + int bStart = BLOCK_SIZE * bx; + + if (aStart + ty >= n_qatoms) return; + if (bStart + tx >= n_patoms) return; + + int qi = D_qatoms[aStart + ty].a - 1; + int pj = D_patoms[bStart + tx].a - 1; + + __shared__ coord_t Qs[BLOCK_SIZE]; + __shared__ coord_t Ps[BLOCK_SIZE]; + __shared__ int LJs[BLOCK_SIZE * BLOCK_SIZE]; + __shared__ bool excluded_s[2 * BLOCK_SIZE]; + + if (tx == 0) { + Qs[ty] = X[qi]; + excluded_s[ty] = D_excluded[qi]; + } + + if (ty == 0) { + Ps[tx] = X[pj]; + excluded_s[BLOCK_SIZE + tx] = D_excluded[pj]; + } + LJs[ty * BLOCK_SIZE + tx] = D_LJ_matrix[qi * n_atoms_solute + pj]; + + __syncthreads(); + + calc_qp_t qp; + memset(&qp, 0, sizeof(calc_qp_t)); + + int row = by * BLOCK_SIZE + ty; + int column = bx * BLOCK_SIZE + tx; + + calc_qp_dvel_matrix_incr(ty, row, tx, column, n_lambdas, n_qatoms, Qs, Ps, LJs, excluded_s, Evdw_S, Ecoul_S, + &qp, D_qcatypes, D_qatypes, D_qcharges, D_patoms, D_qatoms, D_lambdas, D_catypes, D_atypes, D_ccharges, D_charges, D_topo); + + QP_MAT[n_patoms * row + column] = qp; + + __syncthreads(); + + int rowlen = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE; + int collen = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE; + + if (tx == 0 && ty == 0) { + // TODO implement >2 
states on GPU + for (int state = 0; state < min(2, n_lambdas); state++) { + double tot_Evdw = 0; + double tot_Ecoul = 0; + for (int i = 0; i < BLOCK_SIZE; i++) { + for (int j = 0; j < BLOCK_SIZE; j++) { + tot_Evdw += Evdw_S[i][j + state * BLOCK_SIZE]; + tot_Ecoul += Ecoul_S[i][j + state * BLOCK_SIZE]; + } + } + Evdw[rowlen * collen * state + rowlen * by + bx] = tot_Evdw; + Ecoul[rowlen * collen * state + rowlen * by + bx] = tot_Ecoul; + } + } + + __syncthreads(); +} + +} // namespace CudaNonbondedQPForce + +void calc_nonbonded_qp_forces_host_v2() { + using namespace CudaNonbondedQPForce; + + int n_blocks_q = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE; + int n_blocks_p = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE; + + CudaContext& ctx = CudaContext::instance(); + auto X = ctx.d_coords; + auto DV_X = ctx.d_dvelocities; + auto D_qcatypes = ctx.d_q_catypes; + auto D_qatypes = ctx.d_q_atypes; + auto D_qcharges = ctx.d_q_charges; + auto D_patoms = ctx.d_p_atoms; + auto D_qatoms = ctx.d_q_atoms; + auto D_lambdas = ctx.d_lambdas; + auto D_LJ_matrix = ctx.d_LJ_matrix; + auto D_excluded = ctx.d_excluded; + auto D_catypes = ctx.d_catypes; + auto D_atypes = ctx.d_atypes; + auto D_ccharges = ctx.d_ccharges; + auto D_charges = ctx.d_charges; + + dim3 threads, grid; + + threads = dim3(BLOCK_SIZE, BLOCK_SIZE); + grid = dim3((n_patoms + BLOCK_SIZE - 1) / threads.x, (n_qatoms + BLOCK_SIZE - 1) / threads.y); + + calc_qp_dvel_matrix<<<grid, threads>>>(n_qatoms, n_patoms, n_lambdas, n_atoms_solute, X, D_QP_Evdw, D_QP_Ecoul, QP_MAT, + D_qcatypes, D_qatypes, D_qcharges, D_patoms, D_qatoms, D_lambdas, D_LJ_matrix, D_excluded, + D_catypes, D_atypes, D_ccharges, D_charges, topo); + calc_qp_dvel_vector_column<<<((n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_patoms, DV_X, QP_MAT, D_patoms); + calc_qp_dvel_vector_row<<<((n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_patoms, DV_X, QP_MAT, D_qatoms); + +#ifdef DEBUG + int mem_size_QP_MAT = n_qatoms * n_patoms * sizeof(calc_qp_t); + cudaMemcpy(h_QP_MAT, QP_MAT, mem_size_QP_MAT, cudaMemcpyDeviceToHost); +#endif + +#ifdef DEBUG + for (int i = 0; i < n_qatoms; i++) { + for (int j = 0; j < n_patoms; j++) { + if (i == 0) + // if (h_QP_MAT[i * n_patoms + j].Q.x > 100) + printf("QP_MAT[%d][%d].Q = %f %f %f\n", i, j, h_QP_MAT[i * n_patoms + j].Q.x, h_QP_MAT[i * n_patoms + j].Q.y, h_QP_MAT[i * n_patoms + j].Q.z); + } + } +#endif + int mem_size_DV_X = n_atoms * sizeof(dvel_t); + cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost); + + // TODO: make Evdw & Ecoul work for # of states > 2 + for (int state = 0; state < min(2, n_lambdas); state++) { + calc_energy_sum<<<1, threads>>>(n_blocks_q, n_blocks_p, D_QP_evdw_TOT, D_QP_ecoul_TOT, &D_QP_Evdw[state * n_blocks_p * n_blocks_q], &D_QP_Ecoul[state * n_blocks_p * n_blocks_q], false); + + cudaMemcpy(&QP_evdw_TOT, D_QP_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(&QP_ecoul_TOT, D_QP_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost); + + EQ_nonbond_qp[state].Uvdw += QP_evdw_TOT; + EQ_nonbond_qp[state].Ucoul += QP_ecoul_TOT; + } +} + +void init_nonbonded_qp_force_kernel_data() { + using namespace CudaNonbondedQPForce; + + if (!is_initialized) { + int n_blocks_q = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE; + int n_blocks_p = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE; + // TODO make Evdw & Ecoul work for # of states > 2 + int mem_size_QP_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_p * sizeof(double); + int mem_size_QP_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_p * sizeof(double); + int mem_size_QP_MAT = n_qatoms * n_patoms *
sizeof(calc_qp_t); + + check_cudaMalloc((void**)&D_QP_Evdw, mem_size_QP_Evdw); + check_cudaMalloc((void**)&D_QP_Ecoul, mem_size_QP_Ecoul); + h_QP_Evdw = (double*)malloc(mem_size_QP_Evdw); + h_QP_Ecoul = (double*)malloc(mem_size_QP_Ecoul); + + check_cudaMalloc((void**)&QP_MAT, mem_size_QP_MAT); + h_QP_MAT = (calc_qp_t*)malloc(mem_size_QP_MAT); + + check_cudaMalloc((void**)&D_QP_evdw_TOT, sizeof(double)); + check_cudaMalloc((void**)&D_QP_ecoul_TOT, sizeof(double)); + + is_initialized = true; + } +} + +void cleanup_nonbonded_qp_force() { + using namespace CudaNonbondedQPForce; + + if (is_initialized) { + cudaFree(D_QP_Evdw); + cudaFree(D_QP_Ecoul); + free(h_QP_Evdw); + free(h_QP_Ecoul); + cudaFree(QP_MAT); + free(h_QP_MAT); + cudaFree(D_QP_evdw_TOT); + cudaFree(D_QP_ecoul_TOT); + + is_initialized = false; + } +} \ No newline at end of file diff --git a/src/core/cuda/src/CudaNonbondedQQForce.cu b/src/core/cuda/src/CudaNonbondedQQForce.cu index ed2fc54e..e2138ed6 100644 --- a/src/core/cuda/src/CudaNonbondedQQForce.cu +++ b/src/core/cuda/src/CudaNonbondedQQForce.cu @@ -1,19 +1,6 @@ +#include "cuda/include/CudaContext.cuh" #include "cuda/include/CudaNonbondedQQForce.cuh" #include "utils.h" -namespace CudaNonbondedQQForce { -bool is_initialized = false; -q_atom_t* d_q_atoms = nullptr; -q_charge_t* d_q_charges = nullptr; -int* d_LJ_matrix = nullptr; -bool* d_excluded = nullptr; -q_elscale_t* d_q_elscales = nullptr; -q_catype_t* d_q_catypes = nullptr; -q_atype_t* d_q_atypes = nullptr; -coord_t* d_coords = nullptr; -E_nonbonded_t* d_EQ_nonbond_qq = nullptr; -dvel_t* d_dvelocities = nullptr; -double* d_lambdas = nullptr; -} // namespace CudaNonbondedQQForce __global__ void calc_nonbonded_qq_forces_kernel( q_atom_t* q_atoms, @@ -107,34 +94,20 @@ __global__ void calc_nonbonded_qq_forces_kernel( } void calc_nonbonded_qq_forces_host() { - using namespace CudaNonbondedQQForce; - if (!is_initialized) { - check_cudaMalloc((void**)&d_q_atoms, sizeof(q_atom_t) * n_qatoms); - check_cudaMalloc((void**)&d_q_charges, sizeof(q_charge_t) * n_qatoms * n_lambdas); - check_cudaMalloc((void**)&d_LJ_matrix, sizeof(int) * n_atoms_solute * n_atoms_solute); - check_cudaMalloc((void**)&d_excluded, sizeof(bool) * n_atoms); - check_cudaMalloc((void**)&d_q_elscales, sizeof(q_elscale_t) * n_qelscales); - check_cudaMalloc((void**)&d_q_catypes, sizeof(q_catype_t) * n_qcatypes); - check_cudaMalloc((void**)&d_q_atypes, sizeof(q_atype_t) * n_qatoms * n_lambdas); - check_cudaMalloc((void**)&d_coords, sizeof(coord_t) * n_atoms); - check_cudaMalloc((void**)&d_EQ_nonbond_qq, sizeof(E_nonbonded_t) * n_lambdas); - check_cudaMalloc((void**)&d_dvelocities, sizeof(dvel_t) * n_atoms); - check_cudaMalloc((void**)&d_lambdas, sizeof(double) * n_lambdas); - - cudaMemcpy(d_q_atoms, q_atoms, sizeof(q_atom_t) * n_qatoms, cudaMemcpyHostToDevice); - cudaMemcpy(d_q_charges, q_charges, sizeof(q_charge_t) * n_qatoms * n_lambdas, cudaMemcpyHostToDevice); - cudaMemcpy(d_LJ_matrix, LJ_matrix, sizeof(int) * n_atoms_solute * n_atoms_solute, cudaMemcpyHostToDevice); - cudaMemcpy(d_excluded, excluded, sizeof(bool) * n_atoms, cudaMemcpyHostToDevice); - cudaMemcpy(d_q_elscales, q_elscales, sizeof(q_elscale_t) * n_qelscales, cudaMemcpyHostToDevice); - cudaMemcpy(d_q_catypes, q_catypes, sizeof(q_catype_t) * n_qcatypes, cudaMemcpyHostToDevice); - cudaMemcpy(d_q_atypes, q_atypes, sizeof(q_atype_t) * n_qatoms * n_lambdas, cudaMemcpyHostToDevice); - cudaMemcpy(d_lambdas, lambdas, sizeof(double) * n_lambdas, cudaMemcpyHostToDevice); - - is_initialized = true; - } - 
cudaMemcpy(d_coords, coords, sizeof(coord_t) * n_atoms, cudaMemcpyHostToDevice); - cudaMemcpy(d_dvelocities, dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyHostToDevice); - cudaMemcpy(d_EQ_nonbond_qq, EQ_nonbond_qq, sizeof(E_nonbonded_t) * n_lambdas, cudaMemcpyHostToDevice); + CudaContext& ctx = CudaContext::instance(); + // ctx.sync_all_to_device(); + auto d_q_atoms = ctx.d_q_atoms; + auto d_q_charges = ctx.d_q_charges; + auto d_LJ_matrix = ctx.d_LJ_matrix; + auto d_excluded = ctx.d_excluded; + auto d_q_elscales = ctx.d_q_elscales; + auto d_q_catypes = ctx.d_q_catypes; + auto d_q_atypes = ctx.d_q_atypes; + auto d_coords = ctx.d_coords; + auto d_EQ_nonbond_qq = ctx.d_EQ_nonbond_qq; + auto d_dvelocities = ctx.d_dvelocities; + auto d_lambdas = ctx.d_lambdas; + int blockSize = 256; int numBlocks = (n_qatoms + blockSize - 1) / blockSize; calc_nonbonded_qq_forces_kernel<<<numBlocks, blockSize>>>( @@ -157,20 +130,6 @@ void calc_nonbonded_qq_forces_host() { cudaMemcpy(EQ_nonbond_qq, d_EQ_nonbond_qq, sizeof(E_nonbonded_t) * n_lambdas, cudaMemcpyDeviceToHost); cudaMemcpy(dvelocities, d_dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyDeviceToHost); } -void cleanup_nonbonded_qq_force() { - using namespace CudaNonbondedQQForce; - if (is_initialized) { - cudaFree(d_q_atoms); - cudaFree(d_q_charges); - cudaFree(d_LJ_matrix); - cudaFree(d_excluded); - cudaFree(d_q_elscales); - cudaFree(d_q_catypes); - cudaFree(d_q_atypes); - cudaFree(d_coords); - cudaFree(d_EQ_nonbond_qq); - cudaFree(d_dvelocities); - cudaFree(d_lambdas); - is_initialized = false; - } -} + +void init_nonbonded_qq_force_kernel_data() {} +void cleanup_nonbonded_qq_force() {} \ No newline at end of file diff --git a/src/core/cuda/src/CudaNonbondedQWForce.cu b/src/core/cuda/src/CudaNonbondedQWForce.cu new file mode 100644 index 00000000..974c380c --- /dev/null +++ b/src/core/cuda/src/CudaNonbondedQWForce.cu @@ -0,0 +1,404 @@ +#include + +#include "cuda/include/CudaContext.cuh" +#include "cuda/include/CudaNonbondedQWForce.cuh" +#include "utils.h" + +namespace CudaNonbondedQWForce { +struct calc_qw_t { + dvel_t Q; + dvel_t O; + dvel_t H1; + dvel_t H2; +}; +bool is_initialized = false; +calc_qw_t *QW_MAT, *h_QW_MAT; + +double *D_QW_Evdw, *D_QW_Ecoul, *h_QW_Evdw, *h_QW_Ecoul; +double *D_QW_evdw_TOT, *D_QW_ecoul_TOT, QW_evdw_TOT, QW_ecoul_TOT; + +// General +__global__ void calc_energy_sum(int rows, int columns, double* Evdw_TOT, double* Ecoul_TOT, double* Evdw, double* Ecoul, bool upper_diagonal) { + int tx = threadIdx.x; + int ty = threadIdx.y; + + __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE]; + + // TODO: better way to distribute upper diagonal over threads?
Seems like threads in left bottom corner have less work + double coul_TOT = 0; + double vdw_TOT = 0; + for (int i = ty; i < rows; i += BLOCK_SIZE) { + for (int j = tx; j < columns; j += BLOCK_SIZE) { + if (i <= j || !upper_diagonal) { + coul_TOT += Ecoul[i * columns + j]; + vdw_TOT += Evdw[i * columns + j]; + } + } + } + Ecoul_S[ty][tx] = coul_TOT; + Evdw_S[ty][tx] = vdw_TOT; + + __syncthreads(); + + if (tx == 0 && ty == 0) { + double Evdw_temp = 0; + double Ecoul_temp = 0; + + for (int i = 0; i < BLOCK_SIZE; i++) { + for (int j = 0; j < BLOCK_SIZE; j++) { + Evdw_temp += Evdw_S[i][j]; + Ecoul_temp += Ecoul_S[i][j]; + } + } + + *Evdw_TOT = Evdw_temp; + *Ecoul_TOT = Ecoul_temp; + } +} + +__global__ void calc_qw_dvel_vector_row(int n_qatoms, int n_waters, dvel_t* DV_X, dvel_t* DV_W, calc_qw_t* MAT, q_atom_t* D_qatoms) { + int row = blockIdx.x * blockDim.x + threadIdx.x; + if (row >= n_qatoms) return; + + dvel_t dQ; + + dQ.x = 0; + dQ.y = 0; + dQ.z = 0; + + for (int i = 0; i < n_waters; i++) { + dQ.x += MAT[i + n_waters * row].Q.x; + dQ.y += MAT[i + n_waters * row].Q.y; + dQ.z += MAT[i + n_waters * row].Q.z; + } + + int q = D_qatoms[row].a - 1; + + DV_X[q].x += dQ.x; + DV_X[q].y += dQ.y; + DV_X[q].z += dQ.z; + + __syncthreads(); +} + +__global__ void calc_qw_dvel_vector_column(int n_qatoms, int n_waters, dvel_t* DV_X, dvel_t* DV_W, calc_qw_t* MAT) { + int column = blockIdx.x * blockDim.x + threadIdx.x; + if (column >= n_waters) return; + + dvel_t dO, dH1, dH2; + + dO.x = 0; + dO.y = 0; + dO.z = 0; + dH1.x = 0; + dH1.y = 0; + dH1.z = 0; + dH2.x = 0; + dH2.y = 0; + dH2.z = 0; + + for (int i = 0; i < n_qatoms; i++) { + dO.x += MAT[column + n_waters * i].O.x; + dO.y += MAT[column + n_waters * i].O.y; + dO.z += MAT[column + n_waters * i].O.z; + dH1.x += MAT[column + n_waters * i].H1.x; + dH1.y += MAT[column + n_waters * i].H1.y; + dH1.z += MAT[column + n_waters * i].H1.z; + dH2.x += MAT[column + n_waters * i].H2.x; + dH2.y += MAT[column + n_waters * i].H2.y; + dH2.z += MAT[column + n_waters * i].H2.z; + } + + DV_W[3 * column].x += dO.x; + DV_W[3 * column].y += dO.y; + DV_W[3 * column].z += dO.z; + DV_W[3 * column + 1].x += dH1.x; + DV_W[3 * column + 1].y += dH1.y; + DV_W[3 * column + 1].z += dH1.z; + DV_W[3 * column + 2].x += dH2.x; + DV_W[3 * column + 2].y += dH2.y; + DV_W[3 * column + 2].z += dH2.z; + + __syncthreads(); +} + +__device__ void calc_qw_dvel_matrix_incr(int row, int qi, int column, int n_lambdas, int n_qatoms, double crg_ow, double crg_hw, double A_O, double B_O, + coord_t* Qs, coord_t* Ws, double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE], double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE], calc_qw_t* qw, + q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, q_atom_t* D_qatoms, double* D_lambdas, topo_t D_topo) { + int j; + coord_t dO, dH1, dH2; + double r2O, rH1, rH2, r6O, rO, r2H1, r2H2; + double dvO, dvH1, dvH2; + double V_a, V_b, VelO, VelH1, VelH2; + q_atype_t qa_type; + q_catype_t qi_type; + double ai_aii, ai_bii; + + j = 3 * column; + dO.x = Ws[j].x - Qs[row].x; + dO.y = Ws[j].y - Qs[row].y; + dO.z = Ws[j].z - Qs[row].z; + dH1.x = Ws[j + 1].x - Qs[row].x; + dH1.y = Ws[j + 1].y - Qs[row].y; + dH1.z = Ws[j + 1].z - Qs[row].z; + dH2.x = Ws[j + 2].x - Qs[row].x; + dH2.y = Ws[j + 2].y - Qs[row].y; + dH2.z = Ws[j + 2].z - Qs[row].z; + + r2O = pow(dO.x, 2) + pow(dO.y, 2) + pow(dO.z, 2); + rH1 = sqrt(1.0 / (pow(dH1.x, 2) + pow(dH1.y, 2) + pow(dH1.z, 2))); + rH2 = sqrt(1.0 / (pow(dH2.x, 2) + pow(dH2.y, 2) + pow(dH2.z, 2))); + r6O = r2O * r2O * r2O; + r2O = 1.0 / r2O; + rO = 
sqrt(r2O); + r2H1 = rH1 * rH1; + r2H2 = rH2 * rH2; + + // Reset potential + dvO = 0; + dvH1 = 0; + dvH2 = 0; + + for (int state = 0; state < n_lambdas; state++) { + qa_type = D_qatypes[qi + n_qatoms * state]; + qi_type = D_qcatypes[qa_type.code - 1]; + + ai_aii = qi_type.Ai; + ai_bii = qi_type.Bi; + + V_a = ai_aii * A_O / (r6O * r6O); + V_b = ai_bii * B_O / (r6O); + + VelO = D_topo.coulomb_constant * crg_ow * D_qcharges[qi + n_qatoms * state].q * rO; + VelH1 = D_topo.coulomb_constant * crg_hw * D_qcharges[qi + n_qatoms * state].q * rH1; + VelH2 = D_topo.coulomb_constant * crg_hw * D_qcharges[qi + n_qatoms * state].q * rH2; + + dvO += r2O * (-VelO - (12 * V_a - 6 * V_b)) * D_lambdas[state]; + dvH1 -= r2H1 * VelH1 * D_lambdas[state]; + dvH2 -= r2H2 * VelH2 * D_lambdas[state]; + + // Update Q totals + Ecoul_S[row][state * BLOCK_SIZE + column] += (VelO + VelH1 + VelH2); + Evdw_S[row][state * BLOCK_SIZE + column] += (V_a - V_b); + } + + // Note r6O is not the usual 1/rO^6, but rather rO^6. be careful!!! + + // Update forces on Q-atom + (*qw).Q.x -= (dvO * dO.x + dvH1 * dH1.x + dvH2 * dH2.x); + (*qw).Q.y -= (dvO * dO.y + dvH1 * dH1.y + dvH2 * dH2.y); + (*qw).Q.z -= (dvO * dO.z + dvH1 * dH1.z + dvH2 * dH2.z); + + // Update forces on water + (*qw).O.x += dvO * dO.x; + (*qw).O.y += dvO * dO.y; + (*qw).O.z += dvO * dO.z; + (*qw).H1.x += dvH1 * dH1.x; + (*qw).H1.y += dvH1 * dH1.y; + (*qw).H1.z += dvH1 * dH1.z; + (*qw).H2.x += dvH2 * dH2.x; + (*qw).H2.y += dvH2 * dH2.y; + (*qw).H2.z += dvH2 * dH2.z; +} + +__global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, double crg_ow, double crg_hw, double A_O, double B_O, + coord_t* X, coord_t* W, double* Evdw, double* Ecoul, calc_qw_t* MAT, + q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, q_atom_t* D_qatoms, double* D_lambdas, topo_t D_topo) { + // Block index + int bx = blockIdx.x; + int by = blockIdx.y; + + // Thread index + int tx = threadIdx.x; + int ty = threadIdx.y; + + // TODO implement >2 states on GPU + __shared__ double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE]; + __shared__ double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE]; + + Ecoul_S[ty][tx] = 0; + Evdw_S[ty][tx] = 0; + Ecoul_S[ty][tx + BLOCK_SIZE] = 0; + Evdw_S[ty][tx + BLOCK_SIZE] = 0; + + int aStart = BLOCK_SIZE * by; + int bStart = 3 * BLOCK_SIZE * bx; + + if (aStart + ty >= n_qatoms) return; + if (bStart + 3 * tx >= 3 * n_waters) return; + + __shared__ coord_t Qs[BLOCK_SIZE]; + __shared__ coord_t Ws[3 * BLOCK_SIZE]; + + if (tx == 0) { + Qs[ty] = X[D_qatoms[aStart + ty].a - 1]; + } + + if (ty == 0) { + Ws[3 * tx] = W[bStart + 3 * tx]; + Ws[3 * tx + 1] = W[bStart + 3 * tx + 1]; + Ws[3 * tx + 2] = W[bStart + 3 * tx + 2]; + } + + __syncthreads(); + + calc_qw_t qw; + memset(&qw, 0, sizeof(calc_qw_t)); + + int row = by * BLOCK_SIZE + ty; + int column = bx * BLOCK_SIZE + tx; + + calc_qw_dvel_matrix_incr(ty, aStart + ty, tx, n_lambdas, n_qatoms, crg_ow, crg_hw, A_O, B_O, Qs, Ws, Evdw_S, Ecoul_S, + &qw, D_qcatypes, D_qatypes, D_qcharges, D_qatoms, D_lambdas, D_topo); + + MAT[column + n_waters * row] = qw; + + __syncthreads(); + + int rowlen = (n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE; + int collen = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE; + + if (tx == 0 && ty == 0) { + // TODO implement >2 states on GPU + for (int state = 0; state < min(2, n_lambdas); state++) { + double tot_Evdw = 0; + double tot_Ecoul = 0; + for (int i = 0; i < BLOCK_SIZE; i++) { + for (int j = 0; j < BLOCK_SIZE; j++) { + tot_Evdw += Evdw_S[i][j + state * BLOCK_SIZE]; + tot_Ecoul += Ecoul_S[i][j + 
state * BLOCK_SIZE]; + } + } + Evdw[rowlen * collen * state + rowlen * by + bx] = tot_Evdw; + Ecoul[rowlen * collen * state + rowlen * by + bx] = tot_Ecoul; + } + } + + __syncthreads(); +} +} // namespace CudaNonbondedQWForce + +void calc_nonbonded_qw_forces_host_v2() { + using namespace CudaNonbondedQWForce; + + int mem_size_X = n_atoms_solute * sizeof(coord_t); + int mem_size_W = 3 * n_waters * sizeof(coord_t); + int mem_size_DV_X = n_atoms_solute * sizeof(dvel_t); + int mem_size_DV_W = 3 * n_waters * sizeof(dvel_t); + int mem_size_MAT = 3 * n_waters * n_qatoms * sizeof(calc_qw_t); + + int n_blocks_q = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE; + int n_blocks_w = (n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE; + // TODO make Evdw & Ecoul work for # of states > 2 + int mem_size_QW_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double); + int mem_size_QW_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double); + + CudaContext& ctx = CudaContext::instance(); + auto X = ctx.d_coords; + auto DV_X = ctx.d_dvelocities; + auto D_qcatypes = ctx.d_q_catypes; + auto D_qatypes = ctx.d_q_atypes; + auto D_qcharges = ctx.d_q_charges; + auto D_qatoms = ctx.d_q_atoms; + auto D_lambdas = ctx.d_lambdas; + + dim3 threads, grid; + + threads = dim3(BLOCK_SIZE, BLOCK_SIZE); + grid = dim3((n_waters + BLOCK_SIZE - 1) / threads.x, (n_qatoms + BLOCK_SIZE - 1) / threads.y); + + calc_qw_dvel_matrix<<<grid, threads>>>(n_qatoms, n_waters, n_lambdas, crg_ow, crg_hw, A_O, B_O, X, X + n_atoms_solute, D_QW_Evdw, D_QW_Ecoul, + QW_MAT, D_qcatypes, D_qatypes, D_qcharges, D_qatoms, D_lambdas, topo); + calc_qw_dvel_vector_column<<<((n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_waters, DV_X, DV_X + n_atoms_solute, QW_MAT); + calc_qw_dvel_vector_row<<<((n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_waters, DV_X, DV_X + n_atoms_solute, QW_MAT, D_qatoms); + +#ifdef DEBUG + cudaMemcpy(h_QW_MAT, QW_MAT, mem_size_MAT, cudaMemcpyDeviceToHost); +#endif + +#ifdef DEBUG + for (int i = 0; i < n_qatoms; i++) { + for (int j = 0; j < 3 * n_waters; j++) { + if (h_QW_MAT[3 * i * n_waters + j].Q.x > 100) + printf("QW_MAT[%d][%d].Q = %f %f %f\n", i, j, h_QW_MAT[i * 3 * n_waters + j].Q.x, h_QW_MAT[i * 3 * n_waters + j].Q.y, h_QW_MAT[i * 3 * n_waters + j].Q.z); + } + } +#endif + + cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost); + cudaMemcpy(&dvelocities[n_atoms_solute], DV_X + n_atoms_solute, mem_size_DV_W, cudaMemcpyDeviceToHost); + + // TODO make Evdw & Ecoul work for # of states > 2 + for (int state = 0; state < min(2, n_lambdas); state++) { + calc_energy_sum<<<1, threads>>>(n_blocks_q, n_blocks_w, D_QW_evdw_TOT, D_QW_ecoul_TOT, &D_QW_Evdw[state * n_blocks_w * n_blocks_q], + &D_QW_Ecoul[state * n_blocks_w * n_blocks_q], false); + + cudaMemcpy(&QW_evdw_TOT, D_QW_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(&QW_ecoul_TOT, D_QW_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost); + + EQ_nonbond_qw[state].Uvdw += QW_evdw_TOT; + EQ_nonbond_qw[state].Ucoul += QW_ecoul_TOT; + } +} + +void init_nonbonded_qw_force_kernel_data() { + using namespace CudaNonbondedQWForce; + if (!is_initialized) { + catype_t catype_ow; // Atom type of first O atom + + catype_ow = catypes[atypes[n_atoms_solute].code - 1]; + + A_O = catype_ow.aii_normal; + B_O = catype_ow.bii_normal; + + int n_blocks_q = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE; + int n_blocks_w = (n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE; + int mem_size_MAT = 3 * n_waters * n_qatoms *
sizeof(calc_qw_t); + + int mem_size_QW_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double); + int mem_size_QW_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double); +#ifdef DEBUG + printf("Allocating QW_MAT\n"); +#endif + check_cudaMalloc((void**)&QW_MAT, mem_size_MAT); + +#ifdef DEBUG + printf("Allocating D_QW_Evdw\n"); +#endif + check_cudaMalloc((void**)&D_QW_Evdw, mem_size_QW_Evdw); +#ifdef DEBUG + printf("Allocating D_QW_Ecoul\n"); +#endif + check_cudaMalloc((void**)&D_QW_Ecoul, mem_size_QW_Ecoul); + + check_cudaMalloc((void**)&D_QW_evdw_TOT, sizeof(double)); + check_cudaMalloc((void**)&D_QW_ecoul_TOT, sizeof(double)); + + h_QW_Evdw = (double*)malloc(mem_size_QW_Evdw); + h_QW_Ecoul = (double*)malloc(mem_size_QW_Ecoul); + + h_QW_MAT = (calc_qw_t*)malloc(mem_size_MAT); + is_initialized = true; + } +} + +void cleanup_nonbonded_qw_force() { + using namespace CudaNonbondedQWForce; + + if (is_initialized) { + cudaFree(QW_MAT); + cudaFree(D_QW_Evdw); + cudaFree(D_QW_Ecoul); + cudaFree(D_QW_evdw_TOT); + cudaFree(D_QW_ecoul_TOT); + + free(h_QW_Evdw); + free(h_QW_Ecoul); + free(h_QW_MAT); + + is_initialized = false; + } +} \ No newline at end of file diff --git a/src/core/cuda/src/CudaNonbondedWWForce.cu b/src/core/cuda/src/CudaNonbondedWWForce.cu new file mode 100644 index 00000000..84b6652c --- /dev/null +++ b/src/core/cuda/src/CudaNonbondedWWForce.cu @@ -0,0 +1,321 @@ +#include "cuda/include/CudaContext.cuh" +#include "cuda/include/CudaNonbondedWWForce.cuh" +namespace CudaNonbondedWWForce { + +bool is_initialized = false; +double *D_WW_evdw_TOT, *D_WW_ecoul_TOT, WW_evdw_TOT, WW_ecoul_TOT; + +__device__ __forceinline__ void calculate_unforce_bound( + const int y, const int x, const coord_t& q, const coord_t& p, + const topo_t& D_topo, const double& crg_ow, const double& crg_hw, + const double& A_OO, const double& B_OO, double& evdw, double& ecoul, + double& dv, double& tmpx, double& tmpy, double& tmpz) { + int belong_y = y / 3; + int belong_x = x / 3; + if (belong_y == belong_x) { + return; + } + + bool y_is_o = (y % 3 == 0); + bool x_is_o = (x % 3 == 0); + + // Compute distance components + tmpx = p.x - q.x; + tmpy = p.y - q.y; + tmpz = p.z - q.z; + // double inv_dis = 1.0 / sqrt(pow(tmpx, 2) + pow(tmpy, 2) + pow(tmpz, 2)); + double inv_dis = rsqrt(tmpx * tmpx + tmpy * tmpy + tmpz * tmpz); + double inv_dis2 = inv_dis * inv_dis; + + ecoul = inv_dis * D_topo.coulomb_constant * (y_is_o ? crg_ow : crg_hw) * + (x_is_o ? 
crg_ow : crg_hw); + double v_a = 0, v_b = 0; + if (y_is_o && x_is_o) { + double inv_dis6 = inv_dis2 * inv_dis2 * inv_dis2; + double inv_dis12 = inv_dis6 * inv_dis6; + v_a = A_OO * inv_dis12; + v_b = B_OO * inv_dis6; + evdw = v_a - v_b; + dv = inv_dis * inv_dis * (-ecoul - 12.0 * v_a + 6.0 * v_b); + } else { + dv = inv_dis * inv_dis * -ecoul; + } +} + +template <int Thread_x, int Thread_y, int Block_x, int Block_y> +__global__ void +calc_ww(const int N, const double crg_ow, const double crg_hw, + const double A_OO, const double B_OO, const topo_t D_topo, + coord_t* __restrict__ W, dvel_t* __restrict__ DV_W, + double* __restrict__ Evdw_TOT, double* __restrict__ ecoul_TOT) { + // Calculate block boundaries + int NX = N; + int NY = (N + 1) / 2; + int x_cal_num = blockDim.x * Block_x; + int y_cal_num = blockDim.y * Block_y; + int block_x_left_begin = 1 + blockIdx.x * x_cal_num; + int block_y_left_begin = blockIdx.y * y_cal_num; + int block_x_left_end = min(block_x_left_begin + x_cal_num - 1, NX - 1); + int block_y_left_end = min(block_y_left_begin + y_cal_num - 1, NY - 1); + + // Shared memory declarations with padding to avoid bank conflicts + __shared__ coord_t p[2 * Thread_x * Block_x + 1]; + __shared__ coord_t q[2 * Thread_y * Block_y + 1]; + __shared__ double sum_row_x[2 * Thread_y * Block_y + 1]; + __shared__ double sum_row_y[2 * Thread_y * Block_y + 1]; + __shared__ double sum_row_z[2 * Thread_y * Block_y + 1]; + + __shared__ double block_ecoul[(Thread_x * Thread_y + 31) / 32], + block_evdw[(Thread_x * Thread_y + 31) / 32]; + + // Thread indices + int thread_y_left_begin = block_y_left_begin + threadIdx.y * Block_y; + int thread_x_left_begin = block_x_left_begin + threadIdx.x * Block_x; + int cur_thread_num = blockDim.x * threadIdx.y + threadIdx.x; + int thread_num = blockDim.x * blockDim.y; + +// Optimized coordinate loading with coalesced memory access +#pragma unroll + for (int i = cur_thread_num; i < x_cal_num && block_x_left_begin + i < NX; i += thread_num) { + p[i] = W[block_x_left_begin + i]; + p[x_cal_num + i] = W[N - (block_x_left_begin + i)]; + } + +#pragma unroll + for (int i = cur_thread_num; i < y_cal_num && block_y_left_begin + i < NY; i += thread_num) { + q[i] = W[block_y_left_begin + i]; + q[y_cal_num + i] = W[N - 1 - (block_y_left_begin + i)]; + } + +// Initialize sum arrays +#pragma unroll + for (int i = cur_thread_num; i < y_cal_num && block_y_left_begin + i < NY; i += thread_num) { + sum_row_x[i] = sum_row_x[y_cal_num + i] = 0; + sum_row_y[i] = sum_row_y[y_cal_num + i] = 0; + sum_row_z[i] = sum_row_z[y_cal_num + i] = 0; + } + __syncthreads(); + // Initialize column sums + double sum_col_x[2 * Block_x], sum_col_y[2 * Block_x], sum_col_z[2 * Block_x]; +#pragma unroll + for (int i = 0; i < Block_x; i++) { + sum_col_x[i] = sum_col_x[Block_x + i] = 0; + sum_col_y[i] = sum_col_y[Block_x + i] = 0; + sum_col_z[i] = sum_col_z[Block_x + i] = 0; + } + + // Main computation loop with reduced thread divergence + double evdw_tot = 0, ecoul_tot = 0; +#pragma unroll + for (int i = 0; i < Block_x; i++) { + int i2 = i; + int x = thread_x_left_begin + i2; + if (x >= NX) continue; + + int offset_x = x - block_x_left_begin; + const coord_t& now_p0 = p[offset_x]; + const coord_t& now_p1 = p[x_cal_num + offset_x]; + +#pragma unroll + for (int j = 0; j < Block_y; j++) { + int j2 = (j + threadIdx.x) % Block_y; + int y = thread_y_left_begin + j2; + if (y >= NY) continue; + + // Optimized condition check + if (y >= x && (N % 2 == 0 || y != NY - 1)) { + int offset_y = y - block_y_left_begin; + double evdw = 0, ecoul = 0, dv = 0; + double tmpx = 0,
tmpy = 0, tmpz = 0; + + int y2 = N - 1 - y; + int x2 = N - x; + calculate_unforce_bound(y2, x2, q[y_cal_num + offset_y], now_p1, D_topo, + crg_ow, crg_hw, A_OO, B_OO, evdw, ecoul, dv, + tmpx, tmpy, tmpz); + + evdw_tot += evdw; + ecoul_tot += ecoul; + double v_x = dv * tmpx; + double v_y = dv * tmpy; + double v_z = dv * tmpz; + + sum_col_x[Block_x + i2] += v_x; + sum_col_y[Block_x + i2] += v_y; + sum_col_z[Block_x + i2] += v_z; + sum_row_x[y_cal_num + offset_y] -= v_x; + sum_row_y[y_cal_num + offset_y] -= v_y; + sum_row_z[y_cal_num + offset_y] -= v_z; + } else if (y < x) { + int offset_y = y - block_y_left_begin; + double evdw = 0, ecoul = 0, dv = 0; + double tmpx = 0, tmpy = 0, tmpz = 0; + + calculate_unforce_bound(y, x, q[offset_y], now_p0, D_topo, crg_ow, + crg_hw, A_OO, B_OO, evdw, ecoul, dv, tmpx, tmpy, + tmpz); + + evdw_tot += evdw; + ecoul_tot += ecoul; + double v_x = dv * tmpx; + double v_y = dv * tmpy; + double v_z = dv * tmpz; + + sum_col_x[i2] += v_x; + sum_col_y[i2] += v_y; + sum_col_z[i2] += v_z; + sum_row_x[offset_y] -= v_x; + sum_row_y[offset_y] -= v_y; + sum_row_z[offset_y] -= v_z; + } + } + } + +// Optimized reduction using warp-level primitives +#pragma unroll + for (unsigned w = 16; w >= 1; w /= 2) { + ecoul_tot += __shfl_down_sync(0xffffffff, ecoul_tot, w); + evdw_tot += __shfl_down_sync(0xffffffff, evdw_tot, w); + } + + // Store block results + int warp_id = cur_thread_num / warpSize; + int lane_id = cur_thread_num % warpSize; + if (lane_id == 0) { + block_evdw[warp_id] = evdw_tot; + block_ecoul[warp_id] = ecoul_tot; + } + __syncthreads(); + + // Final reduction + if (warp_id == 0) { + double val1 = (lane_id < (Thread_x * Thread_y + 31) / 32) ? block_ecoul[lane_id] : 0; + double val2 = (lane_id < (Thread_x * Thread_y + 31) / 32) ? 
block_evdw[lane_id] : 0; + +#pragma unroll + for (unsigned w = 16; w >= 1; w /= 2) { + val1 += __shfl_down_sync(0xffffffff, val1, w); + val2 += __shfl_down_sync(0xffffffff, val2, w); + } + + if (lane_id == 0) { + atomicAdd(ecoul_TOT, val1); + atomicAdd(Evdw_TOT, val2); + } + } + +// Optimized row reduction +#pragma unroll + for (int i = cur_thread_num; i < y_cal_num && block_y_left_begin + i < NY; i += thread_num) { + int idx = block_y_left_begin + i; + if (idx < block_x_left_end) { + atomicAdd(&DV_W[idx].x, sum_row_x[i]); + atomicAdd(&DV_W[idx].y, sum_row_y[i]); + atomicAdd(&DV_W[idx].z, sum_row_z[i]); + } + if (idx >= block_x_left_begin) { + idx = N - 1 - idx; + atomicAdd(&DV_W[idx].x, sum_row_x[i + y_cal_num]); + atomicAdd(&DV_W[idx].y, sum_row_y[i + y_cal_num]); + atomicAdd(&DV_W[idx].z, sum_row_z[i + y_cal_num]); + } + } + +// Optimized column reduction +#pragma unroll + for (int i = 0; i < Block_x; i++) { + int i2 = (i + threadIdx.y) % Block_x; + int idx = thread_x_left_begin + i2; + if (idx < N) { + if (block_y_left_begin < idx) { + atomicAdd(&DV_W[idx].x, sum_col_x[i2]); + atomicAdd(&DV_W[idx].y, sum_col_y[i2]); + atomicAdd(&DV_W[idx].z, sum_col_z[i2]); + } + if (block_y_left_end >= idx) { + atomicAdd(&DV_W[N - idx].x, sum_col_x[i2 + Block_x]); + atomicAdd(&DV_W[N - idx].y, sum_col_y[i2 + Block_x]); + atomicAdd(&DV_W[N - idx].z, sum_col_z[i2 + Block_x]); + } + } + } +} +} // namespace CudaNonbondedWWForce +void calc_nonbonded_ww_forces_host_v2() { + using namespace CudaNonbondedWWForce; + int N = 3 * n_waters; + int mem_size_W = N * sizeof(coord_t); + int mem_size_DV_W = N * sizeof(dvel_t); + + WW_evdw_TOT = 0; + WW_ecoul_TOT = 0; + cudaMemcpy(D_WW_evdw_TOT, &WW_evdw_TOT, sizeof(double), + cudaMemcpyHostToDevice); + cudaMemcpy(D_WW_ecoul_TOT, &WW_ecoul_TOT, sizeof(double), + cudaMemcpyHostToDevice); + + CudaContext& ctx = CudaContext::instance(); + auto W = ctx.d_coords + n_atoms_solute; + auto DV_W = ctx.d_dvelocities + n_atoms_solute; + + // Optimize thread block configuration + const int thread_num_x = 32; // Keep at 32 for better warp utilization + const int thread_num_y = 1; // Keep at 1 for better memory coalescing + dim3 block_sz = dim3(thread_num_x, thread_num_y); + + const int N_ITERATION_Y = 32; // Keep at 32 for better memory access pattern + const int N_ITERATION_X = 1; // Keep at 1 for better memory coalescing + + int block_element_sz_x = N_ITERATION_X * block_sz.x; + int block_element_sz_y = N_ITERATION_Y * block_sz.y; + + int grid_sz_x = (N - 1 + block_element_sz_x - 1) / (block_element_sz_x); + int grid_sz_y = ((N + 1) / 2 + block_element_sz_y - 1) / block_element_sz_y; + dim3 grid = dim3(grid_sz_x, grid_sz_y); + + calc_ww<thread_num_x, thread_num_y, N_ITERATION_X, N_ITERATION_Y> + <<<grid, block_sz>>>(N, crg_ow, crg_hw, A_OO, B_OO, topo, W, DV_W, + D_WW_evdw_TOT, D_WW_ecoul_TOT); + + cudaDeviceSynchronize(); + cudaMemcpy(&dvelocities[n_atoms_solute], DV_W, mem_size_DV_W, + cudaMemcpyDeviceToHost); + cudaMemcpy(&WW_evdw_TOT, D_WW_evdw_TOT, sizeof(double), + cudaMemcpyDeviceToHost); + cudaMemcpy(&WW_ecoul_TOT, D_WW_ecoul_TOT, sizeof(double), + cudaMemcpyDeviceToHost); + E_nonbond_ww.Uvdw += WW_evdw_TOT; + E_nonbond_ww.Ucoul += WW_ecoul_TOT; +}
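The two __shfl_down_sync loops above are a standard warp-level tree reduction: after five halving steps lane 0 of each warp holds that warp's partial sum, and warp 0 folds the per-warp partials once more before the single atomicAdd per block. The same primitive in isolation (a minimal sketch, not part of the patch; assumes a full 32-lane warp, hence the 0xffffffff mask):

__device__ double warp_reduce_sum(double v) {
    // Butterfly reduction over one warp; the result is valid in lane 0.
    for (unsigned w = 16; w >= 1; w /= 2)
        v += __shfl_down_sync(0xffffffff, v, w);
    return v;
}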
 + +void init_nonbonded_ww_force_kernel_data() { + using namespace CudaNonbondedWWForce; + if (!is_initialized) { + catype_t catype_ow; // Atom type of first water O atom + ccharge_t ccharge_ow, ccharge_hw; // Charges of first water O and H atoms + + catype_ow = catypes[atypes[n_atoms_solute].code - 1]; + ccharge_ow = ccharges[charges[n_atoms_solute].code - 1]; + ccharge_hw = ccharges[charges[n_atoms_solute + 1].code - 1]; + + A_OO = pow(catype_ow.aii_normal, 2); + B_OO = pow(catype_ow.bii_normal, 2); + + crg_ow = ccharge_ow.charge; + crg_hw = ccharge_hw.charge; + + check_cudaMalloc((void**)&D_WW_evdw_TOT, sizeof(double)); + check_cudaMalloc((void**)&D_WW_ecoul_TOT, sizeof(double)); + is_initialized = true; + } +} + +void cleanup_nonbonded_ww_force() { + using namespace CudaNonbondedWWForce; + if (is_initialized) { + cudaFree(D_WW_evdw_TOT); + cudaFree(D_WW_ecoul_TOT); + is_initialized = false; + } +} \ No newline at end of file
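calc_ww only loops y over [0, (N + 1) / 2) and recovers the other half of the N x N pair matrix by index folding (y2 = N - 1 - y, x2 = N - x), skipping the fold for the middle row when N is odd. A host-side sketch (hypothetical, not in the patch) that checks this enumeration visits every unordered pair exactly once; same-molecule pairs are filtered separately inside calculate_unforce_bound:

#include <cassert>
#include <vector>

void check_pair_folding(int N) {
    int NY = (N + 1) / 2;
    std::vector<int> hits(N * N, 0);
    for (int x = 1; x < N; x++)
        for (int y = 0; y < NY; y++) {
            if (y >= x && (N % 2 == 0 || y != NY - 1))
                hits[(N - 1 - y) * N + (N - x)]++;  // folded upper half
            else if (y < x)
                hits[y * N + x]++;                  // direct lower half
        }
    for (int i = 0; i < N; i++)
        for (int j = i + 1; j < N; j++)
            assert(hits[i * N + j] == 1);           // each pair counted once
}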
diff --git a/src/core/cuda/src/CudaPolxWaterForce.cu b/src/core/cuda/src/CudaPolxWaterForce.cu index ea4d280b..53546714 100644 --- a/src/core/cuda/src/CudaPolxWaterForce.cu +++ b/src/core/cuda/src/CudaPolxWaterForce.cu @@ -1,26 +1,24 @@ #include +#include "cuda/include/CudaContext.cuh" #include "cuda/include/CudaPolxWaterForce.cuh" #include "utils.h" namespace CudaPolxWaterForce { bool is_initialized = false; -shell_t* d_wshells = nullptr; -int* d_list_sh = nullptr; // use 1d array to simulate 2d array -coord_t* d_coords = nullptr; -dvel_t* d_velocities = nullptr; -double* d_theta = nullptr; -double* d_theta0 = nullptr; -double* d_tdum = nullptr; -double* d_energy = nullptr; - -int *d_water_shell = nullptr; -int *d_water_rank = nullptr; // in host int* water_shell = nullptr; int* water_rank = nullptr; -int* polx_list_sh = nullptr; // use 1d array to simulate 2d array +int* polx_list_sh = nullptr; // use 1d array to simulate 2d array + +double* d_energy; +int* d_list_sh = nullptr; +double* d_theta = nullptr; +double* d_theta0 = nullptr; +double* d_tdum = nullptr; +int* d_water_shell = nullptr; +int* d_water_rank = nullptr; } // namespace CudaPolxWaterForce @@ -88,7 +86,7 @@ __global__ void calc_polx_water_forces_kernel( int il = water_rank[idx]; int is = water_shell[idx]; - if (is < 0) return; // water not in any shell this step + if (is < 0) return; // water not in any shell this step int wi, ii; coord_t rmu, rcu, f1O, f1H1, f1H2, f2; @@ -180,7 +178,6 @@ void sort_waters() { for (int jl = 0; jl < wshells[is].n_inshell; jl++) { // printf("Searching water %d in shell %d, total number: %d\n", jl, is, wshells[is].n_inshell); jw = polx_list_sh[jl * n_shells + is]; - // printf("zzzzzzzzz %d\n", jw); // printf("Water %d in shell %d has theta %f\n", jw, is, tdum[jw] * 180 / M_PI); if (tdum[jw] < tmin) { jmin = jw; @@ -197,6 +194,8 @@ void sort_waters() { } void calc_polx_water_forces_host(int iteration) { + CudaContext& ctx = CudaContext::instance(); + for (int is = 0; is < n_shells; is++) { wshells[is].n_inshell = 0; if (iteration == 0) { @@ -204,31 +203,14 @@ } } using namespace CudaPolxWaterForce; - if (!is_initialized) { - check_cudaMalloc((void**)&d_wshells, n_shells * sizeof(shell_t)); - // todo: change the dimension - check_cudaMalloc((void**)&d_list_sh, n_max_inshell * n_shells * sizeof(int)); - - check_cudaMalloc((void**)&d_coords, n_atoms * sizeof(coord_t)); - check_cudaMalloc((void**)&d_velocities, n_atoms * sizeof(dvel_t)); - check_cudaMalloc((void**)&d_theta, n_waters * sizeof(double)); - check_cudaMalloc((void**)&d_theta0, n_waters * sizeof(double)); - check_cudaMalloc((void**)&d_tdum, n_waters * sizeof(double)); - check_cudaMalloc((void**)&d_energy, sizeof(double)); - check_cudaMalloc((void**)&d_water_rank, n_waters * sizeof(int)); - check_cudaMalloc((void**)&d_water_shell, n_waters * sizeof(int)); - water_rank = new int[n_waters]; - water_shell = new int[n_waters]; - polx_list_sh = new int[n_max_inshell * n_shells]; + // ctx.sync_all_to_device(); + cudaMemcpy(ctx.d_wshells, wshells, n_shells * sizeof(shell_t), cudaMemcpyHostToDevice); - is_initialized = true; - } - cudaMemcpy(d_wshells, wshells, n_shells * sizeof(shell_t), cudaMemcpyHostToDevice); + coord_t* d_coords = ctx.d_coords; + dvel_t* d_dvelocities = ctx.d_dvelocities; + shell_t* d_wshells = ctx.d_wshells; - cudaMemset(d_energy, 0, sizeof(double)); - cudaMemcpy(d_coords, coords, n_atoms * sizeof(coord_t), cudaMemcpyHostToDevice); - cudaMemcpy(d_velocities, dvelocities, n_atoms * sizeof(dvel_t), cudaMemcpyHostToDevice); int blockSize = 256; int numBlocks = (n_waters + blockSize - 1) / blockSize; // printf("Calculated theta for %d waters in %d shells\n", n_waters, n_shells); @@ -270,38 +252,51 @@ } // Calculate energy and force - - check_cudaMalloc((void**)&d_energy, sizeof(double)); + cudaMemset(d_energy, 0, sizeof(double)); calc_polx_water_forces_kernel<<<numBlocks, blockSize>>>( - n_waters, n_atoms_solute, d_wshells, d_coords, d_velocities, topo, + n_waters, n_atoms_solute, d_wshells, d_coords, d_dvelocities, topo, d_theta, md, d_energy, d_water_rank, d_water_shell); double energy; cudaMemcpy(&energy, d_energy, sizeof(double), cudaMemcpyDeviceToHost); E_restraint.Upolx += energy; cudaMemcpy(wshells, d_wshells, n_shells * sizeof(shell_t), cudaMemcpyDeviceToHost); // Copy back forces for all atoms (solute + solvent); water forces were being dropped. - cudaMemcpy(dvelocities, d_velocities, n_atoms * sizeof(dvel_t), cudaMemcpyDeviceToHost); + cudaMemcpy(dvelocities, d_dvelocities, n_atoms * sizeof(dvel_t), cudaMemcpyDeviceToHost); } -void cleanup_polx_water_force( +void init_polx_water_force_kernel_data() { + using namespace CudaPolxWaterForce; + if (!is_initialized) { + water_rank = new int[n_waters]; + water_shell = new int[n_waters]; + polx_list_sh = new int[n_max_inshell * n_shells]; + + check_cudaMalloc((void**)&d_energy, sizeof(double)); + check_cudaMalloc((void**)&d_list_sh, n_max_inshell * n_shells * sizeof(int)); + check_cudaMalloc((void**)&d_theta, n_waters * sizeof(double)); + check_cudaMalloc((void**)&d_theta0, n_waters * sizeof(double)); + check_cudaMalloc((void**)&d_tdum, n_waters * sizeof(double)); + check_cudaMalloc((void**)&d_water_rank, n_waters * sizeof(int)); + check_cudaMalloc((void**)&d_water_shell, n_waters * sizeof(int)); -) { + is_initialized = true; + } +} + +void cleanup_polx_water_force() { + using namespace CudaPolxWaterForce; if (is_initialized) { - cudaFree(d_wshells); + delete[] water_rank; + delete[] water_shell; + delete[] polx_list_sh; + + cudaFree(d_energy); cudaFree(d_list_sh); - cudaFree(d_coords); - cudaFree(d_velocities); cudaFree(d_theta); cudaFree(d_theta0); cudaFree(d_tdum); - cudaFree(d_energy); cudaFree(d_water_rank); cudaFree(d_water_shell); - - delete[] water_rank; - delete[] water_shell; - is_initialized = false; } }
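The polx wrapper above uses the accumulator pattern shared by most of these hosts: zero a device-side double, let the kernel atomicAdd into it, copy it back, and fold it into the host energy record (E_restraint.Upolx here). The same pattern in isolation (a sketch with hypothetical names; atomicAdd on double requires compute capability 6.0 or newer):

__global__ void sum_kernel(const double* in, int n, double* acc) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) atomicAdd(acc, in[i]);
}

double sum_on_device(const double* d_in, int n) {
    double* d_acc = nullptr;
    cudaMalloc((void**)&d_acc, sizeof(double));
    cudaMemset(d_acc, 0, sizeof(double));  // reset before every launch
    int blockSize = 256;
    int numBlocks = (n + blockSize - 1) / blockSize;
    sum_kernel<<<numBlocks, blockSize>>>(d_in, n, d_acc);
    double acc = 0;
    cudaMemcpy(&acc, d_acc, sizeof(double), cudaMemcpyDeviceToHost);
    cudaFree(d_acc);
    return acc;
}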
diff --git a/src/core/cuda/src/CudaPshellForce.cu b/src/core/cuda/src/CudaPshellForce.cu index c17ed4ab..61c63bfb 100644 --- a/src/core/cuda/src/CudaPshellForce.cu +++ b/src/core/cuda/src/CudaPshellForce.cu @@ -1,16 +1,13 @@ +#include "cuda/include/CudaContext.cuh" #include "cuda/include/CudaPshellForce.cuh" #include "utils.h" +#include namespace CudaPshellForce { bool is_initialized = false; -bool* d_shell; -bool* d_excluded; -coord_t* d_coords; -coord_t* d_coords_top; double* d_ufix_energy; double* d_ushell_energy; -dvel_t* d_dvelocities; -} // namespace CudaPshellForce +} // namespace CudaPshellForce __global__ void calc_pshell_force_kernel( int n_atoms_solute, bool* shell, @@ -50,27 +47,18 @@ } void calc_pshell_forces_host() { + CudaContext& ctx = CudaContext::instance(); using namespace CudaPshellForce; - if (!is_initialized) { - check_cudaMalloc((void**)&d_shell, sizeof(bool) * n_atoms); - check_cudaMalloc((void**)&d_excluded, sizeof(bool) * n_atoms); - check_cudaMalloc((void**)&d_coords, sizeof(coord_t) * n_atoms); - check_cudaMalloc((void**)&d_coords_top, sizeof(coord_t) * n_atoms); - check_cudaMalloc((void**)&d_ufix_energy, sizeof(double)); - check_cudaMalloc((void**)&d_ushell_energy, sizeof(double)); - check_cudaMalloc((void**)&d_dvelocities, sizeof(dvel_t) * n_atoms); - cudaMemcpy(d_shell, shell, sizeof(bool) * n_atoms, cudaMemcpyHostToDevice); - cudaMemcpy(d_excluded, excluded, sizeof(bool) * n_atoms, cudaMemcpyHostToDevice); - is_initialized = true; - } - - cudaMemcpy(d_coords, coords, sizeof(coord_t) * n_atoms, cudaMemcpyHostToDevice); - cudaMemcpy(d_coords_top, coords_top, sizeof(coord_t) * n_atoms, cudaMemcpyHostToDevice); - cudaMemcpy(d_dvelocities, dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyHostToDevice); + auto d_shell = ctx.d_shell; + auto d_excluded = ctx.d_excluded; + auto d_coords = ctx.d_coords; + auto d_coords_top = ctx.d_coords_top; + auto d_dvelocities = ctx.d_dvelocities; cudaMemset(d_ufix_energy, 0, sizeof(double)); cudaMemset(d_ushell_energy, 0, sizeof(double)); + int blockSize = 256; int numBlocks = (n_atoms_solute + blockSize - 1) / blockSize; calc_pshell_force_kernel<<<numBlocks, blockSize>>>( @@ -89,22 +77,25 @@ void calc_pshell_forces_host() { cudaMemcpy(&ushell_energy, d_ushell_energy, sizeof(double), cudaMemcpyDeviceToHost); cudaMemcpy(dvelocities, d_dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyDeviceToHost); - E_restraint.Ufix += ufix_energy; E_restraint.Ushell += ushell_energy; + // ctx.sync_all_to_host(); +} + +void init_pshell_force_kernel_data() { + using namespace CudaPshellForce; + if (!is_initialized) { + check_cudaMalloc((void**)&d_ufix_energy, sizeof(double)); + check_cudaMalloc((void**)&d_ushell_energy, sizeof(double)); + is_initialized = true; + } } void cleanup_pshell_force() { using namespace CudaPshellForce; if (is_initialized) { - cudaFree(d_shell); - cudaFree(d_excluded); - cudaFree(d_coords); - cudaFree(d_coords_top); cudaFree(d_ufix_energy); cudaFree(d_ushell_energy); - cudaFree(d_dvelocities); is_initialized = false; } - } \ No newline at end of file
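After the refactor each force file owns only its private scratch buffers; everything shared comes from CudaContext. The init/cleanup convention the files now follow, reduced to a minimal sketch (illustrative names; check_cudaMalloc as used throughout this diff):

namespace ExampleForce {
bool is_initialized = false;
double* d_energy = nullptr;
}  // namespace ExampleForce

void init_example_force_kernel_data() {
    using namespace ExampleForce;
    if (!is_initialized) {  // guarded, so callers may invoke it every step
        check_cudaMalloc((void**)&d_energy, sizeof(double));
        is_initialized = true;
    }
}

void cleanup_example_force() {
    using namespace ExampleForce;
    if (is_initialized) {  // guarded, so a second call is a no-op
        cudaFree(d_energy);
        d_energy = nullptr;
        is_initialized = false;
    }
}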
diff --git a/src/core/cuda/src/CudaRadixWaterForce.cu b/src/core/cuda/src/CudaRadixWaterForce.cu index 92fb93f7..26406dc4 100644 --- a/src/core/cuda/src/CudaRadixWaterForce.cu +++ b/src/core/cuda/src/CudaRadixWaterForce.cu @@ -1,25 +1,24 @@ #include +#include "cuda/include/CudaContext.cuh" #include "cuda/include/CudaRadixWaterForce.cuh" #include "cuda/include/CudaUtility.cuh" #include "utils.h" - namespace CudaRadixWaterForce { bool is_initialized = false; -coord_t* d_coords = nullptr; -dvel_t* d_dvelocities = nullptr; -double* d_energy = nullptr; +double* d_energy; } // namespace CudaRadixWaterForce + __global__ void calc_radix_water_forces_kernel( - coord_t* coords, - double shift, + coord_t* coords, + double shift, int n_atoms_solute, int n_atoms, topo_t topo, md_t md, double Dwmz, double awmz, - dvel_t* dvelocities, + dvel_t* dvelocities, double* energy) { int i = blockIdx.x * blockDim.x + threadIdx.x; i = n_atoms_solute + i * 3; // Process only oxygen atoms of water molecules @@ -60,6 +59,7 @@ void calc_radix_water_forces_host() { if (water_atoms == 0) { return; } + using namespace CudaRadixWaterForce; int blockSize = 256; if (water_atoms % 3 != 0) { throw std::runtime_error("Number of water atoms is not a multiple of 3"); @@ -67,16 +67,11 @@ } int oxygen_atoms = water_atoms / 3; int numBlocks = (oxygen_atoms + blockSize - 1) / blockSize; - using namespace CudaRadixWaterForce; - if (!is_initialized) { - check_cudaMalloc((void**)&d_coords, sizeof(coord_t) * n_atoms); - check_cudaMalloc((void**)&d_dvelocities, sizeof(dvel_t) * n_atoms); - check_cudaMalloc((void**)&d_energy, sizeof(double)); - is_initialized = true; - } + CudaContext& ctx = CudaContext::instance(); + // ctx.sync_all_to_device(); - cudaMemcpy(d_coords, coords, sizeof(coord_t) * n_atoms, cudaMemcpyHostToDevice); - cudaMemcpy(d_dvelocities, dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyHostToDevice); + auto d_coords = ctx.d_coords; + auto d_dvelocities = ctx.d_dvelocities; double energy = 0.0; cudaMemcpy(d_energy, &energy, sizeof(double), cudaMemcpyHostToDevice); @@ -86,28 +81,34 @@ } else { shift = 0; } - calc_radix_water_forces_kernel<<<numBlocks, blockSize>>>(d_coords, - shift, - n_atoms_solute, - n_atoms, - topo, - md, - Dwmz, - awmz, - d_dvelocities, - d_energy); + calc_radix_water_forces_kernel<<<numBlocks, blockSize>>>(d_coords, + shift, + n_atoms_solute, + n_atoms, + topo, + md, + Dwmz, + awmz, + d_dvelocities, + d_energy); cudaDeviceSynchronize(); cudaMemcpy(dvelocities, d_dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyDeviceToHost); cudaMemcpy(&energy, d_energy, sizeof(double), cudaMemcpyDeviceToHost); E_restraint.Uradx += energy; } +void init_radix_water_force_kernel_data() { + using namespace CudaRadixWaterForce; + if (!is_initialized) { + check_cudaMalloc((void**)&d_energy, sizeof(double)); + is_initialized = true; + } +} + void cleanup_radix_water_force() { using namespace CudaRadixWaterForce; if (is_initialized) { - cudaFree(d_coords); - cudaFree(d_dvelocities); cudaFree(d_energy); is_initialized = false; } -} +} \ No newline at end of file
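calc_radix_water_forces_kernel maps one thread to each water molecule and strides by three to land on its oxygen, since waters are stored O, H1, H2 after the solute block. The same mapping in a self-contained example (hypothetical kernel, not in the patch):

__global__ void count_outer_oxygens(int n_atoms_solute, int n_waters,
                                    const coord_t* coords, double r2_cut,
                                    int* n_outside) {
    int t = blockIdx.x * blockDim.x + threadIdx.x;
    if (t >= n_waters) return;
    int i = n_atoms_solute + 3 * t;  // O of water t; H1 = i + 1, H2 = i + 2
    coord_t c = coords[i];
    if (c.x * c.x + c.y * c.y + c.z * c.z > r2_cut)
        atomicAdd(n_outside, 1);
}

It would be launched, like the real kernel, with numBlocks = (n_waters + blockSize - 1) / blockSize.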
diff --git a/src/core/cuda/src/CudaRestrangForce.cu b/src/core/cuda/src/CudaRestrangForce.cu index fd568687..310336e5 100644 --- a/src/core/cuda/src/CudaRestrangForce.cu +++ b/src/core/cuda/src/CudaRestrangForce.cu @@ -1,17 +1,11 @@ +#include "cuda/include/CudaContext.cuh" #include "cuda/include/CudaRestrangForce.cuh" #include "cuda/include/CudaUtility.cuh" #include "utils.h" - -namespace CudaRestrangeForce { +namespace CudaRestrangForce { bool is_initialized = false; -restrang_t* d_restrangs = nullptr; -coord_t* d_coords = nullptr; -double* d_lambdas = nullptr; -dvel_t* d_velocities = nullptr; -E_restraint_t* d_EQ_restraint = nullptr; -double* d_E_restraint = nullptr; -restrdis_t* d_restrdists = nullptr; -} // namespace CudaRestrangeForce +double* d_E_restraint; +} // namespace CudaRestrangForce __global__ void calc_restrang_force_kernel( restrang_t* restrangs, @@ -86,7 +80,6 @@ __global__ void calc_restrang_force_kernel( dk.y = f1 * (dr.y / (rij * rjk) - cos_th * dr2.y / r2jk); dk.z = f1 * (dr.z / (rij * rjk) - cos_th * dr2.z / r2jk); - atomicAdd(&dvelocities[i].x, dv * di.x); atomicAdd(&dvelocities[i].y, dv * di.y); atomicAdd(&dvelocities[i].z, dv * di.z); @@ -110,26 +103,20 @@ } void calc_restrang_force_host() { - using namespace CudaRestrangeForce; - if (!is_initialized) { - check_cudaMalloc((void**)&d_restrangs, sizeof(restrang_t) * n_restrangs); - check_cudaMalloc((void**)&d_coords, sizeof(coord_t) * n_atoms); - check_cudaMalloc((void**)&d_lambdas, sizeof(double) * n_lambdas); - check_cudaMalloc((void**)&d_velocities, sizeof(dvel_t) * n_atoms); - check_cudaMalloc((void**)&d_EQ_restraint, sizeof(E_restraint_t) * n_lambdas); - check_cudaMalloc((void**)&d_E_restraint, sizeof(double)); - check_cudaMalloc((void**)&d_restrdists, sizeof(restrdis_t) * n_restrdists); + if (n_restrangs == 0) return; + using namespace CudaRestrangForce; + CudaContext& ctx = CudaContext::instance(); + + auto d_restrangs = ctx.d_restrangs; + auto d_coords = ctx.d_coords; + auto d_lambdas = ctx.d_lambdas; + auto d_dvelocities = ctx.d_dvelocities; + auto d_EQ_restraint = ctx.d_EQ_restraint; + auto d_restrdists = ctx.d_restrdists; - cudaMemcpy(d_restrangs, restrangs, sizeof(restrang_t) * n_restrangs, cudaMemcpyHostToDevice); - cudaMemcpy(d_lambdas, lambdas, sizeof(double) * n_lambdas, cudaMemcpyHostToDevice); - cudaMemcpy(d_restrdists, restrdists, sizeof(restrdis_t) * n_restrdists, cudaMemcpyHostToDevice); - is_initialized = true; - } - cudaMemcpy(d_coords, coords, sizeof(coord_t) * n_atoms, cudaMemcpyHostToDevice); - cudaMemcpy(d_velocities, dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyHostToDevice); - cudaMemcpy(d_EQ_restraint, EQ_restraint, sizeof(E_restraint_t) * n_lambdas, cudaMemcpyHostToDevice); double val = 0; cudaMemcpy(d_E_restraint, &val, sizeof(double), cudaMemcpyHostToDevice); + int blockSize = 256; int numBlocks = (n_restrangs + blockSize - 1) / blockSize; calc_restrang_force_kernel<<<numBlocks, blockSize>>>( @@ -139,28 +126,30 @@ void calc_restrang_force_host() { n_atoms, d_lambdas, n_lambdas, - d_velocities, + d_dvelocities, d_EQ_restraint, d_E_restraint, d_restrdists); cudaDeviceSynchronize(); cudaMemcpy(coords, d_coords, sizeof(coord_t) * n_atoms, cudaMemcpyDeviceToHost); - cudaMemcpy(dvelocities, d_velocities, sizeof(dvel_t) * n_atoms, cudaMemcpyDeviceToHost); + cudaMemcpy(dvelocities, d_dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyDeviceToHost); cudaMemcpy(EQ_restraint, d_EQ_restraint, sizeof(E_restraint_t) * n_lambdas, cudaMemcpyDeviceToHost); cudaMemcpy(&val, d_E_restraint, sizeof(double), cudaMemcpyDeviceToHost); E_restraint.Urestr += val; } +void init_restrang_force_kernel_data() { + using namespace CudaRestrangForce; + if (!is_initialized) { + check_cudaMalloc((void**)&d_E_restraint, sizeof(double)); + is_initialized = true; + } +} + void cleanup_restrang_force() { - using namespace CudaRestrangeForce; + using namespace CudaRestrangForce; if (is_initialized) { - cudaFree(d_restrangs); - cudaFree(d_coords); - cudaFree(d_lambdas); - cudaFree(d_velocities); - cudaFree(d_EQ_restraint); cudaFree(d_E_restraint); - cudaFree(d_restrdists); is_initialized = false; } } \ No newline at end of file
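Several restraints can touch the same atom, which is why calc_restrang_force_kernel accumulates into dvelocities with atomicAdd instead of plain stores. The per-atom scatter, factored out as a sketch (double-precision atomicAdd needs sm_60 or newer):

__device__ void scatter_force(dvel_t* dvel, int atom,
                              double fx, double fy, double fz) {
    atomicAdd(&dvel[atom].x, fx);
    atomicAdd(&dvel[atom].y, fy);
    atomicAdd(&dvel[atom].z, fz);
}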
diff --git a/src/core/cuda/src/CudaRestrdisForce.cu b/src/core/cuda/src/CudaRestrdisForce.cu index b95ee4f5..af92e6d4 100644 --- a/src/core/cuda/src/CudaRestrdisForce.cu +++ b/src/core/cuda/src/CudaRestrdisForce.cu @@ -1,17 +1,12 @@ +#include + +#include "cuda/include/CudaContext.cuh" #include "cuda/include/CudaRestrdisForce.cuh" #include "cuda/include/CudaUtility.cuh" #include "utils.h" -#include - namespace CudaRestrdisForce { bool is_initialized = false; -restrdis_t* d_restrdists = nullptr; -coord_t* d_coords = nullptr; -double* d_lambdas = nullptr; -dvel_t* d_dvelocities = nullptr; -E_restraint_t* d_EQ_restraint = nullptr; -double* d_E_restraint = nullptr; - +double* d_E_restraint; } // namespace CudaRestrdisForce __global__ void calc_restrdis_forces_kernel( @@ -80,22 +75,15 @@ void calc_restrdis_forces_host() { using namespace CudaRestrdisForce; - if (!is_initialized) { - check_cudaMalloc((void**)&d_restrdists, n_restrdists * sizeof(restrdis_t)); - check_cudaMalloc((void**)&d_coords, n_atoms * sizeof(coord_t)); - check_cudaMalloc((void**)&d_lambdas, n_lambdas * sizeof(double)); - check_cudaMalloc((void**)&d_dvelocities, n_atoms * sizeof(dvel_t)); - check_cudaMalloc((void**)&d_EQ_restraint, sizeof(E_restraint_t) * n_lambdas); - check_cudaMalloc((void**)&d_E_restraint, sizeof(double) * n_lambdas); - - cudaMemcpy(d_restrdists, restrdists, n_restrdists * sizeof(restrdis_t), cudaMemcpyHostToDevice); - cudaMemcpy(d_lambdas, lambdas, n_lambdas * sizeof(double), cudaMemcpyHostToDevice); - is_initialized = true; - } - cudaMemcpy(d_coords, coords, n_atoms * sizeof(coord_t), cudaMemcpyHostToDevice); - cudaMemcpy(d_dvelocities, dvelocities, n_atoms * sizeof(dvel_t), cudaMemcpyHostToDevice); - cudaMemcpy(d_EQ_restraint, EQ_restraint, sizeof(E_restraint_t) * n_lambdas, cudaMemcpyHostToDevice); - cudaMemset(d_E_restraint, 0, sizeof(double) * n_lambdas); + CudaContext& ctx = CudaContext::instance(); + auto d_restrdists = ctx.d_restrdists; + auto d_coords = ctx.d_coords; + auto d_lambdas = ctx.d_lambdas; + auto d_dvelocities = ctx.d_dvelocities; + auto d_EQ_restraint = ctx.d_EQ_restraint; + + cudaMemset(d_E_restraint, 0, sizeof(double)); + int blockSize = 256; int numBlocks = (n_restrdists + blockSize - 1) / blockSize; calc_restrdis_forces_kernel<<<numBlocks, blockSize>>>( @@ -116,15 +104,18 @@ void calc_restrdis_forces_host() { E_restraint.Urestr += ener; } +void init_restrdis_force_kernel_data() { + using namespace CudaRestrdisForce; + if (!is_initialized) { + check_cudaMalloc((void**)&d_E_restraint, sizeof(double)); + is_initialized = true; + } +} + void cleanup_restrdis_force() { using namespace CudaRestrdisForce; if (is_initialized) { - cudaFree(d_restrdists); - cudaFree(d_coords); - cudaFree(d_lambdas); - cudaFree(d_dvelocities); - cudaFree(d_EQ_restraint); cudaFree(d_E_restraint); is_initialized = false; } -} +} \ No newline at end of file
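These wrappers still spell out raw cudaMemcpy calls for every transfer. With the templated helpers declared on CudaContext, the same transfers could be written more uniformly; an illustrative usage, not applied in this patch:

CudaContext& ctx = CudaContext::instance();
ctx.sync_array_to_device(ctx.d_lambdas, lambdas, n_lambdas);          // host -> device
ctx.sync_array_to_host(EQ_restraint, ctx.d_EQ_restraint, n_lambdas);  // device -> host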
-    is_initialized = true;
-  }
-  cudaMemcpy(d_coords, coords, sizeof(coord_t) * n_atoms, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_dvelocities, dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_EQ_restraint, EQ_restraint, sizeof(E_restraint_t) * n_lambdas, cudaMemcpyHostToDevice);
   double val = 0.0;
   cudaMemcpy(d_E_restraint, &val, sizeof(double), cudaMemcpyHostToDevice);
+  CudaContext& ctx = CudaContext::instance();
+  auto d_restrspos = ctx.d_restrspos;
+  auto d_coords = ctx.d_coords;
+  auto d_lambdas = ctx.d_lambdas;
+  auto d_EQ_restraint = ctx.d_EQ_restraint;
+  auto d_dvelocities = ctx.d_dvelocities;
+
   int blockSize = 256;
   int numBlocks = (n_restrspos + blockSize - 1) / blockSize;
   calc_restrpos_forces_kernel<<<numBlocks, blockSize>>>(
-    d_restrpos,
+    d_restrspos,
     n_restrspos,
     d_coords,
     d_lambdas,
@@ -101,15 +91,19 @@ void calc_restrpos_forces_host() {
   E_restraint.Urestr += val;
   cudaMemcpy(EQ_restraint, d_EQ_restraint, sizeof(E_restraint_t) * n_lambdas, cudaMemcpyDeviceToHost);
 }
+
+void init_restrpos_force_kernel_data() {
+  using namespace CudaRestrposForce;
+  if (!is_initialized) {
+    check_cudaMalloc((void**)&d_E_restraint, sizeof(double));
+    is_initialized = true;
+  }
+}
+
 void cleanup_restrpos_force() {
   using namespace CudaRestrposForce;
   if (is_initialized) {
-    cudaFree(d_restrpos);
-    cudaFree(d_coords);
-    cudaFree(d_lambdas);
-    cudaFree(d_EQ_restraint);
     cudaFree(d_E_restraint);
-    cudaFree(d_dvelocities);
     is_initialized = false;
   }
-}
+}
\ No newline at end of file
diff --git a/src/core/cuda/src/CudaRestrseqForce.cu b/src/core/cuda/src/CudaRestrseqForce.cu
index 4e167357..ff805df7 100644
--- a/src/core/cuda/src/CudaRestrseqForce.cu
+++ b/src/core/cuda/src/CudaRestrseqForce.cu
@@ -1,20 +1,12 @@
+#include "cuda/include/CudaContext.cuh"
 #include "cuda/include/CudaRestrseqForce.cuh"
-#include "utils.h"
 #include "iostream"
+#include "utils.h"

 namespace CudaRestrseqForce {
 bool is_initialized = false;
-restrseq_t* d_restrseq = nullptr;
-coord_t* d_coords = nullptr;
-coord_t* d_coords_top = nullptr;
-double* d_upres_energy = nullptr;
-atype_t* d_atypes = nullptr;
-catype_t* d_catypes = nullptr;
-bool* d_heavy = nullptr;
-dvel_t* d_velocities = nullptr;
-
+double* d_upres_energy;
 }  // namespace CudaRestrseqForce
-
 __global__ void calc_restrseq_forces_kernel(
   int n_restrseqs,
   restrseq_t* restrseqs,
@@ -112,7 +104,7 @@ __global__ void calc_restrseq_forces_kernel(
       r2 = pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2);
       ener = .5 * k * r2;
       atomicAdd(upres_energy, ener);
-
+
       atomicAdd(&dvelocities[i].x, k * dr.x);
       atomicAdd(&dvelocities[i].y, k * dr.y);
       atomicAdd(&dvelocities[i].z, k * dr.z);
@@ -123,28 +115,17 @@ __global__ void calc_restrseq_forces_kernel(

 void calc_restrseq_forces_host() {
   using namespace CudaRestrseqForce;
-  if (!is_initialized) {
-    check_cudaMalloc((void**)&d_restrseq, sizeof(restrseq_t) * n_restrseqs);
-    check_cudaMalloc((void**)&d_coords, sizeof(coord_t) * n_atoms);
-    check_cudaMalloc((void**)&d_coords_top, sizeof(coord_t) * n_atoms);
-    check_cudaMalloc((void**)&d_upres_energy, sizeof(double));
-    check_cudaMalloc((void**)&d_atypes, sizeof(atype_t) * n_atypes);
-    check_cudaMalloc((void**)&d_catypes, sizeof(catype_t) * n_catypes);
-    check_cudaMalloc((void**)&d_heavy, sizeof(bool) * n_atoms);
-    check_cudaMalloc((void**)&d_velocities, sizeof(dvel_t) * n_atoms);
-
-    cudaMemcpy(d_restrseq, restrseqs, sizeof(restrseq_t) * n_restrseqs, cudaMemcpyHostToDevice);
-    cudaMemcpy(d_atypes, atypes, sizeof(atype_t) * n_atypes, cudaMemcpyHostToDevice);
-    cudaMemcpy(d_catypes, catypes, sizeof(catype_t) * n_catypes, cudaMemcpyHostToDevice);
-    cudaMemcpy(d_heavy, heavy, sizeof(bool) * n_atoms, cudaMemcpyHostToDevice);
-
-    is_initialized = true;
-  }
-
-  cudaMemcpy(d_coords, coords, sizeof(coord_t) * n_atoms, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_coords_top, coords_top, sizeof(coord_t) * n_atoms, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_velocities, dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyHostToDevice);
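+  // Borrow the persistent device buffers from the shared context instead of
+  // re-allocating and re-uploading them on every call.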
+  CudaContext& ctx = CudaContext::instance();
+  auto d_restrseq = ctx.d_restrseqs;
+  auto d_coords = ctx.d_coords;
+  auto d_coords_top = ctx.d_coords_top;
+  auto d_atypes = ctx.d_atypes;
+  auto d_catypes = ctx.d_catypes;
+  auto d_heavy = ctx.d_heavy;
+  auto d_dvelocities = ctx.d_dvelocities;
   cudaMemset(d_upres_energy, 0, sizeof(double));
+  // ctx.sync_all_to_device();
+
   int blockSize = 256;
   int numBlocks = (n_restrseqs + blockSize - 1) / blockSize;
   calc_restrseq_forces_kernel<<<numBlocks, blockSize>>>(
@@ -155,28 +136,28 @@ void calc_restrseq_forces_host() {
     d_atypes,
     d_catypes,
     d_heavy,
-    d_velocities,
-    d_upres_energy
-  );
+    d_dvelocities,
+    d_upres_energy);
   cudaDeviceSynchronize();

   double upres_energy;
   cudaMemcpy(&upres_energy, d_upres_energy, sizeof(double), cudaMemcpyDeviceToHost);
   E_restraint.Upres = upres_energy;
   printf("Restrseq U_upres: %f\n", upres_energy);
-  cudaMemcpy(dvelocities, d_velocities, sizeof(dvel_t) * n_atoms, cudaMemcpyDeviceToHost);
+  cudaMemcpy(dvelocities, d_dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyDeviceToHost);
+}
+
+void init_restrseq_force_kernel_data() {
+  using namespace CudaRestrseqForce;
+  if (!is_initialized) {
+    check_cudaMalloc((void**)&d_upres_energy, sizeof(double));
+    is_initialized = true;
+  }
 }

 void cleanup_restrseq_force() {
   using namespace CudaRestrseqForce;
   if (is_initialized) {
-    cudaFree(d_restrseq);
-    cudaFree(d_coords);
-    cudaFree(d_coords_top);
     cudaFree(d_upres_energy);
-    cudaFree(d_atypes);
-    cudaFree(d_catypes);
-    cudaFree(d_heavy);
-    cudaFree(d_velocities);
     is_initialized = false;
   }
 }
diff --git a/src/core/cuda/src/CudaRestrwallForce.cu b/src/core/cuda/src/CudaRestrwallForce.cu
index 08c187f3..d26ce34c 100644
--- a/src/core/cuda/src/CudaRestrwallForce.cu
+++ b/src/core/cuda/src/CudaRestrwallForce.cu
@@ -1,15 +1,12 @@
+#include
+
+#include "cuda/include/CudaContext.cuh"
 #include "cuda/include/CudaRestrwallForce.cuh"
 #include "utils.h"
-#include

 namespace CudaRestrwallForce {
 bool is_initialized = false;
-restrwall_t* d_restrwalls = nullptr;
-coord_t* d_coords = nullptr;
-
-double* d_energies = nullptr;
-dvel_t* d_dvelocities = nullptr;
-bool* d_heavy = nullptr;
+double* d_energies;
 }  // namespace CudaRestrwallForce

 __global__ void calc_restrwall_forces_kernel(
@@ -54,21 +51,15 @@ __global__ void calc_restrwall_forces_kernel(
 }

 void calc_restrwall_forces_host() {
+  if (n_restrwalls == 0) return;
   using namespace CudaRestrwallForce;
-  if (!is_initialized) {
-    check_cudaMalloc((void**)&d_restrwalls, sizeof(restrwall_t) * n_restrwalls);
-    check_cudaMalloc((void**)&d_coords, sizeof(coord_t) * n_atoms);
-    check_cudaMalloc((void**)&d_energies, sizeof(double));
-    check_cudaMalloc((void**)&d_dvelocities, sizeof(dvel_t) * n_atoms);
-    check_cudaMalloc((void**)&d_heavy, sizeof(bool) * n_atoms);
+  CudaContext& ctx = CudaContext::instance();
+  auto d_restrwalls = ctx.d_restrwalls;
+  auto d_coords = ctx.d_coords;
+  auto d_dvelocities = ctx.d_dvelocities;
+  auto d_heavy = ctx.d_heavy;
+  cudaMemset(d_energies, 0, sizeof(double));

-    cudaMemcpy(d_restrwalls, restrwalls, sizeof(restrwall_t) * n_restrwalls, cudaMemcpyHostToDevice);
-    cudaMemcpy(d_heavy, heavy, sizeof(bool) * n_atoms, cudaMemcpyHostToDevice);
-    is_initialized = true;
-  }
-  cudaMemcpy(d_coords, coords, sizeof(coord_t) * n_atoms, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_energies, 0, sizeof(double), cudaMemcpyHostToDevice);
-  cudaMemcpy(d_dvelocities, dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyHostToDevice);
   int blockSize = 256;
   int numBlocks = (n_restrwalls + blockSize - 1) / blockSize;
   calc_restrwall_forces_kernel<<<numBlocks, blockSize>>>(
@@ -84,14 +75,19 @@ void calc_restrwall_forces_host() {
   printf("Restrwall energy: %f\n", h_energy);
   E_restraint.Upres += h_energy;
 }
+
+void init_restrwall_force_kernel_data() {
+  using namespace CudaRestrwallForce;
+  if (!is_initialized) {
+    check_cudaMalloc((void**)&d_energies, sizeof(double));
+    is_initialized = true;
+  }
+}
+
 void cleanup_restrwall_force() {
   using namespace CudaRestrwallForce;
   if (is_initialized) {
-    cudaFree(d_restrwalls);
-    cudaFree(d_coords);
     cudaFree(d_energies);
-    cudaFree(d_dvelocities);
-    cudaFree(d_heavy);
     is_initialized = false;
   }
-}
+}
\ No newline at end of file
diff --git a/src/core/cuda/src/CudaShakeConstraints.cu b/src/core/cuda/src/CudaShakeConstraints.cu
index 799291dc..1c446735 100644
--- a/src/core/cuda/src/CudaShakeConstraints.cu
+++ b/src/core/cuda/src/CudaShakeConstraints.cu
@@ -1,17 +1,14 @@
 #include

+#include "cuda/include/CudaContext.cuh"
 #include "cuda/include/CudaShakeConstraints.cuh"
 #include "utils.h"

 namespace CudaShakeConstraints {
+
 bool is_initialized = false;
-int* d_mol_n_shakes = nullptr;
-shake_bond_t* d_shake_bonds = nullptr;
-coord_t* d_coords = nullptr;
-coord_t* d_xcoords = nullptr;
-double* d_winv = nullptr;
-int* d_total_iterations = nullptr;
-int* d_mol_shake_offset = nullptr;
+int* d_mol_shake_offset;
+int* d_total_iterations;
 }  // namespace CudaShakeConstraints

 __global__ void calc_shake_constraints_kernel(
@@ -99,41 +96,48 @@ __global__ void calc_shake_constraints_kernel(
   }
 }

-int calc_shake_constraints_host() {
+void init_shake_constraints_kernel_data() {
   using namespace CudaShakeConstraints;
-  if (!is_initialized) {
-    check_cudaMalloc((void**)&d_mol_n_shakes, sizeof(int) * n_molecules);
-    check_cudaMalloc((void**)&d_shake_bonds, sizeof(shake_bond_t) * n_shake_constraints);
-    check_cudaMalloc((void**)&d_coords, sizeof(coord_t) * n_atoms);
-    check_cudaMalloc((void**)&d_xcoords, sizeof(coord_t) * n_atoms);
-    check_cudaMalloc((void**)&d_winv, sizeof(double) * n_atoms);
-    check_cudaMalloc((void**)&d_total_iterations, sizeof(int));
-    check_cudaMalloc((void**)&d_mol_shake_offset, sizeof(int) * n_molecules);
-
-    cudaMemcpy(d_mol_n_shakes, mol_n_shakes, sizeof(int) * n_molecules, cudaMemcpyHostToDevice);
-    cudaMemcpy(d_shake_bonds, shake_bonds, sizeof(shake_bond_t) * n_shake_constraints, cudaMemcpyHostToDevice);
-    cudaMemcpy(d_winv, winv, sizeof(double) * n_atoms, cudaMemcpyHostToDevice);
-
   int* mol_shake_offset_host = (int*)malloc(sizeof(int) * n_molecules);
   mol_shake_offset_host[0] = 0;
   for (int i = 1; i < n_molecules; i++) {
     mol_shake_offset_host[i] = mol_shake_offset_host[i - 1] + mol_n_shakes[i - 1];
   }
+  check_cudaMalloc((void**)&d_mol_shake_offset, sizeof(int) * n_molecules);  // derived indexing data, built on the host and uploaded once at startup
   cudaMemcpy(d_mol_shake_offset, mol_shake_offset_host, sizeof(int) * n_molecules, cudaMemcpyHostToDevice);
   free(mol_shake_offset_host);
+  check_cudaMalloc((void**)&d_total_iterations, sizeof(int));
+  is_initialized = true;
-  }
+}
+
+void cleanup_shake_constraints() {
+  using namespace CudaShakeConstraints;
+  if (is_initialized) {
+    cudaFree(d_mol_shake_offset);
+    cudaFree(d_total_iterations);
+    is_initialized = false;
+  }
+}

-  cudaMemcpy(d_coords, coords, sizeof(coord_t) * n_atoms, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_xcoords, xcoords, sizeof(coord_t) * n_atoms, cudaMemcpyHostToDevice);
+int calc_shake_constraints_host() {
+  using namespace CudaShakeConstraints;
   int total_iterations_host = 0;
   cudaMemcpy(d_total_iterations, &total_iterations_host, sizeof(int), cudaMemcpyHostToDevice);

   int blocks = n_molecules;
   int threads = 32;
+
+  CudaContext& ctx = CudaContext::instance();
+  auto d_mol_n_shakes = ctx.d_mol_n_shakes;
+  auto d_shake_bonds = ctx.d_shake_bonds;
+  auto d_coords = ctx.d_coords;
+  auto d_xcoords = ctx.d_xcoords;
+  auto d_winv = ctx.d_winv;
+
   calc_shake_constraints_kernel<<<blocks, threads>>>(
     n_molecules,
     d_mol_n_shakes,
@@ -148,18 +152,3 @@ int calc_shake_constraints_host() {
   cudaMemcpy(coords, d_coords, sizeof(coord_t) * n_atoms, cudaMemcpyDeviceToHost);
   return total_iterations_host;
 }
-
-void cleanup_shake_constraints() {
-  using namespace CudaShakeConstraints;
-
-  if (is_initialized) {
-    cudaFree(d_mol_n_shakes);
-    cudaFree(d_shake_bonds);
-    cudaFree(d_coords);
-    cudaFree(d_xcoords);
-    cudaFree(d_winv);
-    cudaFree(d_total_iterations);
-    cudaFree(d_mol_shake_offset);
-    is_initialized = false;
-  }
-}
\ No newline at end of file
diff --git a/src/core/cuda/src/CudaTemperature.cu b/src/core/cuda/src/CudaTemperature.cu
index c55e1870..f28157f6 100644
--- a/src/core/cuda/src/CudaTemperature.cu
+++ b/src/core/cuda/src/CudaTemperature.cu
@@ -1,3 +1,4 @@
+#include "cuda/include/CudaContext.cuh"
 #include "cuda/include/CudaTemperature.cuh"
 #include "cuda/include/CudaUtility.cuh"
 #include "iostream"
@@ -5,10 +6,6 @@
 namespace CudaTemperature {
 bool is_initialized = false;
-atype_t* d_atypes;
-catype_t* d_catypes;
-vel_t* d_velocities;
-bool* d_excluded;
 double* d_Temp_solute;
 double* d_Tfree_solute;
 double* d_Texcl_solute;
@@ -49,26 +46,6 @@ __global__ void calc_temperature_kernel(int n_atoms, int n_atoms_solute, atype_t

 void calc_temperature_host() {
   printf("Ndegf = %f, Ndegfree = %f, n_excluded = %d, Ndegfree_solvent = %f, Ndegfree_solute = %f\n", Ndegf, Ndegfree, n_excluded, Ndegfree_solvent, Ndegfree_solute);
   using namespace CudaTemperature;
-  if (!is_initialized) {
-    check_cudaMalloc((void**)&d_atypes, sizeof(atype_t) * n_atypes);
-    check_cudaMalloc((void**)&d_catypes, sizeof(catype_t) * n_catypes);
-    check_cudaMalloc((void**)&d_velocities, sizeof(vel_t) * n_atoms);
-    check_cudaMalloc((void**)&d_excluded, sizeof(bool) * n_atoms);
-    check_cudaMalloc((void**)&d_Temp_solute, sizeof(double));
-    check_cudaMalloc((void**)&d_Tfree_solute, sizeof(double));
-    check_cudaMalloc((void**)&d_Texcl_solute, sizeof(double));
-    check_cudaMalloc((void**)&d_Temp_solvent, sizeof(double));
-    check_cudaMalloc((void**)&d_Tfree_solvent, sizeof(double));
-    check_cudaMalloc((void**)&d_Texcl_solvent, sizeof(double));
-
-    cudaMemcpy(d_atypes, atypes, sizeof(atype_t) * n_atypes, cudaMemcpyHostToDevice);
-    cudaMemcpy(d_catypes, catypes, sizeof(catype_t) * n_catypes, cudaMemcpyHostToDevice);
-    cudaMemcpy(d_excluded, excluded, sizeof(bool) * n_atoms, cudaMemcpyHostToDevice);
-
-    is_initialized = true;
-  }
-
-  cudaMemcpy(d_velocities, velocities, sizeof(vel_t) * n_atoms, cudaMemcpyHostToDevice);
-
   double h_Temp_solute = 0.0, h_Tfree_solute = 0.0, h_Texcl_solute = 0.0, h_Temp_solvent = 0.0, h_Tfree_solvent = 0.0, h_Texcl_solvent = 0.0;
   cudaMemcpy(d_Temp_solute, &h_Temp_solute, sizeof(double), cudaMemcpyHostToDevice);
@@ -78,6 +55,11 @@ void calc_temperature_host() {
   cudaMemcpy(d_Tfree_solvent, &h_Tfree_solvent, sizeof(double), cudaMemcpyHostToDevice);
   cudaMemcpy(d_Texcl_solvent, &h_Texcl_solvent, sizeof(double), cudaMemcpyHostToDevice);

+  CudaContext& ctx = CudaContext::instance();
+  atype_t* d_atypes = ctx.d_atypes;
+  catype_t* d_catypes = ctx.d_catypes;
+  vel_t* d_velocities = ctx.d_velocities;
+  bool* d_excluded = ctx.d_excluded;

   int blockSize = 256;
   int numBlocks = (n_atoms + blockSize - 1) / blockSize;
@@ -111,13 +93,22 @@ void calc_temperature_host() {
   printf("Tscale = %f, tau_T = %f, Temp = %f, Tfree = %f\n", Tscale_solvent, tau_T, Temp, Tfree);
 }

+void init_temperature_kernel_data() {
+  using namespace CudaTemperature;
+  if (!is_initialized) {
+    check_cudaMalloc((void**)&d_Temp_solute, sizeof(double));
+    check_cudaMalloc((void**)&d_Tfree_solute, sizeof(double));
+    check_cudaMalloc((void**)&d_Texcl_solute, sizeof(double));
+    check_cudaMalloc((void**)&d_Temp_solvent, sizeof(double));
+    check_cudaMalloc((void**)&d_Tfree_solvent, sizeof(double));
+    check_cudaMalloc((void**)&d_Texcl_solvent, sizeof(double));
+    is_initialized = true;
+  }
+}
+
 void cleanup_temperature() {
   using namespace CudaTemperature;
   if (is_initialized) {
-    cudaFree(d_atypes);
-    cudaFree(d_catypes);
-    cudaFree(d_velocities);
-    cudaFree(d_excluded);
     cudaFree(d_Temp_solute);
     cudaFree(d_Tfree_solute);
     cudaFree(d_Texcl_solute);
@@ -125,5 +116,5 @@ void cleanup_temperature() {
     cudaFree(d_Tfree_solvent);
     cudaFree(d_Texcl_solvent);
     is_initialized = false;
-  }
+  }
 }
diff --git a/src/core/cuda/src/CudaTorsionForce.cu b/src/core/cuda/src/CudaTorsionForce.cu
index 4c06adc5..6f56fedf 100644
--- a/src/core/cuda/src/CudaTorsionForce.cu
+++ b/src/core/cuda/src/CudaTorsionForce.cu
@@ -1,13 +1,10 @@
+#include "cuda/include/CudaContext.cuh"
 #include "cuda/include/CudaTorsionForce.cuh"
 #include "cuda/include/CudaUtility.cuh"
 #include "utils.h"

 namespace CudaTorsionForce {
 bool is_initialized = false;
-torsion_t* d_torsions = nullptr;
-ctorsion_t* d_ctorsions = nullptr;
-coord_t* d_coords = nullptr;
-dvel_t* d_dvelocities = nullptr;
 double* d_energy_sum = nullptr;
 }  // namespace CudaTorsionForce

@@ -133,21 +130,16 @@ double calc_torsion_forces_host(int start, int end) {
   if (N <= 0) return 0.0;
   int blockSize = 256;
   int numBlocks = (N + blockSize - 1) / blockSize;
-  if (!is_initialized) {
-    check_cudaMalloc((void**)&d_torsions, sizeof(torsion_t) * n_torsions);
-    check_cudaMalloc((void**)&d_ctorsions, sizeof(ctorsion_t) * n_ctorsions);
-    check_cudaMalloc((void**)&d_coords, sizeof(coord_t) * n_atoms);
-    check_cudaMalloc((void**)&d_dvelocities, sizeof(dvel_t) * n_atoms);
-    check_cudaMalloc((void**)&d_energy_sum, sizeof(double));
-    is_initialized = true;
-  }
-  cudaMemcpy(d_torsions, torsions, sizeof(torsion_t) * n_torsions, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_ctorsions, ctorsions, sizeof(ctorsion_t) * n_ctorsions, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_coords, coords, sizeof(coord_t) * n_atoms, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_dvelocities, dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyHostToDevice);
   double zero = 0.0;
   cudaMemcpy(d_energy_sum, &zero, sizeof(double), cudaMemcpyHostToDevice);
+
+  CudaContext& ctx = CudaContext::instance();
+  coord_t* d_coords = ctx.d_coords;
+  dvel_t* d_dvelocities = ctx.d_dvelocities;
+  torsion_t* d_torsions = ctx.d_torsions;
+  ctorsion_t* d_ctorsions = ctx.d_ctorsions;
+
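+  // d_energy_sum was zeroed above; the kernel accumulates the torsion energy
+  // for [start, end) and the total is copied back into `zero` below.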
   calc_torsion_forces_kernel<<<numBlocks, blockSize>>>(start, end, d_torsions, d_ctorsions, d_coords, d_dvelocities, d_energy_sum);
   cudaDeviceSynchronize();
   cudaMemcpy(&zero, d_energy_sum, sizeof(double), cudaMemcpyDeviceToHost);
@@ -155,13 +147,19 @@ double calc_torsion_forces_host(int start, int end) {
   return zero;
 }

+void init_torsion_force_kernel_data() {
+  using namespace CudaTorsionForce;
+  if (!is_initialized) {
+    check_cudaMalloc((void**)&d_energy_sum, sizeof(double));
+    is_initialized = true;
+  }
+}
+
 void cleanup_torsion_force() {
   using namespace CudaTorsionForce;
   if (is_initialized) {
-    cudaFree(d_torsions);
-    cudaFree(d_ctorsions);
-    cudaFree(d_coords);
-    cudaFree(d_dvelocities);
     cudaFree(d_energy_sum);
     is_initialized = false;
   }
diff --git a/src/core/system.cu b/src/core/system.cu
index 65dbc209..afb999cf 100644
--- a/src/core/system.cu
+++ b/src/core/system.cu
@@ -22,6 +22,13 @@
 #include "cuda/include/CudaPshellForce.cuh"
 #include "cuda/include/CudaRadixWaterForce.cuh"
 #include "cuda/include/CudaTemperature.cuh"
+#include "cuda/include/CudaContext.cuh"
+#include "cuda/include/CudaShakeConstraints.cuh"
+#include "cuda/include/CudaNonbondedPPForce.cuh"
+#include "cuda/include/CudaNonbondedQPForce.cuh"
+#include "cuda/include/CudaNonbondedQWForce.cuh"
+#include "cuda/include/CudaNonbondedPWForce.cuh"
+#include "cuda/include/CudaNonbondedWWForce.cuh"

 #include
 #include
@@ -1068,6 +1075,10 @@ void calc_integration_step(int iteration) {

   // Reset derivatives & energies
   reset_energies();
+  if (run_gpu) {
+    CudaContext& ctx = CudaContext::instance();
+    ctx.reset_energies();
+  }

   // Determine temperature and kinetic energy
   if (run_gpu) {
@@ -1087,10 +1098,10 @@ void calc_integration_step(int iteration) {
   clock_t start_pp, end_pp, start_qp, end_qp;
   if (run_gpu) {
     start_qp = clock();
-    calc_nonbonded_qp_forces_host();
+    calc_nonbonded_qp_forces_host_v2();
     end_qp = clock();
     start_pp = clock();
-    calc_nonbonded_pp_forces_host();
+    calc_nonbonded_pp_forces_host_v2();
     end_pp = clock();
   }
   else {
@@ -1107,12 +1118,12 @@ void calc_integration_step(int iteration) {
   if (n_waters > 0) {
     if (run_gpu) {
       start_ww = clock();
-      calc_nonbonded_ww_forces_host();
+      calc_nonbonded_ww_forces_host_v2();
       end_ww = clock();
       start_pw = clock();
-      calc_nonbonded_pw_forces_host();
+      calc_nonbonded_pw_forces_host_v2();
       end_pw = clock();
-      calc_nonbonded_qw_forces_host();
+      calc_nonbonded_qw_forces_host_v2();
     }
     else {
       start_ww = clock();
@@ -1138,6 +1149,7 @@ void calc_integration_step(int iteration) {
   if (md.polarisation) {
     if (run_gpu) {
       calc_polx_water_forces_host(iteration);
+      // calc_polx_w_forces(iteration);  // TODO: the GPU polarisable-water path still has a bug
     }
     else {
       calc_polx_w_forces(iteration);
     }
@@ -1196,7 +1208,6 @@ void calc_integration_step(int iteration) {
   }
   else {
     calc_temperature();
   }
-
   // Update total potential energies with an average of all states
   for (int state = 0; state < n_lambdas; state++) {
     if (lambdas[state] == 0) {
@@ -1304,6 +1315,30 @@
 }

+void init_cuda_kernel_data() {
+  init_angle_force_kernel_data();
+  init_bond_force_kernel_data();
+  init_improper2_force_kernel_data();
+  init_leapfrog_kernel_data();
+  init_nonbonded_pp_force_kernel_data();
+  init_nonbonded_pw_force_kernel_data();
+  init_nonbonded_qp_force_kernel_data();
+  init_nonbonded_qq_force_kernel_data();
+  init_nonbonded_qw_force_kernel_data();
+  init_nonbonded_ww_force_kernel_data();
+  init_polx_water_force_kernel_data();
+  init_pshell_force_kernel_data();
+  init_radix_water_force_kernel_data();
+  init_restrang_force_kernel_data();
+  init_restrdis_force_kernel_data();
+  init_restrpos_force_kernel_data();
+  init_restrseq_force_kernel_data();
+  init_restrwall_force_kernel_data();
+  init_shake_constraints_kernel_data();
+  init_temperature_kernel_data();
+  init_torsion_force_kernel_data();
+}
+
 void init_variables() {
   // From MD file
   init_md("md.csv");
@@ -1423,6 +1458,9 @@ void init_variables() {
   write_header("coords.csv");
   write_header("velocities.csv");
   write_energy_header();
+
+  CudaContext::instance().init();
+  init_cuda_kernel_data();
 }

 void clean_variables() {
@@ -1519,17 +1557,24 @@ void clean_variables() {
   cleanup_angle_force();
   cleanup_bond_force();
   cleanup_improper2_force();
-  cleanup_torsion_force();
+  cleanup_leapfrog();
+  cleanup_nonbonded_pp_force();
+  cleanup_nonbonded_pw_force();
+  cleanup_nonbonded_qp_force();
   cleanup_nonbonded_qq_force();
-  cleanup_restrwall_force();
+  cleanup_nonbonded_qw_force();
+  cleanup_nonbonded_ww_force();
+  cleanup_polx_water_force();
+  cleanup_pshell_force();
+  cleanup_radix_water_force();
   cleanup_restrang_force();
-  cleanup_restrpos_force();
   cleanup_restrdis_force();
+  cleanup_restrpos_force();
   cleanup_restrseq_force();
-  cleanup_pshell_force();
+  cleanup_restrwall_force();
+  cleanup_shake_constraints();
   cleanup_temperature();
-  cleanup_radix_water_force();
-  cleanup_leapfrog();
-  cleanup_polx_water_force();
+  cleanup_torsion_force();
+
 }
}
diff --git a/src/core/utils.cu b/src/core/utils.cu
index a0fe2729..4e269a86 100644
--- a/src/core/utils.cu
+++ b/src/core/utils.cu
@@ -1,3 +1,4 @@
+#include <cuda_runtime.h>
 #include
 #include
 #include
@@ -29,4 +30,11 @@ void check_cudaMalloc(void** devPtr, size_t size) {
     printf(">>> FATAL: cudaMalloc failed with error code %d: %s\n", error, cudaGetErrorString(error));
     exit(EXIT_FAILURE);
   }
-}
\ No newline at end of file
+}
+
+void check_cuda(cudaError_t status) {
+  if (status != cudaSuccess) {
+    printf(">>> FATAL: CUDA call failed with error code %d: %s\n", status, cudaGetErrorString(status));
+    exit(EXIT_FAILURE);
+  }
+}
diff --git a/src/core/utils.h b/src/core/utils.h
index 32d50900..96e224e8 100644
--- a/src/core/utils.h
+++ b/src/core/utils.h
@@ -15,6 +15,9 @@ double to_radians(double degrees);
  * =============================================
  */

+#include <cuda_runtime.h>
+
+void check_cuda(cudaError_t status);
 void check_cudaMalloc(void** devPtr, size_t size);

-#endif /* __UTILS_H__ */
\ No newline at end of file
+#endif /* __UTILS_H__ */
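Usage sketch (illustration only, not part of the patch): check_cuda is the
general-purpose companion to check_cudaMalloc and can wrap any runtime call
that returns a cudaError_t. The helper name upload_coords_example below is
hypothetical; coord_t and the fail-fast behaviour come from the sources above.

    #include <cuda_runtime.h>

    #include "system.h"
    #include "utils.h"

    // Hypothetical example: upload one coordinate array, aborting with the
    // CUDA error string on failure, exactly as check_cuda does above.
    void upload_coords_example(coord_t* d_coords, const coord_t* coords, int n_atoms) {
      check_cuda(cudaMemcpy(d_coords, coords, sizeof(coord_t) * n_atoms,
                            cudaMemcpyHostToDevice));
      check_cuda(cudaDeviceSynchronize());
    }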