diff --git a/src/core/cuda/include/CudaAngleForce.cuh b/src/core/cuda/include/CudaAngleForce.cuh index 2782255f..7659a103 100644 --- a/src/core/cuda/include/CudaAngleForce.cuh +++ b/src/core/cuda/include/CudaAngleForce.cuh @@ -1,9 +1,6 @@ -#ifndef CUDA_ANGLE_FORCE_H -#define CUDA_ANGLE_FORCE_H +#pragma once #include "system.h" -__global__ void calc_angle_forces_kernel(int start, int end, angle_t *angles, coord_t *coords, cangle_t *cangles, dvel_t *dvelocities, double *energy_sum); +void init_angle_force_kernel_data(); double calc_angle_forces_host(int start, int end); - void cleanup_angle_force(); -#endif \ No newline at end of file diff --git a/src/core/cuda/include/CudaBondForce.cuh b/src/core/cuda/include/CudaBondForce.cuh index 61f2d2b8..0e7a9ba2 100644 --- a/src/core/cuda/include/CudaBondForce.cuh +++ b/src/core/cuda/include/CudaBondForce.cuh @@ -1,9 +1,6 @@ -#ifndef CUDA_BOND_FORCE_H -#define CUDA_BOND_FORCE_H +#pragma once #include "system.h" -__global__ void calc_bond_forces_kernel(int start, int end, bond_t* bonds, coord_t *coords, cbond_t *cbonds, dvel_t *dvelocities, double *energy_sum); +void init_bond_force_kernel_data(); double calc_bond_forces_host(int start, int end); - void cleanup_bond_force(); -#endif \ No newline at end of file diff --git a/src/core/cuda/include/CudaContext.cuh b/src/core/cuda/include/CudaContext.cuh new file mode 100644 index 00000000..3c2883fa --- /dev/null +++ b/src/core/cuda/include/CudaContext.cuh @@ -0,0 +1,140 @@ +#pragma once + +#include + +#include "system.h" +#include "utils.h" + +class CudaContext { + public: + /* + Common data + */ + coord_t* d_coords = nullptr; + dvel_t* d_dvelocities = nullptr; + vel_t* d_velocities = nullptr; + + /* + Used in CudaAngleForce.cu + */ + angle_t* d_angles = nullptr; + cangle_t* d_cangles = nullptr; + + /* + Used in CudaBondForce.cu + */ + bond_t* d_bonds = nullptr; + cbond_t* d_cbonds = nullptr; + + /* + Used in CudaImproper2Force.cu + */ + improper_t* d_impropers = nullptr; + cimproper_t* d_cimpropers = nullptr; + + /* + Used in CudaLeapfrog.cu + */ + atype_t* d_atypes = nullptr; + catype_t* d_catypes = nullptr; + + /* + Used in CudaShakeConstraints.cu + */ + int* d_mol_n_shakes = nullptr; + shake_bond_t* d_shake_bonds = nullptr; + double* d_winv = nullptr; + coord_t* d_xcoords = nullptr; + + /* + Used in CudaNonbondedQQForce.cu + */ + q_atom_t* d_q_atoms = nullptr; + q_charge_t* d_q_charges = nullptr; + int* d_LJ_matrix = nullptr; + bool* d_excluded = nullptr; + q_elscale_t* d_q_elscales = nullptr; + q_catype_t* d_q_catypes = nullptr; + q_atype_t* d_q_atypes = nullptr; + E_nonbonded_t* d_EQ_nonbond_qq = nullptr; + double* d_lambdas = nullptr; + + /* + Used in CudaPolxWaterForce.cu + */ + shell_t* d_wshells = nullptr; + /* + Used in CudaPshellForce.cu + */ + bool* d_shell = nullptr; + coord_t* d_coords_top = nullptr; + + /* + Used in CudaRestrangForce.cu + */ + restrang_t* d_restrangs = nullptr; + E_restraint_t* d_EQ_restraint = nullptr; + restrdis_t* d_restrdists = nullptr; + + /* + Used in CudaRestrposForce.cu + */ + restrpos_t* d_restrspos = nullptr; + + /* + Used in CudaRestrseqForce.cu + */ + restrseq_t* d_restrseqs = nullptr; + bool* d_heavy = nullptr; + + /* + Used in CudaRestrwallForce.cu + */ + restrwall_t* d_restrwalls = nullptr; + + /* + Used in CudaTorsionForce.cu + */ + torsion_t* d_torsions = nullptr; + ctorsion_t* d_ctorsions = nullptr; + + /* + Used in CudaNonbondedPPForce.cu + */ + ccharge_t* d_ccharges; + charge_t* d_charges; + p_atom_t* d_p_atoms; + + static CudaContext& instance() { + 
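// Annotation: the function-local static below is a Meyers singleton.
// The instance is constructed on first use (thread-safe since C++11),
// and its destructor, which calls free() on every device buffer, runs
// at program exit.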
static CudaContext ctx; + return ctx; + } + void init(); + + template + void sync_array_to_device(T* dst, const T* src, int count); + + template + void sync_array_to_host(T* dst, const T* src, int count); + + void sync_all_to_device(); + void sync_all_to_host(); + void reset_energies(); + + private: + CudaContext() = default; + + void free(); + + ~CudaContext() { free(); } + CudaContext(const CudaContext&) = delete; + CudaContext& operator=(const CudaContext&) = delete; +}; +template +void CudaContext::sync_array_to_device(T* dst, const T* src, int count) { + check_cuda(cudaMemcpy(dst, src, count * sizeof(T), cudaMemcpyHostToDevice)); +} +template +void CudaContext::sync_array_to_host(T* dst, const T* src, int count) { + check_cuda(cudaMemcpy(dst, src, count * sizeof(T), cudaMemcpyDeviceToHost)); +} \ No newline at end of file diff --git a/src/core/cuda/include/CudaImproper2Force.cuh b/src/core/cuda/include/CudaImproper2Force.cuh index 97097f84..232fbf6b 100644 --- a/src/core/cuda/include/CudaImproper2Force.cuh +++ b/src/core/cuda/include/CudaImproper2Force.cuh @@ -1,11 +1,6 @@ -#ifndef CUDA_IMPROPER2_FORCE -#define CUDA_IMPROPER2_FORCE_H +#pragma once #include "system.h" -__global__ void calc_improper2_forces_kernel(int start, int end, improper_t* impropers, cimproper_t* cimpropers, coord_t* coords, dvel_t* dvelocities, double* energy_sum); - +void init_improper2_force_kernel_data(); double calc_improper2_forces_host(int start, int end); - void cleanup_improper2_force(); - -#endif \ No newline at end of file diff --git a/src/core/cuda/include/CudaLeapfrog.cuh b/src/core/cuda/include/CudaLeapfrog.cuh index ed2d47b0..c3ad57a7 100644 --- a/src/core/cuda/include/CudaLeapfrog.cuh +++ b/src/core/cuda/include/CudaLeapfrog.cuh @@ -1,8 +1,6 @@ -#ifndef CUDA_LEAPFROG_H -#define CUDA_LEAPFROG_H +#pragma once #include "system.h" +void init_leapfrog_kernel_data(); void calc_leapfrog_host(); void cleanup_leapfrog(); - -#endif \ No newline at end of file diff --git a/src/core/cuda/include/CudaNonbondedPPForce.cuh b/src/core/cuda/include/CudaNonbondedPPForce.cuh new file mode 100644 index 00000000..ea8cd22d --- /dev/null +++ b/src/core/cuda/include/CudaNonbondedPPForce.cuh @@ -0,0 +1,8 @@ +#pragma once +#include "system.h" + +void init_nonbonded_pp_force_kernel_data(); + +void calc_nonbonded_pp_forces_host_v2(); + +void cleanup_nonbonded_pp_force(); \ No newline at end of file diff --git a/src/core/cuda/include/CudaNonbondedPWForce.cuh b/src/core/cuda/include/CudaNonbondedPWForce.cuh new file mode 100644 index 00000000..dc0cb7c4 --- /dev/null +++ b/src/core/cuda/include/CudaNonbondedPWForce.cuh @@ -0,0 +1,7 @@ +#pragma once +#include "system.h" + +void init_nonbonded_pw_force_kernel_data(); +void calc_nonbonded_pw_forces_host_v2(); + +void cleanup_nonbonded_pw_force(); \ No newline at end of file diff --git a/src/core/cuda/include/CudaNonbondedQPForce.cuh b/src/core/cuda/include/CudaNonbondedQPForce.cuh new file mode 100644 index 00000000..d48acdd6 --- /dev/null +++ b/src/core/cuda/include/CudaNonbondedQPForce.cuh @@ -0,0 +1,6 @@ +#pragma once + +void init_nonbonded_qp_force_kernel_data(); +void calc_nonbonded_qp_forces_host_v2(); + +void cleanup_nonbonded_qp_force(); \ No newline at end of file diff --git a/src/core/cuda/include/CudaNonbondedQQForce.cuh b/src/core/cuda/include/CudaNonbondedQQForce.cuh index b21dbc4b..aba8a911 100644 --- a/src/core/cuda/include/CudaNonbondedQQForce.cuh +++ b/src/core/cuda/include/CudaNonbondedQQForce.cuh @@ -1,8 +1,7 @@ -#ifndef CUDA_NONBONDED_QQ_FORCE_H -#define 
CUDA_NONBONDED_QQ_FORCE_H +#pragma once #include "system.h" +void init_nonbonded_qq_force_kernel_data(); void calc_nonbonded_qq_forces_host(); -void cleanup_nonbonded_qq_force(); -#endif \ No newline at end of file +void cleanup_nonbonded_qq_force(); diff --git a/src/core/cuda/include/CudaNonbondedQWForce.cuh b/src/core/cuda/include/CudaNonbondedQWForce.cuh new file mode 100644 index 00000000..14979b6f --- /dev/null +++ b/src/core/cuda/include/CudaNonbondedQWForce.cuh @@ -0,0 +1,7 @@ +#pragma once +#include "system.h" +void init_nonbonded_qw_force_kernel_data(); + +void calc_nonbonded_qw_forces_host_v2(); + +void cleanup_nonbonded_qw_force(); \ No newline at end of file diff --git a/src/core/cuda/include/CudaNonbondedWWForce.cuh b/src/core/cuda/include/CudaNonbondedWWForce.cuh new file mode 100644 index 00000000..1aaa3f6a --- /dev/null +++ b/src/core/cuda/include/CudaNonbondedWWForce.cuh @@ -0,0 +1,7 @@ +#pragma once +#include "system.h" +void init_nonbonded_ww_force_kernel_data(); + +void calc_nonbonded_ww_forces_host_v2(); + +void cleanup_nonbonded_ww_force(); \ No newline at end of file diff --git a/src/core/cuda/include/CudaPolxWaterForce.cuh b/src/core/cuda/include/CudaPolxWaterForce.cuh index 706585b5..5b699f35 100644 --- a/src/core/cuda/include/CudaPolxWaterForce.cuh +++ b/src/core/cuda/include/CudaPolxWaterForce.cuh @@ -1,10 +1,8 @@ -#ifndef __CUDA_POLX_WATER_FORCE_CUH__ -#define __CUDA_POLX_WATER_FORCE_CUH__ - +#pragma once #include "system.h" +void init_polx_water_force_kernel_data(); + void calc_polx_water_forces_host(int iteration); void cleanup_polx_water_force(); - -#endif \ No newline at end of file diff --git a/src/core/cuda/include/CudaPshellForce.cuh b/src/core/cuda/include/CudaPshellForce.cuh index 0e2c7099..c86bfb4f 100644 --- a/src/core/cuda/include/CudaPshellForce.cuh +++ b/src/core/cuda/include/CudaPshellForce.cuh @@ -1,17 +1,7 @@ -#ifndef CUDA_PSHELL_FORCE_CUH -#define CUDA_PSHELL_FORCE_CUH +#pragma once #include "system.h" -__global__ void calc_pshell_force_kernel( - int n_atoms_solute, - bool* shell, - bool* excluded, - coord_t* coords, - coord_t* coords_top, - double* ufix_energy, - double* ushell_energy, - dvel_t* dvelocities -); +void init_pshell_force_kernel_data(); + void calc_pshell_forces_host(); -void cleanup_pshell_force(); -#endif // CUDA_PSHELL_FORCE_CUH \ No newline at end of file +void cleanup_pshell_force(); diff --git a/src/core/cuda/include/CudaRadixWaterForce.cuh b/src/core/cuda/include/CudaRadixWaterForce.cuh index 891a8861..c13330e3 100644 --- a/src/core/cuda/include/CudaRadixWaterForce.cuh +++ b/src/core/cuda/include/CudaRadixWaterForce.cuh @@ -1,22 +1,6 @@ -#ifndef CUDA_RADIX_WATER_FORCE_CU -#define CUDA_RADIX_WATER_FORCE_CU +#pragma once #include "system.h" - -__global__ void calc_radix_water_forces_kernel( - coord_t* coords, - double shift, - int n_atoms_solute, - int n_atoms, - topo_t topo, - md_t md, - double Dwmz, - double awmz, - dvel_t* dvelocities, - double* energy); - +void init_radix_water_force_kernel_data(); void calc_radix_water_forces_host(); - void cleanup_radix_water_force(); - -#endif \ No newline at end of file diff --git a/src/core/cuda/include/CudaRestrangForce.cuh b/src/core/cuda/include/CudaRestrangForce.cuh index f44f9b0c..c5c5e98d 100644 --- a/src/core/cuda/include/CudaRestrangForce.cuh +++ b/src/core/cuda/include/CudaRestrangForce.cuh @@ -1,8 +1,7 @@ -#ifndef CUDA_RESTRANG_FORCE_CUH -#define CUDA_RESTRANG_FORCE_CUH +#pragma once #include "system.h" +void init_restrang_force_kernel_data(); void 
calc_restrang_force_host(); -void cleanup_restrang_force(); -#endif // CUDA_RESTRANG_FORCE_CUH \ No newline at end of file +void cleanup_restrang_force(); diff --git a/src/core/cuda/include/CudaRestrdisForce.cuh b/src/core/cuda/include/CudaRestrdisForce.cuh index 04abe85f..09fd9350 100644 --- a/src/core/cuda/include/CudaRestrdisForce.cuh +++ b/src/core/cuda/include/CudaRestrdisForce.cuh @@ -1,10 +1,7 @@ -#ifndef CUDA_RESTRDIS_FORCE_CUH -#define CUDA_RESTRDIS_FORCE_CUH +#pragma once #include "system.h" -__global__ void calc_restrdis_forces_kernel(); +void init_restrdis_force_kernel_data(); void calc_restrdis_forces_host(); void cleanup_restrdis_force(); - -#endif \ No newline at end of file diff --git a/src/core/cuda/include/CudaRestrposForce.cuh b/src/core/cuda/include/CudaRestrposForce.cuh index 693ab5bb..ddbd29d1 100644 --- a/src/core/cuda/include/CudaRestrposForce.cuh +++ b/src/core/cuda/include/CudaRestrposForce.cuh @@ -1,8 +1,7 @@ -#ifndef CUDA_RESTRPOS_FORCE_CUH -#define CUDA_RESTRPOS_FORCE_CUH +#pragma once #include "system.h" +void init_restrpos_force_kernel_data(); void calc_restrpos_forces_host(); -void cleanup_restrpos_force(); -#endif // CUDA_RESTRPOS_FORCE_CUH \ No newline at end of file +void cleanup_restrpos_force(); diff --git a/src/core/cuda/include/CudaRestrseqForce.cuh b/src/core/cuda/include/CudaRestrseqForce.cuh index ae013466..efbe60bc 100644 --- a/src/core/cuda/include/CudaRestrseqForce.cuh +++ b/src/core/cuda/include/CudaRestrseqForce.cuh @@ -1,11 +1,6 @@ -#ifndef CUDA_RESTRSEQ_FORCE_CUH -#define CUDA_RESTRSEQ_FORCE_CUH +#pragma once #include "system.h" -__global__ void calc_restrseq_forces_kernel(); +void init_restrseq_force_kernel_data(); void calc_restrseq_forces_host(); - void cleanup_restrseq_force(); - - -#endif // CUDA_RESTRSEQ_FORCE_CUH \ No newline at end of file diff --git a/src/core/cuda/include/CudaRestrwallForce.cuh b/src/core/cuda/include/CudaRestrwallForce.cuh index c5fa7fe5..c78a4759 100644 --- a/src/core/cuda/include/CudaRestrwallForce.cuh +++ b/src/core/cuda/include/CudaRestrwallForce.cuh @@ -1,8 +1,6 @@ -#ifndef CUDA_RESTRWALL_FORCE_CUH -#define CUDA_RESTRWALL_FORCE_CUH +#pragma once #include "system.h" +void init_restrwall_force_kernel_data(); void calc_restrwall_forces_host(); void cleanup_restrwall_force(); - -#endif \ No newline at end of file diff --git a/src/core/cuda/include/CudaShakeConstraints.cuh b/src/core/cuda/include/CudaShakeConstraints.cuh index c4f85736..e05739a8 100644 --- a/src/core/cuda/include/CudaShakeConstraints.cuh +++ b/src/core/cuda/include/CudaShakeConstraints.cuh @@ -1,22 +1,7 @@ -#ifndef CUDA_SHAKE_CONSTRAINTS_CUH -#define CUDA_SHAKE_CONSTRAINTS_CUH +#pragma once #include "system.h" -__global__ void calc_shake_constraints_kernel( - int n_molecules, - int *mol_n_shakes, - shake_bond_t *shake_bonds, - coord_t *coords, - coord_t *xcoords, - double *winv, - int *total_iterations, - int *mol_shake_offset -); +void init_shake_constraints_kernel_data(); int calc_shake_constraints_host(); - void cleanup_shake_constraints(); - - - -#endif \ No newline at end of file diff --git a/src/core/cuda/include/CudaTemperature.cuh b/src/core/cuda/include/CudaTemperature.cuh index 89ffb6df..ad7b8a9d 100644 --- a/src/core/cuda/include/CudaTemperature.cuh +++ b/src/core/cuda/include/CudaTemperature.cuh @@ -1,11 +1,7 @@ -#ifndef CUDA_TEMPERATURE_CUH -#define CUDA_TEMPERATURE_CUH +#pragma once #include "system.h" -__global__ void calc_temperature_kernel(int n_atoms, int n_atoms_solute, atype_t* atypes, catype_t *catypes, vel_t *velocities, 
bool* excluded, double boltz, double ekinmax, -double *Temp_solute, double *Tfree_solute, double *Texcl_solute, double *Temp_solvent, double *Tfree_solvent, double *Texcl_solvent); +void init_temperature_kernel_data(); void calc_temperature_host(); void cleanup_temperature(); - -#endif \ No newline at end of file diff --git a/src/core/cuda/include/CudaTorsionForce.cuh b/src/core/cuda/include/CudaTorsionForce.cuh index 9e51543a..b2ec94b8 100644 --- a/src/core/cuda/include/CudaTorsionForce.cuh +++ b/src/core/cuda/include/CudaTorsionForce.cuh @@ -1,11 +1,7 @@ -#ifndef CUDA_TORSION_FORCE_H -#define CUDA_TORSION_FORCE_H +#pragma once #include "system.h" -__global__ void calc_torsion_forces_kernel(int start, int end, torsion_t* torsions, ctorsion_t* ctorsions, coord_t* coords, dvel_t* dvelocities, double* energy_sum); - +void init_torsion_force_kernel_data(); double calc_torsion_forces_host(int start, int end); void cleanup_torsion_force(); - -#endif \ No newline at end of file diff --git a/src/core/cuda/src/CudaAngleForce.cu b/src/core/cuda/src/CudaAngleForce.cu index f9467b97..452d64cf 100644 --- a/src/core/cuda/src/CudaAngleForce.cu +++ b/src/core/cuda/src/CudaAngleForce.cu @@ -1,17 +1,13 @@ #include "cuda/include/CudaAngleForce.cuh" -#include "utils.h" +#include "cuda/include/CudaContext.cuh" #include "cuda/include/CudaUtility.cuh" +#include "utils.h" namespace CudaAngleForce { bool is_initialized = false; -angle_t* d_angles = nullptr; -coord_t* d_coords = nullptr; -cangle_t* d_cangles = nullptr; -dvel_t* d_dvelocities = nullptr; -double* d_energy_sum = nullptr; +double* d_energy_sum; } // namespace CudaAngleForce - __global__ void calc_angle_forces_kernel(int start, int end, angle_t* angles, coord_t* coords, cangle_t* cangles, dvel_t* dvelocities, double* energy_sum) { int idx = blockIdx.x * blockDim.x + threadIdx.x + start; if (idx >= end) return; @@ -76,30 +72,19 @@ __global__ void calc_angle_forces_kernel(int start, int end, angle_t* angles, co } double calc_angle_forces_host(int start, int end) { - using namespace CudaAngleForce; int N = end - start; if (N <= 0) return 0.0; + using namespace CudaAngleForce; int blockSize = 256; int numBlocks = (N + blockSize - 1) / blockSize; - if (!is_initialized) { - is_initialized = true; - check_cudaMalloc((void**)&d_angles, n_angles * sizeof(angle_t)); - check_cudaMalloc((void**)&d_coords, n_atoms * sizeof(coord_t)); - check_cudaMalloc((void**)&d_cangles, n_cangles * sizeof(cangle_t)); - check_cudaMalloc((void**)&d_dvelocities, n_atoms * sizeof(dvel_t)); - check_cudaMalloc((void**)&d_energy_sum, sizeof(double)); - - - - // This data is constant, copy once - cudaMemcpy(d_angles, angles, n_angles * sizeof(angle_t), cudaMemcpyHostToDevice); - cudaMemcpy(d_cangles, cangles, n_cangles * sizeof(cangle_t), cudaMemcpyHostToDevice); - } - - // copy data to device - cudaMemcpy(d_coords, coords, n_atoms * sizeof(coord_t), cudaMemcpyHostToDevice); - cudaMemcpy(d_dvelocities, dvelocities, n_atoms * sizeof(dvel_t), cudaMemcpyHostToDevice); + CudaContext& ctx = CudaContext::instance(); + auto d_angles = ctx.d_angles; + auto d_coords = ctx.d_coords; + auto d_cangles = ctx.d_cangles; + auto d_dvelocities = ctx.d_dvelocities; + // todo: now have to do that, after moving all to CudaContext, can remove it + // ctx.sync_all_to_device(); double h_energy_sum = 0.0; cudaMemcpy(d_energy_sum, &h_energy_sum, sizeof(double), cudaMemcpyHostToDevice); @@ -108,20 +93,25 @@ double calc_angle_forces_host(int start, int end) { calc_angle_forces_kernel<<>>(start, end, d_angles, 
d_coords, d_cangles, d_dvelocities, d_energy_sum); cudaDeviceSynchronize(); + // todo: Now have to do that, after moving all to CudaContext, can remove it // copy results back to host cudaMemcpy(&h_energy_sum, d_energy_sum, sizeof(double), cudaMemcpyDeviceToHost); - cudaMemcpy(dvelocities, d_dvelocities, n_atoms * sizeof(dvel_t), cudaMemcpyDeviceToHost); + cudaMemcpy(dvelocities, ctx.d_dvelocities, n_atoms * sizeof(dvel_t), cudaMemcpyDeviceToHost); return h_energy_sum; } +void init_angle_force_kernel_data() { + using namespace CudaAngleForce; + if (!is_initialized) { + check_cudaMalloc((void**)&d_energy_sum, sizeof(double)); + is_initialized = true; + } +} + void cleanup_angle_force() { using namespace CudaAngleForce; if (is_initialized) { - cudaFree(d_angles); - cudaFree(d_coords); - cudaFree(d_cangles); - cudaFree(d_dvelocities); cudaFree(d_energy_sum); is_initialized = false; } -} \ No newline at end of file +} diff --git a/src/core/cuda/src/CudaBondForce.cu b/src/core/cuda/src/CudaBondForce.cu index b4119c38..47869e76 100644 --- a/src/core/cuda/src/CudaBondForce.cu +++ b/src/core/cuda/src/CudaBondForce.cu @@ -1,15 +1,10 @@ #include "cuda/include/CudaBondForce.cuh" +#include "cuda/include/CudaContext.cuh" #include "utils.h" - namespace CudaBondForce { bool is_initialized = false; -bond_t* d_bonds = nullptr; -coord_t* d_coords = nullptr; -cbond_t* d_cbonds = nullptr; -dvel_t* d_dvelocities = nullptr; -double* d_energy_sum = nullptr; +double* d_energy_sum; } // namespace CudaBondForce - __global__ void calc_bond_forces_kernel(int start, int end, bond_t* bonds, coord_t* coords, cbond_t* cbonds, dvel_t* dvelocities, double* energy_sum) { int idx = blockIdx.x * blockDim.x + threadIdx.x + start; if (idx >= end) return; @@ -39,42 +34,41 @@ __global__ void calc_bond_forces_kernel(int start, int end, bond_t* bonds, coord } double calc_bond_forces_host(int start, int end) { - using namespace CudaBondForce; int N = end - start; if (N <= 0) return 0.0; + using namespace CudaBondForce; int blockSize = 256; int numBlocks = (N + blockSize - 1) / blockSize; - if (!is_initialized) { - check_cudaMalloc((void**)&d_bonds, sizeof(bond_t) * n_bonds); - check_cudaMalloc((void**)&d_coords, sizeof(coord_t) * n_atoms); - check_cudaMalloc((void**)&d_cbonds, sizeof(cbond_t) * n_cbonds); - check_cudaMalloc((void**)&d_dvelocities, sizeof(dvel_t) * n_atoms); - check_cudaMalloc((void**)&d_energy_sum, sizeof(double)); - is_initialized = true; - } + double energy = 0.0; + cudaMemcpy(d_energy_sum, &energy, sizeof(double), cudaMemcpyHostToDevice); + + CudaContext& context = CudaContext::instance(); + bond_t* d_bonds = context.d_bonds; + coord_t* d_coords = context.d_coords; + cbond_t* d_cbonds = context.d_cbonds; + dvel_t* d_dvelocities = context.d_dvelocities; - cudaMemcpy(d_bonds, bonds, sizeof(bond_t) * n_bonds, cudaMemcpyHostToDevice); - cudaMemcpy(d_coords, coords, sizeof(coord_t) * n_atoms, cudaMemcpyHostToDevice); - cudaMemcpy(d_cbonds, cbonds, sizeof(cbond_t) * n_cbonds, cudaMemcpyHostToDevice); - cudaMemcpy(d_dvelocities, dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyHostToDevice); - double zero = 0.0; - cudaMemcpy(d_energy_sum, &zero, sizeof(double), cudaMemcpyHostToDevice); calc_bond_forces_kernel<<>>(start, end, d_bonds, d_coords, d_cbonds, d_dvelocities, d_energy_sum); cudaDeviceSynchronize(); - cudaMemcpy(&zero, d_energy_sum, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(&energy, d_energy_sum, sizeof(double), cudaMemcpyDeviceToHost); cudaMemcpy(dvelocities, d_dvelocities, sizeof(dvel_t) * n_atoms, 
cudaMemcpyDeviceToHost); - return zero; + + return energy; +} + +void init_bond_force_kernel_data() { + using namespace CudaBondForce; + if (!is_initialized) { + check_cudaMalloc((void**)&d_energy_sum, sizeof(double)); + is_initialized = true; + } } void cleanup_bond_force() { using namespace CudaBondForce; if (is_initialized) { - cudaFree(d_bonds); - cudaFree(d_coords); - cudaFree(d_cbonds); - cudaFree(d_dvelocities); cudaFree(d_energy_sum); is_initialized = false; } -} \ No newline at end of file +} diff --git a/src/core/cuda/src/CudaContext.cu b/src/core/cuda/src/CudaContext.cu new file mode 100644 index 00000000..19f22165 --- /dev/null +++ b/src/core/cuda/src/CudaContext.cu @@ -0,0 +1,217 @@ +#include + +#include "cuda/include/CudaContext.cuh" + +void CudaContext::init() { + check_cudaMalloc((void**)&d_coords, sizeof(coord_t) * n_atoms); + check_cudaMalloc((void**)&d_dvelocities, sizeof(dvel_t) * n_atoms); + check_cudaMalloc((void**)&d_velocities, sizeof(vel_t) * n_atoms); + check_cudaMalloc((void**)&d_angles, sizeof(angle_t) * n_angles); + check_cudaMalloc((void**)&d_cangles, sizeof(cangle_t) * n_cangles); + check_cudaMalloc((void**)&d_bonds, sizeof(bond_t) * n_bonds); + check_cudaMalloc((void**)&d_cbonds, sizeof(cbond_t) * n_cbonds); + check_cudaMalloc((void**)&d_impropers, sizeof(improper_t) * n_impropers); + check_cudaMalloc((void**)&d_cimpropers, sizeof(cimproper_t) * n_cimpropers); + + check_cudaMalloc((void**)&d_mol_n_shakes, sizeof(int) * n_molecules); + check_cudaMalloc((void**)&d_shake_bonds, sizeof(shake_bond_t) * n_shake_constraints); + check_cudaMalloc((void**)&d_winv, sizeof(double) * n_atoms); + check_cudaMalloc((void**)&d_xcoords, sizeof(coord_t) * n_atoms); + + check_cudaMalloc((void**)&d_atypes, sizeof(atype_t) * n_atypes); + check_cudaMalloc((void**)&d_catypes, sizeof(catype_t) * n_catypes); + + + check_cudaMalloc((void**)&d_q_atoms, sizeof(q_atom_t) * n_qatoms); + check_cudaMalloc((void**)&d_q_charges, sizeof(q_charge_t) * n_qatoms * n_lambdas); + check_cudaMalloc((void**)&d_LJ_matrix, sizeof(int) * n_atoms_solute * n_atoms_solute); + check_cudaMalloc((void**)&d_excluded, sizeof(bool) * n_atoms); + check_cudaMalloc((void**)&d_q_elscales, sizeof(q_elscale_t) * n_qelscales); + check_cudaMalloc((void**)&d_q_catypes, sizeof(q_catype_t) * n_qcatypes); + check_cudaMalloc((void**)&d_q_atypes, sizeof(q_atype_t) * n_qatoms * n_lambdas); + check_cudaMalloc((void**)&d_EQ_nonbond_qq, sizeof(E_nonbonded_t) * n_lambdas); + check_cudaMalloc((void**)&d_lambdas, sizeof(double) * n_lambdas); + + check_cudaMalloc((void**)&d_wshells, n_shells * sizeof(shell_t)); + check_cudaMalloc((void**)&d_shell, sizeof(bool) * n_atoms); + check_cudaMalloc((void**)&d_coords_top, sizeof(coord_t) * n_atoms); + + check_cudaMalloc((void**)&d_restrangs, sizeof(restrang_t) * n_restrangs); + check_cudaMalloc((void**)&d_EQ_restraint, sizeof(E_restraint_t) * n_lambdas); + check_cudaMalloc((void**)&d_restrdists, sizeof(restrdis_t) * n_restrdists); + + check_cudaMalloc((void**)&d_restrspos, sizeof(restrpos_t) * n_restrspos); + + check_cudaMalloc((void**)&d_restrseqs, sizeof(restrseq_t) * n_restrseqs); + check_cudaMalloc((void**)&d_heavy, sizeof(bool) * n_atoms); + + check_cudaMalloc((void**)&d_restrwalls, sizeof(restrwall_t) * n_restrwalls); + + check_cudaMalloc((void**)&d_torsions, sizeof(torsion_t) * n_torsions); + check_cudaMalloc((void**)&d_ctorsions, sizeof(ctorsion_t) * n_ctorsions); + + check_cudaMalloc((void**)&d_ccharges, sizeof(ccharge_t) * n_ccharges); + check_cudaMalloc((void**)&d_charges, 
sizeof(charge_t) * n_charges); + check_cudaMalloc((void**)&d_p_atoms, sizeof(p_atom_t) * n_patoms); + + sync_all_to_device(); +} + +void CudaContext::sync_all_to_device() { + sync_array_to_device(d_coords, coords, n_atoms); + sync_array_to_device(d_dvelocities, dvelocities, n_atoms); + sync_array_to_device(d_velocities, velocities, n_atoms); + sync_array_to_device(d_angles, angles, n_angles); + sync_array_to_device(d_cangles, cangles, n_cangles); + sync_array_to_device(d_bonds, bonds, n_bonds); + sync_array_to_device(d_cbonds, cbonds, n_cbonds); + sync_array_to_device(d_impropers, impropers, n_impropers); + sync_array_to_device(d_cimpropers, cimpropers, n_cimpropers); + + sync_array_to_device(d_mol_n_shakes, mol_n_shakes, n_molecules); + sync_array_to_device(d_shake_bonds, shake_bonds, n_shake_constraints); + sync_array_to_device(d_winv, winv, n_atoms); + sync_array_to_device(d_xcoords, xcoords, n_atoms); + + sync_array_to_device(d_atypes, atypes, n_atypes); + sync_array_to_device(d_catypes, catypes, n_catypes); + + sync_array_to_device(d_q_atoms, q_atoms, n_qatoms); + sync_array_to_device(d_q_charges, q_charges, n_qatoms * n_lambdas); + sync_array_to_device(d_LJ_matrix, LJ_matrix, n_atoms_solute * n_atoms_solute); + sync_array_to_device(d_excluded, excluded, n_atoms); + sync_array_to_device(d_q_elscales, q_elscales, n_qelscales); + sync_array_to_device(d_q_catypes, q_catypes, n_qcatypes); + sync_array_to_device(d_q_atypes, q_atypes, n_qatoms * n_lambdas); + sync_array_to_device(d_EQ_nonbond_qq, EQ_nonbond_qq, n_lambdas); + sync_array_to_device(d_lambdas, lambdas, n_lambdas); + sync_array_to_device(d_wshells, wshells, n_shells); + + sync_array_to_device(d_shell, shell, n_atoms); + sync_array_to_device(d_coords_top, coords_top, n_atoms); + + sync_array_to_device(d_restrangs, restrangs, n_restrangs); + sync_array_to_device(d_EQ_restraint, EQ_restraint, n_lambdas); + sync_array_to_device(d_restrdists, restrdists, n_restrdists); + + sync_array_to_device(d_restrspos, restrspos, n_restrspos); + + sync_array_to_device(d_restrseqs, restrseqs, n_restrseqs); + sync_array_to_device(d_heavy, heavy, n_atoms); + sync_array_to_device(d_restrwalls, restrwalls, n_restrwalls); + + sync_array_to_device(d_torsions, torsions, n_torsions); + sync_array_to_device(d_ctorsions, ctorsions, n_ctorsions); + + sync_array_to_device(d_ccharges, ccharges, n_ccharges); + sync_array_to_device(d_charges, charges, n_charges); + sync_array_to_device(d_p_atoms, p_atoms, n_patoms); +} + +void CudaContext::sync_all_to_host() { + sync_array_to_host(coords, d_coords, n_atoms); + sync_array_to_host(dvelocities, d_dvelocities, n_atoms); + sync_array_to_host(velocities, d_velocities, n_atoms); + sync_array_to_host(angles, d_angles, n_angles); + sync_array_to_host(cangles, d_cangles, n_cangles); + sync_array_to_host(bonds, d_bonds, n_bonds); + sync_array_to_host(cbonds, d_cbonds, n_cbonds); + sync_array_to_host(impropers, d_impropers, n_impropers); + sync_array_to_host(cimpropers, d_cimpropers, n_cimpropers); + + sync_array_to_host(mol_n_shakes, d_mol_n_shakes, n_molecules); + sync_array_to_host(shake_bonds, d_shake_bonds, n_shake_constraints); + sync_array_to_host(winv, d_winv, n_atoms); + sync_array_to_host(xcoords, d_xcoords, n_atoms); + + sync_array_to_host(atypes, d_atypes, n_atypes); + sync_array_to_host(catypes, d_catypes, n_catypes); + + sync_array_to_host(q_atoms, d_q_atoms, n_qatoms); + sync_array_to_host(q_charges, d_q_charges, n_qatoms * n_lambdas); + sync_array_to_host(LJ_matrix, d_LJ_matrix, n_atoms_solute * 
n_atoms_solute); + sync_array_to_host(excluded, d_excluded, n_atoms); + sync_array_to_host(q_elscales, d_q_elscales, n_qelscales); + sync_array_to_host(q_catypes, d_q_catypes, n_qcatypes); + sync_array_to_host(q_atypes, d_q_atypes, n_qatoms * n_lambdas); + sync_array_to_host(EQ_nonbond_qq, d_EQ_nonbond_qq, n_lambdas); + sync_array_to_host(lambdas, d_lambdas, n_lambdas); + sync_array_to_host(wshells, d_wshells, n_shells); + sync_array_to_host(shell, d_shell, n_atoms); + sync_array_to_host(coords_top, d_coords_top, n_atoms); + + sync_array_to_host(restrangs, d_restrangs, n_restrangs); + sync_array_to_host(EQ_restraint, d_EQ_restraint, n_lambdas); + sync_array_to_host(restrdists, d_restrdists, n_restrdists); + + sync_array_to_host(restrspos, d_restrspos, n_restrspos); + + sync_array_to_host(restrseqs, d_restrseqs, n_restrseqs); + sync_array_to_host(heavy, d_heavy, n_atoms); + sync_array_to_host(restrwalls, d_restrwalls, n_restrwalls); + + sync_array_to_host(torsions, d_torsions, n_torsions); + sync_array_to_host(ctorsions, d_ctorsions, n_ctorsions); + + sync_array_to_host(ccharges, d_ccharges, n_ccharges); + sync_array_to_host(charges, d_charges, n_charges); + sync_array_to_host(p_atoms, d_p_atoms, n_patoms); +} + +void CudaContext::free() { + cudaFree(d_coords); + cudaFree(d_dvelocities); + cudaFree(d_velocities); + cudaFree(d_angles); + cudaFree(d_cangles); + cudaFree(d_bonds); + cudaFree(d_cbonds); + cudaFree(d_impropers); + cudaFree(d_cimpropers); + + cudaFree(d_atypes); + cudaFree(d_catypes); + + cudaFree(d_mol_n_shakes); + cudaFree(d_shake_bonds); + cudaFree(d_winv); + cudaFree(d_xcoords); + + cudaFree(d_q_atoms); + cudaFree(d_q_charges); + cudaFree(d_LJ_matrix); + cudaFree(d_excluded); + cudaFree(d_q_elscales); + cudaFree(d_q_catypes); + cudaFree(d_q_atypes); + cudaFree(d_EQ_nonbond_qq); + cudaFree(d_lambdas); + + cudaFree(d_wshells); + cudaFree(d_shell); + cudaFree(d_coords_top); + + cudaFree(d_restrangs); + cudaFree(d_EQ_restraint); + cudaFree(d_restrdists); + + cudaFree(d_restrspos); + + cudaFree(d_restrseqs); + cudaFree(d_heavy); + + cudaFree(d_restrwalls); + + cudaFree(d_torsions); + cudaFree(d_ctorsions); + + cudaFree(d_ccharges); + cudaFree(d_charges); + cudaFree(d_p_atoms); +} + +void CudaContext::reset_energies() { + cudaMemset(d_dvelocities, 0, sizeof(dvel_t) * n_atoms); + cudaMemset(d_EQ_nonbond_qq, 0, sizeof(E_nonbonded_t) * n_lambdas); + cudaMemset(d_EQ_restraint, 0, sizeof(E_restraint_t) * n_lambdas); +} diff --git a/src/core/cuda/src/CudaImproper2Force.cu b/src/core/cuda/src/CudaImproper2Force.cu index 4d5e8670..21f1e1a5 100644 --- a/src/core/cuda/src/CudaImproper2Force.cu +++ b/src/core/cuda/src/CudaImproper2Force.cu @@ -1,14 +1,11 @@ +#include "cuda/include/CudaContext.cuh" #include "cuda/include/CudaImproper2Force.cuh" #include "cuda/include/CudaUtility.cuh" #include "utils.h" namespace CudaImproper2Force { bool is_initialized = false; -improper_t* d_impropers = nullptr; -cimproper_t* d_cimpropers = nullptr; -coord_t* d_coords = nullptr; -dvel_t* d_dvelocities = nullptr; -double* d_energy_sum = nullptr; +double* d_energy_sum; } // namespace CudaImproper2Force __global__ void calc_improper2_forces_kernel(int start, int end, improper_t* impropers, cimproper_t* cimpropers, coord_t* coords, dvel_t* dvelocities, double* energy_sum) { @@ -125,45 +122,44 @@ __global__ void calc_improper2_forces_kernel(int start, int end, improper_t* imp atomicAdd(&dvelocities[aki].z, dv * dpk.z); atomicAdd(&dvelocities[ali].x, dv * dpl.x); atomicAdd(&dvelocities[ali].y, dv * dpl.y); - 
atomicAdd(&dvelocities[ali].z, dv * dpl.z); + atomicAdd(&dvelocities[ali].z, dv * dpl.z); } double calc_improper2_forces_host(int start, int end) { - using namespace CudaImproper2Force; int N = end - start; if (N <= 0) return 0.0; + using namespace CudaImproper2Force; int blockSize = 256; int numBlocks = (N + blockSize - 1) / blockSize; - if (!is_initialized) { - check_cudaMalloc((void**)&d_impropers, sizeof(improper_t) * n_impropers); - check_cudaMalloc((void**)&d_cimpropers, sizeof(cimproper_t) * n_cimpropers); - check_cudaMalloc((void**)&d_coords, sizeof(coord_t) * n_atoms); - check_cudaMalloc((void**)&d_dvelocities, sizeof(dvel_t) * n_atoms); - check_cudaMalloc((void**)&d_energy_sum, sizeof(double)); - is_initialized = true; - } - cudaMemcpy(d_impropers, impropers, sizeof(improper_t) * n_impropers, cudaMemcpyHostToDevice); - cudaMemcpy(d_cimpropers, cimpropers, sizeof(cimproper_t) * n_cimpropers, cudaMemcpyHostToDevice); - cudaMemcpy(d_coords, coords, sizeof(coord_t) * n_atoms, cudaMemcpyHostToDevice); - cudaMemcpy(d_dvelocities, dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyHostToDevice); - double zero = 0.0; - cudaMemcpy(d_energy_sum, &zero, sizeof(double), cudaMemcpyHostToDevice); + double energy = 0.0; + cudaMemcpy(d_energy_sum, &energy, sizeof(double), cudaMemcpyHostToDevice); + + CudaContext& context = CudaContext::instance(); + // context.sync_all_to_device(); + coord_t* d_coords = context.d_coords; + dvel_t* d_dvelocities = context.d_dvelocities; + improper_t* d_impropers = context.d_impropers; + cimproper_t* d_cimpropers = context.d_cimpropers; + calc_improper2_forces_kernel<<>>(start, end, d_impropers, d_cimpropers, d_coords, d_dvelocities, d_energy_sum); cudaDeviceSynchronize(); - double energy = 0.0; cudaMemcpy(&energy, d_energy_sum, sizeof(double), cudaMemcpyDeviceToHost); cudaMemcpy(dvelocities, d_dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyDeviceToHost); return energy; } +void init_improper2_force_kernel_data() { + using namespace CudaImproper2Force; + if (!is_initialized) { + check_cudaMalloc((void**)&d_energy_sum, sizeof(double)); + is_initialized = true; + } +} + void cleanup_improper2_force() { using namespace CudaImproper2Force; if (is_initialized) { - cudaFree(d_impropers); - cudaFree(d_cimpropers); - cudaFree(d_coords); - cudaFree(d_dvelocities); cudaFree(d_energy_sum); is_initialized = false; } diff --git a/src/core/cuda/src/CudaLeapfrog.cu b/src/core/cuda/src/CudaLeapfrog.cu index 5d0af953..d246b069 100644 --- a/src/core/cuda/src/CudaLeapfrog.cu +++ b/src/core/cuda/src/CudaLeapfrog.cu @@ -1,19 +1,13 @@ #include +#include "cuda/include/CudaContext.cuh" #include "cuda/include/CudaLeapfrog.cuh" #include "cuda/include/CudaShakeConstraints.cuh" #include "utils.h" namespace CudaLeapfrog { -bool is_initialized = false; -atype_t* d_atypes = nullptr; -catype_t* d_catypes = nullptr; -vel_t* d_velocities = nullptr; -dvel_t* d_dvelocities = nullptr; -coord_t* d_coords = nullptr; -coord_t* d_xcoords = nullptr; -} // namespace CudaLeapfrog +} __global__ void calc_leapfrog_kernel( atype_t* atypes, @@ -52,23 +46,13 @@ __global__ void calc_leapfrog_kernel( } void calc_leapfrog_host() { - using namespace CudaLeapfrog; - if (!is_initialized) { - check_cudaMalloc((void**)&d_atypes, sizeof(atype_t) * n_atypes); - check_cudaMalloc((void**)&d_catypes, sizeof(catype_t) * n_catypes); - check_cudaMalloc((void**)&d_velocities, sizeof(vel_t) * n_atoms); - check_cudaMalloc((void**)&d_dvelocities, sizeof(dvel_t) * n_atoms); - check_cudaMalloc((void**)&d_coords, sizeof(coord_t) * 
n_atoms); - check_cudaMalloc((void**)&d_xcoords, sizeof(coord_t) * n_atoms); - - cudaMemcpy(d_atypes, atypes, sizeof(atype_t) * n_atypes, cudaMemcpyHostToDevice); - cudaMemcpy(d_catypes, catypes, sizeof(catype_t) * n_catypes, cudaMemcpyHostToDevice); - - is_initialized = true; - } - cudaMemcpy(d_velocities, velocities, sizeof(vel_t) * n_atoms, cudaMemcpyHostToDevice); - cudaMemcpy(d_dvelocities, dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyHostToDevice); - cudaMemcpy(d_coords, coords, sizeof(coord_t) * n_atoms, cudaMemcpyHostToDevice); + CudaContext& ctx = CudaContext::instance(); + auto d_atypes = ctx.d_atypes; + auto d_catypes = ctx.d_catypes; + auto d_velocities = ctx.d_velocities; + auto d_dvelocities = ctx.d_dvelocities; + auto d_coords = ctx.d_coords; + auto d_xcoords = ctx.d_xcoords; int blockSize = 256; int numBlocks = (n_atoms + blockSize - 1) / blockSize; @@ -102,15 +86,7 @@ void calc_leapfrog_host() { } } } -void cleanup_leapfrog() { - using namespace CudaLeapfrog; - if (is_initialized) { - cudaFree(d_atypes); - cudaFree(d_catypes); - cudaFree(d_velocities); - cudaFree(d_dvelocities); - cudaFree(d_coords); - cudaFree(d_xcoords); - is_initialized = false; - } -} + +void init_leapfrog_kernel_data() {} + +void cleanup_leapfrog() {} \ No newline at end of file diff --git a/src/core/cuda/src/CudaNonbondedPPForce.cu b/src/core/cuda/src/CudaNonbondedPPForce.cu new file mode 100644 index 00000000..d32db32b --- /dev/null +++ b/src/core/cuda/src/CudaNonbondedPPForce.cu @@ -0,0 +1,311 @@ +#include "cuda/include/CudaContext.cuh" +#include "cuda/include/CudaNonbondedPPForce.cuh" +#include "utils.h" + +namespace CudaNonbondedPPForce { +bool is_initialized = false; +dvel_t* PP_MAT; +double *D_PP_Evdw, *D_PP_Ecoul; +double *D_PP_evdw_TOT, *D_PP_ecoul_TOT; + +__device__ void calc_pp_dvel_matrix_incr(int row, int pi, int column, int pj, + coord_t* Xs, coord_t* Ys, int* LJs, bool* excluded_s, double* Evdw, double* Ecoul, dvel_t* patom_a, dvel_t* patom_b, + ccharge_t* D_ccharges, charge_t* D_charges, catype_t* D_catypes, atype_t* D_atypes, p_atom_t* D_patoms, topo_t D_topo) { + bool bond14, bond23; + double scaling; + coord_t da; + double r2a, ra, r6a; + double Vela, V_a, V_b; + double dva; + double crg_i, crg_j; + double ai_aii, aj_aii, ai_bii, aj_bii; + catype_t ai_type, aj_type; + + bond23 = LJs[row * BLOCK_SIZE + column] == 3; + bond14 = LJs[row * BLOCK_SIZE + column] == 1; + + if (bond23) return; + if (excluded_s[row] || excluded_s[BLOCK_SIZE + column]) return; + + scaling = bond14 ? D_topo.el14_scale : 1; + + crg_i = D_ccharges[D_charges[pi].code - 1].charge; + crg_j = D_ccharges[D_charges[pj].code - 1].charge; + + ai_type = D_catypes[D_atypes[pi].code - 1]; + aj_type = D_catypes[D_atypes[pj].code - 1]; + + da.x = Ys[column].x - Xs[row].x; + da.y = Ys[column].y - Xs[row].y; + da.z = Ys[column].z - Xs[row].z; + r2a = 1 / (pow(da.x, 2) + pow(da.y, 2) + pow(da.z, 2)); + ra = sqrt(r2a); + r6a = r2a * r2a * r2a; + + Vela = scaling * D_topo.coulomb_constant * crg_i * crg_j * ra; + + ai_aii = bond14 ? ai_type.aii_1_4 : ai_type.aii_normal; + aj_aii = bond14 ? aj_type.aii_1_4 : aj_type.aii_normal; + ai_bii = bond14 ? ai_type.bii_1_4 : ai_type.bii_normal; + aj_bii = bond14 ? 
aj_type.bii_1_4 : aj_type.bii_normal; + + V_a = r6a * r6a * ai_aii * aj_aii; + V_b = r6a * ai_bii * aj_bii; + dva = r2a * (-Vela - 12 * V_a + 6 * V_b); + + patom_a->x = -dva * da.x; + patom_a->y = -dva * da.y; + patom_a->z = -dva * da.z; + + patom_b->x = dva * da.x; + patom_b->y = dva * da.y; + patom_b->z = dva * da.z; + + *Ecoul += Vela; + *Evdw += (V_a - V_b); +} + +__global__ void calc_pp_dvel_matrix(int n_patoms, int n_atoms_solute, + coord_t* X, double* Evdw, double* Ecoul, dvel_t* PP_MAT, + ccharge_t* D_ccharges, charge_t* D_charges, catype_t* D_catypes, atype_t* D_atypes, p_atom_t* D_patoms, int* D_LJ_matrix, bool* D_excluded, topo_t D_topo) { + // Block index + int bx = blockIdx.x; + int by = blockIdx.y; + + // Thread index + int tx = threadIdx.x; + int ty = threadIdx.y; + + int aStart = BLOCK_SIZE * by; + int bStart = BLOCK_SIZE * bx; + + __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE]; + + Ecoul_S[ty][tx] = 0; + Evdw_S[ty][tx] = 0; + + if (aStart + ty >= n_patoms) return; + if (bStart + tx >= n_patoms) return; + + __shared__ coord_t Xs[BLOCK_SIZE]; + __shared__ coord_t Ys[BLOCK_SIZE]; + __shared__ int LJs[BLOCK_SIZE * BLOCK_SIZE]; + __shared__ bool excluded_s[2 * BLOCK_SIZE]; + + int pi = D_patoms[aStart + ty].a - 1; + int pj = D_patoms[bStart + tx].a - 1; + + if (tx == 0) { + Xs[ty] = X[pi]; + excluded_s[ty] = D_excluded[pi]; + } + + if (ty == 0) { + Ys[tx] = X[pj]; + excluded_s[BLOCK_SIZE + tx] = D_excluded[pj]; + } + LJs[ty * BLOCK_SIZE + tx] = D_LJ_matrix[pi * n_atoms_solute + pj]; + + __syncthreads(); + + if (bx < by || (bx == by && tx < ty)) return; + + dvel_t patom_a, patom_b; + memset(&patom_a, 0, sizeof(dvel_t)); + memset(&patom_b, 0, sizeof(dvel_t)); + + int row = by * BLOCK_SIZE + ty; + int column = bx * BLOCK_SIZE + tx; + + if (bx != by || tx != ty) { + double evdw = 0, ecoul = 0; + calc_pp_dvel_matrix_incr(ty, pi, tx, pj, Xs, Ys, LJs, excluded_s, &evdw, &ecoul, &patom_a, + &patom_b, D_ccharges, D_charges, D_catypes, D_atypes, D_patoms, D_topo); + Evdw_S[ty][tx] = evdw; + Ecoul_S[ty][tx] = ecoul; + } + + PP_MAT[row * n_patoms + column] = patom_a; + PP_MAT[column * n_patoms + row] = patom_b; + + __syncthreads(); + + int rowlen = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE; + + if (tx == 0 && ty == 0) { + double tot_Evdw = 0; + double tot_Ecoul = 0; + for (int i = 0; i < BLOCK_SIZE; i++) { + for (int j = 0; j < BLOCK_SIZE; j++) { + tot_Evdw += Evdw_S[i][j]; + tot_Ecoul += Ecoul_S[i][j]; + } + } + Evdw[rowlen * by + bx] = tot_Evdw; + Ecoul[rowlen * by + bx] = tot_Ecoul; + + // printf("bx = %d by = %d tot_Evdw = %f tot_Ecoul = %f\n", bx, by, Evdw[rowlen * by + bx], Ecoul[rowlen * by + bx]); + } + + __syncthreads(); +} + +__global__ void calc_pp_dvel_vector(int n_patoms, dvel_t* DV_X, dvel_t* PP_MAT, p_atom_t* D_patoms) { + int row = blockIdx.x * blockDim.x + threadIdx.x; + if (row >= n_patoms) return; + + dvel_t dP; + dP.x = 0; + dP.y = 0; + dP.z = 0; + + for (int i = 0; i < n_patoms; i++) { + if (i != row) { + dP.x += PP_MAT[i + n_patoms * row].x; + dP.y += PP_MAT[i + n_patoms * row].y; + dP.z += PP_MAT[i + n_patoms * row].z; + } + } + + int p = D_patoms[row].a - 1; + + DV_X[p].x += dP.x; + DV_X[p].y += dP.y; + DV_X[p].z += dP.z; + + __syncthreads(); +} + +__global__ void calc_energy_sum(int rows, int columns, double* Evdw_TOT, double* Ecoul_TOT, double* Evdw, double* Ecoul, bool upper_diagonal) { + int tx = threadIdx.x; + int ty = threadIdx.y; + + __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ double 
Evdw_S[BLOCK_SIZE][BLOCK_SIZE]; + + // TODO: better way to distribute upper diagonal over threads? Seems like threads in left bottom corner have less work + double coul_TOT = 0; + double vdw_TOT = 0; + for (int i = ty; i < rows; i += BLOCK_SIZE) { + for (int j = tx; j < columns; j += BLOCK_SIZE) { + if (i <= j || !upper_diagonal) { + coul_TOT += Ecoul[i * columns + j]; + vdw_TOT += Evdw[i * columns + j]; + } + } + } + Ecoul_S[ty][tx] = coul_TOT; + Evdw_S[ty][tx] = vdw_TOT; + + __syncthreads(); + + if (tx == 0 && ty == 0) { + double Evdw_temp = 0; + double Ecoul_temp = 0; + + for (int i = 0; i < BLOCK_SIZE; i++) { + for (int j = 0; j < BLOCK_SIZE; j++) { + Evdw_temp += Evdw_S[i][j]; + Ecoul_temp += Ecoul_S[i][j]; + } + } + + *Evdw_TOT = Evdw_temp; + *Ecoul_TOT = Ecoul_temp; + } +} +} // namespace CudaNonbondedPPForce +void calc_nonbonded_pp_forces_host_v2() { + using namespace CudaNonbondedPPForce; + + int mem_size_DV_X = n_atoms_solute * sizeof(dvel_t); + + int n_blocks = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE; + + CudaContext& ctx = CudaContext::instance(); + auto X = ctx.d_coords; + auto DV_X = ctx.d_dvelocities; + auto D_ccharges = ctx.d_ccharges; + auto D_charges = ctx.d_charges; + auto D_catypes = ctx.d_catypes; + auto D_atypes = ctx.d_atypes; + auto D_patoms = ctx.d_p_atoms; + auto D_LJ_matrix = ctx.d_LJ_matrix; + auto D_excluded = ctx.d_excluded; + + dim3 threads, grid; + + threads = dim3(BLOCK_SIZE, BLOCK_SIZE); + grid = dim3((n_patoms + BLOCK_SIZE - 1) / threads.x, (n_patoms + BLOCK_SIZE - 1) / threads.y); + + calc_pp_dvel_matrix<<>>(n_patoms, n_atoms_solute, X, D_PP_Evdw, D_PP_Ecoul, PP_MAT, D_ccharges, D_charges, D_catypes, D_atypes, D_patoms, D_LJ_matrix, D_excluded, topo); + calc_pp_dvel_vector<<<((n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_patoms, DV_X, PP_MAT, D_patoms); + + cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost); + +#ifdef DEBUG + cudaMemcpy(h_PP_MAT, PP_MAT, mem_size_PP_MAT, cudaMemcpyDeviceToHost); +#endif + +#ifdef DEBUG + for (int i = 0; i < n_patoms; i++) { + for (int j = 0; j < n_patoms; j++) { + if (h_PP_MAT[i * n_patoms + j].x > 100) + printf("PP_MAT[%d][%d] = %f %f %f\n", i, j, h_PP_MAT[i * n_patoms + j].x, h_PP_MAT[i * n_patoms + j].y, h_PP_MAT[i * n_patoms + j].z); + } + } +#endif + cudaMemset(D_PP_evdw_TOT, 0, sizeof(double)); + cudaMemset(D_PP_ecoul_TOT, 0, sizeof(double)); + + calc_energy_sum<<<1, threads>>>(n_blocks, n_blocks, D_PP_evdw_TOT, D_PP_ecoul_TOT, D_PP_Evdw, D_PP_Ecoul, true); + double PP_evdw_TOT, PP_ecoul_TOT; + cudaMemcpy(&PP_evdw_TOT, D_PP_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(&PP_ecoul_TOT, D_PP_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost); + + E_nonbond_pp.Uvdw += PP_evdw_TOT; + E_nonbond_pp.Ucoul += PP_ecoul_TOT; +} + +void init_nonbonded_pp_force_kernel_data() { + using namespace CudaNonbondedPPForce; + + if (!is_initialized) { + int n_blocks = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE; + + int mem_size_PP_MAT = n_patoms * n_patoms * sizeof(dvel_t); + + int mem_size_PP_Evdw = n_blocks * n_blocks * sizeof(double); + int mem_size_PP_Ecoul = n_blocks * n_blocks * sizeof(double); +#ifdef DEBUG + printf("Allocating PP_MAT\n"); +#endif + check_cudaMalloc((void**)&PP_MAT, mem_size_PP_MAT); +#ifdef DEBUG + printf("Allocating D_PP_Evdw\n"); +#endif + check_cudaMalloc((void**)&D_PP_Evdw, mem_size_PP_Evdw); +#ifdef DEBUG + printf("Allocating D_PP_Ecoul\n"); +#endif + check_cudaMalloc((void**)&D_PP_Ecoul, mem_size_PP_Ecoul); + + check_cudaMalloc((void**)&D_PP_evdw_TOT, 
sizeof(double)); + check_cudaMalloc((void**)&D_PP_ecoul_TOT, sizeof(double)); + + is_initialized = true; + } +} + +void cleanup_nonbonded_pp_force() { + using namespace CudaNonbondedPPForce; + if (is_initialized) { + cudaFree(PP_MAT); + cudaFree(D_PP_Evdw); + cudaFree(D_PP_Ecoul); + cudaFree(D_PP_evdw_TOT); + cudaFree(D_PP_ecoul_TOT); + is_initialized = false; + } +} \ No newline at end of file diff --git a/src/core/cuda/src/CudaNonbondedPWForce.cu b/src/core/cuda/src/CudaNonbondedPWForce.cu new file mode 100644 index 00000000..dec352db --- /dev/null +++ b/src/core/cuda/src/CudaNonbondedPWForce.cu @@ -0,0 +1,380 @@ +#include "cuda/include/CudaContext.cuh" +#include "cuda/include/CudaNonbondedPWForce.cuh" + +namespace CudaNonbondedPWForce { +// Declare any necessary static variables or device pointers here +bool is_initialized = false; +struct calc_pw_t { + dvel_t P; + dvel_t W; +}; + +calc_pw_t *PW_MAT, *h_PW_MAT; +double *D_PW_Evdw, *D_PW_Ecoul; +double *D_PW_evdw_TOT, *D_PW_ecoul_TOT, PW_evdw_TOT, PW_ecoul_TOT; + +__global__ void calc_energy_sum(int rows, int columns, double* Evdw_TOT, double* Ecoul_TOT, double* Evdw, double* Ecoul, bool upper_diagonal) { + int tx = threadIdx.x; + int ty = threadIdx.y; + + __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE]; + + // TODO: better way to distribute upper diagonal over threads? Seems like threads in left bottom corner have less work + double coul_TOT = 0; + double vdw_TOT = 0; + for (int i = ty; i < rows; i += BLOCK_SIZE) { + for (int j = tx; j < columns; j += BLOCK_SIZE) { + if (i <= j || !upper_diagonal) { + coul_TOT += Ecoul[i * columns + j]; + vdw_TOT += Evdw[i * columns + j]; + } + } + } + Ecoul_S[ty][tx] = coul_TOT; + Evdw_S[ty][tx] = vdw_TOT; + + __syncthreads(); + + if (tx == 0 && ty == 0) { + double Evdw_temp = 0; + double Ecoul_temp = 0; + + for (int i = 0; i < BLOCK_SIZE; i++) { + for (int j = 0; j < BLOCK_SIZE; j++) { + Evdw_temp += Evdw_S[i][j]; + Ecoul_temp += Ecoul_S[i][j]; + } + } + + *Evdw_TOT = Evdw_temp; + *Ecoul_TOT = Ecoul_temp; + } +} + +__device__ void calc_pw_dvel_matrix_incr(int row, int pi, int column, int j, int n_atoms_solute, + coord_t* Xs, coord_t* Ws, bool* excluded_s, double* Evdw, double* Ecoul, calc_pw_t* pw, + ccharge_t* D_ccharges, charge_t* D_charges, catype_t* D_catypes, atype_t* D_atypes, p_atom_t* D_patoms, topo_t D_topo) { + coord_t da; + double r2a, ra, r6a; + double Vela, V_a, V_b; + double dva; + double qi, qj; + double ai_aii, aj_aii, ai_bii, aj_bii; + catype_t ai_type, aj_type; + atype_t i_type, j_type; + + if (excluded_s[row]) return; + qi = D_ccharges[D_charges[pi].code - 1].charge; + qj = D_ccharges[D_charges[n_atoms_solute + j].code - 1].charge; // TODO: FIX THIS!!! WILL NOT WORK WITH QATOMS!!!!! 
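// Annotation: the lookup above assumes solvent atom j sits at global
// index n_atoms_solute + j, i.e. that solvent charge codes form one
// contiguous block behind the solute atoms; the TODO flags that this
// indexing breaks once Q-atom charge handling enters the picture.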
+ + // if (pi < 100 && j < 100){ + // printf("qi = %f qj = %f\n", qi, qj); + // } + + ai_type = D_catypes[D_atypes[pi].code - 1]; + aj_type = D_catypes[D_atypes[n_atoms_solute + j].code - 1]; + + da.x = Ws[column].x - Xs[row].x; + da.y = Ws[column].y - Xs[row].y; + da.z = Ws[column].z - Xs[row].z; + r2a = 1 / (pow(da.x, 2) + pow(da.y, 2) + pow(da.z, 2)); + ra = sqrt(r2a); + r6a = r2a * r2a * r2a; + + Vela = D_topo.coulomb_constant * qi * qj * ra; + + ai_aii = ai_type.aii_normal; + aj_aii = aj_type.aii_normal; + ai_bii = ai_type.bii_normal; + aj_bii = aj_type.bii_normal; + + V_a = r6a * r6a * ai_aii * aj_aii; + V_b = r6a * ai_bii * aj_bii; + dva = r2a * (-Vela - 12 * V_a + 6 * V_b); + + pw->P.x -= dva * da.x; + pw->P.y -= dva * da.y; + pw->P.z -= dva * da.z; + + pw->W.x += dva * da.x; + pw->W.y += dva * da.y; + pw->W.z += dva * da.z; + + *Ecoul += Vela; + *Evdw += (V_a - V_b); + + // if (pi == 522 && j == 175) { + // printf("Vela = %f V_a = %f V_b = %f P = %f %f %f ai_aii = %f aj_aii = %f\n", Vela, V_a, V_b, pw->P.x, pw->P.y, pw->P.z, ai_aii, aj_aii); + // } + + // if (pi < 100 && j < 100) printf("Evdw = %f Ecoul = %f\n", *Evdw, *Ecoul); +} + +__global__ void calc_pw_dvel_vector_row(int n_patoms, int n_waters, dvel_t* DV_X, dvel_t* DV_W, calc_pw_t* PW_MAT, p_atom_t* D_patoms) { + int row = blockIdx.x * blockDim.x + threadIdx.x; + if (row >= n_patoms) return; + + dvel_t dP; + + dP.x = 0; + dP.y = 0; + dP.z = 0; + + for (int i = 0; i < 3 * n_waters; i++) { + dP.x += PW_MAT[i + 3 * n_waters * row].P.x; + dP.y += PW_MAT[i + 3 * n_waters * row].P.y; + dP.z += PW_MAT[i + 3 * n_waters * row].P.z; + } + + int p = D_patoms[row].a - 1; + + DV_X[p].x += dP.x; + DV_X[p].y += dP.y; + DV_X[p].z += dP.z; + + __syncthreads(); +} + +__global__ void calc_pw_dvel_vector_column(int n_patoms, int n_waters, dvel_t* DV_X, dvel_t* DV_W, calc_pw_t* PW_MAT) { + int column = blockIdx.x * blockDim.x + threadIdx.x; + if (column >= 3 * n_waters) return; + + dvel_t dW; + + dW.x = 0; + dW.y = 0; + dW.z = 0; + + for (int i = 0; i < n_patoms; i++) { + dW.x += PW_MAT[column + 3 * n_waters * i].W.x; + dW.y += PW_MAT[column + 3 * n_waters * i].W.y; + dW.z += PW_MAT[column + 3 * n_waters * i].W.z; + } + + DV_W[column].x += dW.x; + DV_W[column].y += dW.y; + DV_W[column].z += dW.z; + + __syncthreads(); +} + +__global__ void calc_pw_dvel_matrix(int n_patoms, int n_atoms_solute, int n_waters, + coord_t* X, coord_t* W, double* Evdw, double* Ecoul, calc_pw_t* PW_MAT, + ccharge_t* D_ccharges, charge_t* D_charges, catype_t* D_catypes, atype_t* D_atypes, p_atom_t* D_patoms, bool* D_excluded, topo_t D_topo) { + // Block index + int bx = blockIdx.x; + int by = blockIdx.y; + + // Thread index + int tx = threadIdx.x; + int ty = threadIdx.y; + + __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE]; + + Ecoul_S[ty][tx] = 0; + Evdw_S[ty][tx] = 0; + + // if (tx == 0 && ty == 0) printf("bx = %d by = %d\n", bx, by); + + // if (bx == 0 && by == 0) printf("bx = %d by = %d tx = %d ty = %d\n", bx, by, tx, ty); + + int aStart = BLOCK_SIZE * by; + int bStart = BLOCK_SIZE * bx; + + if (aStart + ty >= n_patoms) return; + if (bStart + tx >= 3 * n_waters) return; + + // if (bx == 8 && by == 1) printf("bx = %d by = %d tx = %d ty = %d\n", bx, by, tx, ty); + + __shared__ coord_t Xs[BLOCK_SIZE]; + __shared__ coord_t Ws[BLOCK_SIZE]; + __shared__ bool excluded_s[BLOCK_SIZE]; + + int pi = D_patoms[aStart + ty].a - 1; + + Xs[ty] = X[pi]; + Ws[tx] = W[bStart + tx]; + + if (tx == 0) { + excluded_s[ty] = 
D_excluded[pi]; + } + + __syncthreads(); + + calc_pw_t pw; + memset(&pw, 0, sizeof(calc_pw_t)); + + int row = by * BLOCK_SIZE + ty; + int column = bx * BLOCK_SIZE + tx; + + // if (row == 0 && column == 1) { + // printf("Xs[0] = %f\n", Xs[0]); + // printf("Ys[0] = %f\n", Ys[0]); + // printf("Xs[1] = %f\n", Xs[1]); + // printf("Ys[1] = %f\n", Ys[1]); + // printf("Xs[2] = %f\n", Xs[2]); + // printf("Ys[2] = %f\n", Ys[2]); + // printf("Xs[3] = %f\n", Xs[3]); + // printf("Ys[3] = %f\n", Ys[3]); + // printf("Xs[4] = %f\n", Xs[4]); + // printf("Ys[4] = %f\n", Ys[4]); + + // printf("Ys[%d] = %f Xs[%d] = %f\n", 3 * ty, Ys[3 * ty], 3 * tx, Xs[3 * tx]); + // } + + // if (bx == 8 && by == 1) printf("bx = %d by = %d tx = %d ty = %d\n", bx, by, tx, ty); + // __device__ void calc_pw_dvel_matrix_incr(int row, int pi, int column, int j, int n_patoms, + // coord_t *Ps, coord_t *Xs, double *Evdw, double *Ecoul, calc_pw_t *pw, + // ccharge_t *D_ccharges, charge_t *D_charges, catype_t *D_catypes, atype_t *D_atypes, p_atom_t *D_patoms) + double evdw = 0, ecoul = 0; + calc_pw_dvel_matrix_incr(ty, pi, tx, bStart + tx, n_atoms_solute, Xs, Ws, excluded_s, &evdw, &ecoul, &pw, D_ccharges, D_charges, D_catypes, D_atypes, D_patoms, D_topo); + Evdw_S[ty][tx] = evdw; + Ecoul_S[ty][tx] = ecoul; + + // if (row == 0 && column == 1) { + // printf("water_a = %f %f %f water_b = %f %f %f\n", water_a[0].x, water_a[0].y, water_a[0].z, water_b[0].x, water_b[0].y, water_b[0].z); + // } + + // if (bx == 8 && by == 1) printf("n_qatoms = %d\n", n_qatoms); + // if (bx == 8 && by == 1) printf("qi = %d j = %d charge[%d] = %f\n", row, column, row + n_qatoms, D_qcharges[row + n_qatoms * 1].q); + + PW_MAT[column + 3 * n_waters * row] = pw; + + __syncthreads(); + + int rowlen = (3 * n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE; + + if (tx == 0 && ty == 0) { + double tot_Evdw = 0; + double tot_Ecoul = 0; + for (int i = 0; i < BLOCK_SIZE; i++) { + for (int j = 0; j < BLOCK_SIZE; j++) { + tot_Evdw += Evdw_S[i][j]; + tot_Ecoul += Ecoul_S[i][j]; + } + } + Evdw[rowlen * by + bx] = tot_Evdw; + Ecoul[rowlen * by + bx] = tot_Ecoul; + } + + __syncthreads(); +} + +} // namespace CudaNonbondedPWForce + +void calc_nonbonded_pw_forces_host_v2() { + using namespace CudaNonbondedPWForce; + int mem_size_W = 3 * n_waters * sizeof(coord_t); + int mem_size_DV_W = 3 * n_waters * sizeof(dvel_t); + int mem_size_DV_X = n_atoms_solute * sizeof(dvel_t); + int mem_size_PW_MAT = 3 * n_waters * n_patoms * sizeof(calc_pw_t); + + int n_blocks_p = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE; + int n_blocks_w = (3 * n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE; + + int mem_size_PW_Evdw = n_blocks_p * n_blocks_w * sizeof(double); + int mem_size_PW_Ecoul = n_blocks_p * n_blocks_w * sizeof(double); + + CudaContext& ctx = CudaContext::instance(); + auto X = ctx.d_coords; + auto DV_X = ctx.d_dvelocities; + auto D_ccharges = ctx.d_ccharges; + auto D_charges = ctx.d_charges; + auto D_catypes = ctx.d_catypes; + auto D_atypes = ctx.d_atypes; + auto D_patoms = ctx.d_p_atoms; + auto D_excluded = ctx.d_excluded; + + dim3 threads, grid; + + threads = dim3(BLOCK_SIZE, BLOCK_SIZE); + grid = dim3((3 * n_waters + BLOCK_SIZE - 1) / threads.x, (n_patoms + BLOCK_SIZE - 1) / threads.y); + + calc_pw_dvel_matrix<<>>(n_patoms, n_atoms_solute, n_waters, X, X + n_atoms_solute, D_PW_Evdw, D_PW_Ecoul, PW_MAT, + D_ccharges, D_charges, D_catypes, D_atypes, D_patoms, D_excluded, topo); + calc_pw_dvel_vector_column<<<((3 * n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_patoms, n_waters, DV_X, DV_X + 
n_atoms_solute, PW_MAT); + calc_pw_dvel_vector_row<<<((n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_patoms, n_waters, DV_X, DV_X + n_atoms_solute, PW_MAT, D_patoms); + +#ifdef DEBUG + cudaMemcpy(h_PW_MAT, PW_MAT, mem_size_PW_MAT, cudaMemcpyDeviceToHost); +#endif + +// for (int i = 0; i < n_waters; i++) { +// printf("X[%d] = %f %f %f\n", i, coords[i].x, coords[i].y, coords[i].z); +// } + +// printf("n_patoms = %d n_watoms = %d\n", n_patoms, 3 * n_waters); +#ifdef DEBUG + for (int i = 0; i < n_patoms; i++) { + for (int j = 0; j < 3 * n_waters; j++) { + if (h_PW_MAT[3 * i * n_waters + j].P.x > 100) + printf("PW_MAT[%d][%d].P = %f %f %f\n", i, j, h_PW_MAT[3 * i * n_waters + j].P.x, h_PW_MAT[3 * i * n_waters + j].P.y, h_PW_MAT[3 * i * n_waters + j].P.z); + } + } +#endif + + cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost); + cudaMemcpy(&dvelocities[n_atoms_solute], DV_X + n_atoms_solute, mem_size_DV_W, cudaMemcpyDeviceToHost); + + calc_energy_sum<<<1, threads>>>(n_blocks_p, n_blocks_w, D_PW_evdw_TOT, D_PW_ecoul_TOT, D_PW_Evdw, D_PW_Ecoul, false); + + cudaMemcpy(&PW_evdw_TOT, D_PW_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(&PW_ecoul_TOT, D_PW_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost); + + E_nonbond_pw.Uvdw += PW_evdw_TOT; + E_nonbond_pw.Ucoul += PW_ecoul_TOT; + + // for (int i = 0; i < n_atoms; i++) { + // printf("dvelocities[%d] = %f %f %f\n", i, dvelocities[i].x, dvelocities[i].y, dvelocities[i].z); + // } +} + +void init_nonbonded_pw_force_kernel_data() { + using namespace CudaNonbondedPWForce; + if (!is_initialized) { + int mem_size_PW_MAT = 3 * n_waters * n_patoms * sizeof(calc_pw_t); + + int n_blocks_p = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE; + int n_blocks_w = (3 * n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE; + int mem_size_PW_Evdw = n_blocks_p * n_blocks_w * sizeof(double); + int mem_size_PW_Ecoul = n_blocks_p * n_blocks_w * sizeof(double); +#ifdef DEBUG + printf("Allocating PW_MAT\n"); +#endif + check_cudaMalloc((void**)&PW_MAT, mem_size_PW_MAT); + +#ifdef DEBUG + printf("Allocating D_PW_Evdw\n"); +#endif + check_cudaMalloc((void**)&D_PW_Evdw, mem_size_PW_Evdw); +#ifdef DEBUG + printf("Allocating D_PW_Ecoul\n"); +#endif + check_cudaMalloc((void**)&D_PW_Ecoul, mem_size_PW_Ecoul); + + check_cudaMalloc((void**)&D_PW_evdw_TOT, sizeof(double)); + check_cudaMalloc((void**)&D_PW_ecoul_TOT, sizeof(double)); + +#ifdef DEBUG + printf("All GPU solvent memory allocated\n"); +#endif + + h_PW_MAT = (calc_pw_t*)malloc(mem_size_PW_MAT); + is_initialized = true; + } +} + +void cleanup_nonbonded_pw_force() { + using namespace CudaNonbondedPWForce; + if (is_initialized) { + cudaFree(PW_MAT); + cudaFree(D_PW_Evdw); + cudaFree(D_PW_Ecoul); + cudaFree(D_PW_evdw_TOT); + cudaFree(D_PW_ecoul_TOT); + free(h_PW_MAT); + is_initialized = false; + } +} \ No newline at end of file diff --git a/src/core/cuda/src/CudaNonbondedQPForce.cu b/src/core/cuda/src/CudaNonbondedQPForce.cu new file mode 100644 index 00000000..4eb53908 --- /dev/null +++ b/src/core/cuda/src/CudaNonbondedQPForce.cu @@ -0,0 +1,365 @@ +#include + +#include "cuda/include/CudaContext.cuh" +#include "cuda/include/CudaNonbondedQPForce.cuh" +#include "system.h" + +namespace CudaNonbondedQPForce { +struct calc_qw_t { + dvel_t Q; + dvel_t O; + dvel_t H1; + dvel_t H2; +}; + +struct calc_qp_t { + dvel_t Q; + dvel_t P; +}; +bool is_initialized = false; +double *D_QP_Evdw, *D_QP_Ecoul, *h_QP_Evdw, *h_QP_Ecoul; +calc_qp_t *QP_MAT, *h_QP_MAT; +double *D_QP_evdw_TOT, *D_QP_ecoul_TOT, QP_evdw_TOT, 
diff --git a/src/core/cuda/src/CudaNonbondedQPForce.cu b/src/core/cuda/src/CudaNonbondedQPForce.cu new file mode 100644 index 00000000..4eb53908 --- /dev/null +++ b/src/core/cuda/src/CudaNonbondedQPForce.cu @@ -0,0 +1,365 @@ +#include + +#include "cuda/include/CudaContext.cuh" +#include "cuda/include/CudaNonbondedQPForce.cuh" +#include "system.h" + +namespace CudaNonbondedQPForce { +struct calc_qp_t { + dvel_t Q; + dvel_t P; +}; +bool is_initialized = false; +double *D_QP_Evdw, *D_QP_Ecoul, *h_QP_Evdw, *h_QP_Ecoul; +calc_qp_t *QP_MAT, *h_QP_MAT; +double *D_QP_evdw_TOT, *D_QP_ecoul_TOT, QP_evdw_TOT, QP_ecoul_TOT; + +// General +__global__ void calc_energy_sum(int rows, int columns, double* Evdw_TOT, double* Ecoul_TOT, double* Evdw, double* Ecoul, bool upper_diagonal) { + int tx = threadIdx.x; + int ty = threadIdx.y; + + __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE]; + + // TODO: better way to distribute upper diagonal over threads? Seems like threads in left bottom corner have less work + double coul_TOT = 0; + double vdw_TOT = 0; + for (int i = ty; i < rows; i += BLOCK_SIZE) { + for (int j = tx; j < columns; j += BLOCK_SIZE) { + if (i <= j || !upper_diagonal) { + coul_TOT += Ecoul[i * columns + j]; + vdw_TOT += Evdw[i * columns + j]; + } + } + } + Ecoul_S[ty][tx] = coul_TOT; + Evdw_S[ty][tx] = vdw_TOT; + + __syncthreads(); + + if (tx == 0 && ty == 0) { + double Evdw_temp = 0; + double Ecoul_temp = 0; + + for (int i = 0; i < BLOCK_SIZE; i++) { + for (int j = 0; j < BLOCK_SIZE; j++) { + Evdw_temp += Evdw_S[i][j]; + Ecoul_temp += Ecoul_S[i][j]; + } + } + + *Evdw_TOT = Evdw_temp; + *Ecoul_TOT = Ecoul_temp; + } +} +__global__ void calc_qp_dvel_vector_row(int n_qatoms, int n_patoms, dvel_t* DV_X, calc_qp_t* QP_MAT, q_atom_t* D_qatoms) { + int row = blockIdx.x * blockDim.x + threadIdx.x; + if (row >= n_qatoms) return; + + dvel_t dQ; + + dQ.x = 0; + dQ.y = 0; + dQ.z = 0; + + for (int i = 0; i < n_patoms; i++) { + dQ.x += QP_MAT[i + n_patoms * row].Q.x; + dQ.y += QP_MAT[i + n_patoms * row].Q.y; + dQ.z += QP_MAT[i + n_patoms * row].Q.z; + } + + int q = D_qatoms[row].a - 1; + + DV_X[q].x += dQ.x; + DV_X[q].y += dQ.y; + DV_X[q].z += dQ.z; + + __syncthreads(); +} + +__global__ void calc_qp_dvel_vector_column(int n_qatoms, int n_patoms, dvel_t* DV_X, calc_qp_t* QP_MAT, p_atom_t* D_patoms) { + int column = blockIdx.x * blockDim.x + threadIdx.x; + if (column >= n_patoms) return; + + dvel_t dP; + + dP.x = 0; + dP.y = 0; + dP.z = 0; + + for (int i = 0; i < n_qatoms; i++) { + dP.x += QP_MAT[column + n_patoms * i].P.x; + dP.y += QP_MAT[column + n_patoms * i].P.y; + dP.z += QP_MAT[column + n_patoms * i].P.z; + } + + int p = D_patoms[column].a - 1; + + DV_X[p].x += dP.x; + DV_X[p].y += dP.y; + DV_X[p].z += dP.z; + + __syncthreads(); +} + +__device__ void calc_qp_dvel_matrix_incr(int row, int qi, int column, int pj, int n_lambdas, int n_qatoms, + coord_t* Qs, coord_t* Ps, int* LJs, bool* excluded_s, double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE], double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE], calc_qp_t* qp, + q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, p_atom_t* D_patoms, q_atom_t* D_qatoms, double* D_lambdas, + catype_t* D_catypes, atype_t* D_atypes, ccharge_t* D_ccharges, charge_t* D_charges, topo_t D_topo) { + coord_t da; + double r2, r6, r; + double ai_aii, aj_aii, ai_bii, aj_bii; + q_catype_t qi_type; + catype_t aj_type; + bool bond23, bond14; + double scaling, Vel, V_a, V_b, dv; + + int j = D_patoms[pj].a - 1; + + bond23 = LJs[row * BLOCK_SIZE + column] == 3; + bond14 = LJs[row * BLOCK_SIZE + column] == 1; + + if (bond23) return; + if (excluded_s[row] || excluded_s[BLOCK_SIZE + column]) return; + + scaling = bond14 ?
D_topo.el14_scale : 1; + + da.x = Qs[row].x - Ps[column].x; + da.y = Qs[row].y - Ps[column].y; + da.z = Qs[row].z - Ps[column].z; + + r2 = pow(da.x, 2) + pow(da.y, 2) + pow(da.z, 2); + + r6 = r2 * r2 * r2; + r2 = 1 / r2; + r = sqrt(r2); + + for (int state = 0; state < n_lambdas; state++) { + qi_type = D_qcatypes[D_qatypes[qi + n_qatoms * state].code - 1]; + aj_type = D_catypes[D_atypes[j].code - 1]; + + ai_aii = bond14 ? qi_type.Ai_14 : qi_type.Ai; + aj_aii = bond14 ? aj_type.aii_1_4 : aj_type.aii_normal; + ai_bii = bond14 ? qi_type.Bi_14 : qi_type.Bi; + aj_bii = bond14 ? aj_type.bii_1_4 : aj_type.bii_normal; + + Vel = D_topo.coulomb_constant * scaling * D_qcharges[qi + n_qatoms * state].q * D_ccharges[D_charges[j].code - 1].charge * r; + V_a = ai_aii * aj_aii / (r6 * r6); + V_b = ai_bii * aj_bii / r6; + dv = r2 * (-Vel - (12 * V_a - 6 * V_b)) * D_lambdas[state]; + + // if (state == 0 && qi == 0 && pj == 1) { + // printf("crg_q = %f crg_j = %f r = %f\n", D_qcharges[qi + n_qatoms * state].q, D_ccharges[D_charges[pj].code - 1].charge, r); + // printf("ai_aii = %f aj_aii = %f ai_bii = %f aj_bii = %f\n", ai_aii, aj_aii, ai_bii, aj_bii); + // } + + // Update forces + qp->Q.x += dv * da.x; + qp->Q.y += dv * da.y; + qp->Q.z += dv * da.z; + qp->P.x -= dv * da.x; + qp->P.y -= dv * da.y; + qp->P.z -= dv * da.z; + + // Update Q totals + Ecoul_S[row][state * BLOCK_SIZE + column] += Vel; + Evdw_S[row][state * BLOCK_SIZE + column] += (V_a - V_b); + } +} + +__global__ void calc_qp_dvel_matrix(int n_qatoms, int n_patoms, int n_lambdas, int n_atoms_solute, + coord_t* X, double* Evdw, double* Ecoul, calc_qp_t* QP_MAT, + q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, p_atom_t* D_patoms, q_atom_t* D_qatoms, double* D_lambdas, + int* D_LJ_matrix, bool* D_excluded, catype_t* D_catypes, atype_t* D_atypes, ccharge_t* D_ccharges, charge_t* D_charges, topo_t D_topo) { + // Block index + int bx = blockIdx.x; + int by = blockIdx.y; + + // Thread index + int tx = threadIdx.x; + int ty = threadIdx.y; + + // TODO implement >2 states on GPU + __shared__ double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE]; + __shared__ double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE]; + + Ecoul_S[ty][tx] = 0; + Evdw_S[ty][tx] = 0; + Ecoul_S[ty][tx + BLOCK_SIZE] = 0; + Evdw_S[ty][tx + BLOCK_SIZE] = 0; + + int aStart = BLOCK_SIZE * by; + int bStart = BLOCK_SIZE * bx; + + if (aStart + ty >= n_qatoms) return; + if (bStart + tx >= n_patoms) return; + + int qi = D_qatoms[aStart + ty].a - 1; + int pj = D_patoms[bStart + tx].a - 1; + + __shared__ coord_t Qs[BLOCK_SIZE]; + __shared__ coord_t Ps[BLOCK_SIZE]; + __shared__ int LJs[BLOCK_SIZE * BLOCK_SIZE]; + __shared__ bool excluded_s[2 * BLOCK_SIZE]; + + if (tx == 0) { + Qs[ty] = X[qi]; + excluded_s[ty] = D_excluded[qi]; + } + + if (ty == 0) { + Ps[tx] = X[pj]; + excluded_s[BLOCK_SIZE + tx] = D_excluded[pj]; + } + LJs[ty * BLOCK_SIZE + tx] = D_LJ_matrix[qi * n_atoms_solute + pj]; + + __syncthreads(); + + calc_qp_t qp; + memset(&qp, 0, sizeof(calc_qp_t)); + + int row = by * BLOCK_SIZE + ty; + int column = bx * BLOCK_SIZE + tx; + + calc_qp_dvel_matrix_incr(ty, row, tx, column, n_lambdas, n_qatoms, Qs, Ps, LJs, excluded_s, Evdw_S, Ecoul_S, + &qp, D_qcatypes, D_qatypes, D_qcharges, D_patoms, D_qatoms, D_lambdas, D_catypes, D_atypes, D_ccharges, D_charges, D_topo); + + QP_MAT[n_patoms * row + column] = qp; + + __syncthreads(); + + int rowlen = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE; + int collen = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE; + + if (tx == 0 && ty == 0) { + // TODO implement >2 
states on GPU + for (int state = 0; state < min(2, n_lambdas); state++) { + double tot_Evdw = 0; + double tot_Ecoul = 0; + for (int i = 0; i < BLOCK_SIZE; i++) { + for (int j = 0; j < BLOCK_SIZE; j++) { + tot_Evdw += Evdw_S[i][j + state * BLOCK_SIZE]; + tot_Ecoul += Ecoul_S[i][j + state * BLOCK_SIZE]; + } + } + Evdw[rowlen * collen * state + rowlen * by + bx] = tot_Evdw; + Ecoul[rowlen * collen * state + rowlen * by + bx] = tot_Ecoul; + } + } + + __syncthreads(); +} + +} // namespace CudaNonbondedQPForce + +void calc_nonbonded_qp_forces_host_v2() { + using namespace CudaNonbondedQPForce; + + int n_blocks_q = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE; + int n_blocks_p = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE; + + CudaContext& ctx = CudaContext::instance(); + auto X = ctx.d_coords; + auto DV_X = ctx.d_dvelocities; + auto D_qcatypes = ctx.d_q_catypes; + auto D_qatypes = ctx.d_q_atypes; + auto D_qcharges = ctx.d_q_charges; + auto D_patoms = ctx.d_p_atoms; + auto D_qatoms = ctx.d_q_atoms; + auto D_lambdas = ctx.d_lambdas; + auto D_LJ_matrix = ctx.d_LJ_matrix; + auto D_excluded = ctx.d_excluded; + auto D_catypes = ctx.d_catypes; + auto D_atypes = ctx.d_atypes; + auto D_ccharges = ctx.d_ccharges; + auto D_charges = ctx.d_charges; + + dim3 threads, grid; + + threads = dim3(BLOCK_SIZE, BLOCK_SIZE); + grid = dim3((n_patoms + BLOCK_SIZE - 1) / threads.x, (n_qatoms + BLOCK_SIZE - 1) / threads.y); + + calc_qp_dvel_matrix<<<grid, threads>>>(n_qatoms, n_patoms, n_lambdas, n_atoms_solute, X, D_QP_Evdw, D_QP_Ecoul, QP_MAT, + D_qcatypes, D_qatypes, D_qcharges, D_patoms, D_qatoms, D_lambdas, D_LJ_matrix, D_excluded, + D_catypes, D_atypes, D_ccharges, D_charges, topo); + calc_qp_dvel_vector_column<<<((n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_patoms, DV_X, QP_MAT, D_patoms); + calc_qp_dvel_vector_row<<<((n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_patoms, DV_X, QP_MAT, D_qatoms); + +#ifdef DEBUG + int mem_size_QP_MAT = n_qatoms * n_patoms * sizeof(calc_qp_t); + cudaMemcpy(h_QP_MAT, QP_MAT, mem_size_QP_MAT, cudaMemcpyDeviceToHost); +#endif + +#ifdef DEBUG + for (int i = 0; i < n_qatoms; i++) { + for (int j = 0; j < n_patoms; j++) { + if (i == 0) + // if (h_QP_MAT[i * n_patoms + j].Q.x > 100) + printf("QP_MAT[%d][%d].Q = %f %f %f\n", i, j, h_QP_MAT[i * n_patoms + j].Q.x, h_QP_MAT[i * n_patoms + j].Q.y, h_QP_MAT[i * n_patoms + j].Q.z); + } + } +#endif + int mem_size_DV_X = n_atoms * sizeof(dvel_t); + cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost); + + // TODO: make Evdw & Ecoul work for # of states > 2 + for (int state = 0; state < min(2, n_lambdas); state++) { + calc_energy_sum<<<1, threads>>>(n_blocks_q, n_blocks_p, D_QP_evdw_TOT, D_QP_ecoul_TOT, &D_QP_Evdw[state * n_blocks_p * n_blocks_q], &D_QP_Ecoul[state * n_blocks_p * n_blocks_q], false); + + cudaMemcpy(&QP_evdw_TOT, D_QP_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(&QP_ecoul_TOT, D_QP_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost); + + EQ_nonbond_qp[state].Uvdw += QP_evdw_TOT; + EQ_nonbond_qp[state].Ucoul += QP_ecoul_TOT; + } +} + +void init_nonbonded_qp_force_kernel_data() { + using namespace CudaNonbondedQPForce; + + if (!is_initialized) { + int n_blocks_q = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE; + int n_blocks_p = (n_patoms + BLOCK_SIZE - 1) / BLOCK_SIZE; + // TODO make Evdw & Ecoul work for # of states > 2 + int mem_size_QP_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_p * sizeof(double); + int mem_size_QP_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_p * sizeof(double); + int mem_size_QP_MAT = n_qatoms * n_patoms *
sizeof(calc_qp_t); + + check_cudaMalloc((void**)&D_QP_Evdw, mem_size_QP_Evdw); + check_cudaMalloc((void**)&D_QP_Ecoul, mem_size_QP_Ecoul); + h_QP_Evdw = (double*)malloc(mem_size_QP_Evdw); + h_QP_Ecoul = (double*)malloc(mem_size_QP_Ecoul); + + check_cudaMalloc((void**)&QP_MAT, mem_size_QP_MAT); + h_QP_MAT = (calc_qp_t*)malloc(mem_size_QP_MAT); + + check_cudaMalloc((void**)&D_QP_evdw_TOT, sizeof(double)); + check_cudaMalloc((void**)&D_QP_ecoul_TOT, sizeof(double)); + + is_initialized = true; + } +} + +void cleanup_nonbonded_qp_force() { + using namespace CudaNonbondedQPForce; + + if (is_initialized) { + cudaFree(D_QP_Evdw); + cudaFree(D_QP_Ecoul); + free(h_QP_Evdw); + free(h_QP_Ecoul); + cudaFree(QP_MAT); + free(h_QP_MAT); + cudaFree(D_QP_evdw_TOT); + cudaFree(D_QP_ecoul_TOT); + + is_initialized = false; + } +} \ No newline at end of file diff --git a/src/core/cuda/src/CudaNonbondedQQForce.cu b/src/core/cuda/src/CudaNonbondedQQForce.cu index ed2fc54e..e2138ed6 100644 --- a/src/core/cuda/src/CudaNonbondedQQForce.cu +++ b/src/core/cuda/src/CudaNonbondedQQForce.cu @@ -1,19 +1,6 @@ +#include "cuda/include/CudaContext.cuh" #include "cuda/include/CudaNonbondedQQForce.cuh" #include "utils.h" -namespace CudaNonbondedQQForce { -bool is_initialized = false; -q_atom_t* d_q_atoms = nullptr; -q_charge_t* d_q_charges = nullptr; -int* d_LJ_matrix = nullptr; -bool* d_excluded = nullptr; -q_elscale_t* d_q_elscales = nullptr; -q_catype_t* d_q_catypes = nullptr; -q_atype_t* d_q_atypes = nullptr; -coord_t* d_coords = nullptr; -E_nonbonded_t* d_EQ_nonbond_qq = nullptr; -dvel_t* d_dvelocities = nullptr; -double* d_lambdas = nullptr; -} // namespace CudaNonbondedQQForce __global__ void calc_nonbonded_qq_forces_kernel( q_atom_t* q_atoms, @@ -107,34 +94,20 @@ __global__ void calc_nonbonded_qq_forces_kernel( } void calc_nonbonded_qq_forces_host() { - using namespace CudaNonbondedQQForce; - if (!is_initialized) { - check_cudaMalloc((void**)&d_q_atoms, sizeof(q_atom_t) * n_qatoms); - check_cudaMalloc((void**)&d_q_charges, sizeof(q_charge_t) * n_qatoms * n_lambdas); - check_cudaMalloc((void**)&d_LJ_matrix, sizeof(int) * n_atoms_solute * n_atoms_solute); - check_cudaMalloc((void**)&d_excluded, sizeof(bool) * n_atoms); - check_cudaMalloc((void**)&d_q_elscales, sizeof(q_elscale_t) * n_qelscales); - check_cudaMalloc((void**)&d_q_catypes, sizeof(q_catype_t) * n_qcatypes); - check_cudaMalloc((void**)&d_q_atypes, sizeof(q_atype_t) * n_qatoms * n_lambdas); - check_cudaMalloc((void**)&d_coords, sizeof(coord_t) * n_atoms); - check_cudaMalloc((void**)&d_EQ_nonbond_qq, sizeof(E_nonbonded_t) * n_lambdas); - check_cudaMalloc((void**)&d_dvelocities, sizeof(dvel_t) * n_atoms); - check_cudaMalloc((void**)&d_lambdas, sizeof(double) * n_lambdas); - - cudaMemcpy(d_q_atoms, q_atoms, sizeof(q_atom_t) * n_qatoms, cudaMemcpyHostToDevice); - cudaMemcpy(d_q_charges, q_charges, sizeof(q_charge_t) * n_qatoms * n_lambdas, cudaMemcpyHostToDevice); - cudaMemcpy(d_LJ_matrix, LJ_matrix, sizeof(int) * n_atoms_solute * n_atoms_solute, cudaMemcpyHostToDevice); - cudaMemcpy(d_excluded, excluded, sizeof(bool) * n_atoms, cudaMemcpyHostToDevice); - cudaMemcpy(d_q_elscales, q_elscales, sizeof(q_elscale_t) * n_qelscales, cudaMemcpyHostToDevice); - cudaMemcpy(d_q_catypes, q_catypes, sizeof(q_catype_t) * n_qcatypes, cudaMemcpyHostToDevice); - cudaMemcpy(d_q_atypes, q_atypes, sizeof(q_atype_t) * n_qatoms * n_lambdas, cudaMemcpyHostToDevice); - cudaMemcpy(d_lambdas, lambdas, sizeof(double) * n_lambdas, cudaMemcpyHostToDevice); - - is_initialized = true; - } - 
cudaMemcpy(d_coords, coords, sizeof(coord_t) * n_atoms, cudaMemcpyHostToDevice); - cudaMemcpy(d_dvelocities, dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyHostToDevice); - cudaMemcpy(d_EQ_nonbond_qq, EQ_nonbond_qq, sizeof(E_nonbonded_t) * n_lambdas, cudaMemcpyHostToDevice); + CudaContext& ctx = CudaContext::instance(); + // ctx.sync_all_to_device(); + auto d_q_atoms = ctx.d_q_atoms; + auto d_q_charges = ctx.d_q_charges; + auto d_LJ_matrix = ctx.d_LJ_matrix; + auto d_excluded = ctx.d_excluded; + auto d_q_elscales = ctx.d_q_elscales; + auto d_q_catypes = ctx.d_q_catypes; + auto d_q_atypes = ctx.d_q_atypes; + auto d_coords = ctx.d_coords; + auto d_EQ_nonbond_qq = ctx.d_EQ_nonbond_qq; + auto d_dvelocities = ctx.d_dvelocities; + auto d_lambdas = ctx.d_lambdas; + int blockSize = 256; int numBlocks = (n_qatoms + blockSize - 1) / blockSize; calc_nonbonded_qq_forces_kernel<<<numBlocks, blockSize>>>( @@ -157,20 +130,6 @@ void calc_nonbonded_qq_forces_host() { cudaMemcpy(EQ_nonbond_qq, d_EQ_nonbond_qq, sizeof(E_nonbonded_t) * n_lambdas, cudaMemcpyDeviceToHost); cudaMemcpy(dvelocities, d_dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyDeviceToHost); } -void cleanup_nonbonded_qq_force() { - using namespace CudaNonbondedQQForce; - if (is_initialized) { - cudaFree(d_q_atoms); - cudaFree(d_q_charges); - cudaFree(d_LJ_matrix); - cudaFree(d_excluded); - cudaFree(d_q_elscales); - cudaFree(d_q_catypes); - cudaFree(d_q_atypes); - cudaFree(d_coords); - cudaFree(d_EQ_nonbond_qq); - cudaFree(d_dvelocities); - cudaFree(d_lambdas); - is_initialized = false; - } -} + +void init_nonbonded_qq_force_kernel_data() {} +void cleanup_nonbonded_qq_force() {} \ No newline at end of file diff --git a/src/core/cuda/src/CudaNonbondedQWForce.cu b/src/core/cuda/src/CudaNonbondedQWForce.cu new file mode 100644 index 00000000..974c380c --- /dev/null +++ b/src/core/cuda/src/CudaNonbondedQWForce.cu @@ -0,0 +1,404 @@ +#include + +#include "cuda/include/CudaContext.cuh" +#include "cuda/include/CudaNonbondedQWForce.cuh" +#include "utils.h" + +namespace CudaNonbondedQWForce { +struct calc_qw_t { + dvel_t Q; + dvel_t O; + dvel_t H1; + dvel_t H2; +}; +bool is_initialized = false; +calc_qw_t *QW_MAT, *h_QW_MAT; + +double *D_QW_Evdw, *D_QW_Ecoul, *h_QW_Evdw, *h_QW_Ecoul; +double *D_QW_evdw_TOT, *D_QW_ecoul_TOT, QW_evdw_TOT, QW_ecoul_TOT; + +// General +__global__ void calc_energy_sum(int rows, int columns, double* Evdw_TOT, double* Ecoul_TOT, double* Evdw, double* Ecoul, bool upper_diagonal) { + int tx = threadIdx.x; + int ty = threadIdx.y; + + __shared__ double Ecoul_S[BLOCK_SIZE][BLOCK_SIZE]; + __shared__ double Evdw_S[BLOCK_SIZE][BLOCK_SIZE]; + + // TODO: better way to distribute upper diagonal over threads?
Seems like threads in left bottom corner have less work + double coul_TOT = 0; + double vdw_TOT = 0; + for (int i = ty; i < rows; i += BLOCK_SIZE) { + for (int j = tx; j < columns; j += BLOCK_SIZE) { + if (i <= j || !upper_diagonal) { + coul_TOT += Ecoul[i * columns + j]; + vdw_TOT += Evdw[i * columns + j]; + } + } + } + Ecoul_S[ty][tx] = coul_TOT; + Evdw_S[ty][tx] = vdw_TOT; + + __syncthreads(); + + if (tx == 0 && ty == 0) { + double Evdw_temp = 0; + double Ecoul_temp = 0; + + for (int i = 0; i < BLOCK_SIZE; i++) { + for (int j = 0; j < BLOCK_SIZE; j++) { + Evdw_temp += Evdw_S[i][j]; + Ecoul_temp += Ecoul_S[i][j]; + } + } + + *Evdw_TOT = Evdw_temp; + *Ecoul_TOT = Ecoul_temp; + } +} + +__global__ void calc_qw_dvel_vector_row(int n_qatoms, int n_waters, dvel_t* DV_X, dvel_t* DV_W, calc_qw_t* MAT, q_atom_t* D_qatoms) { + int row = blockIdx.x * blockDim.x + threadIdx.x; + if (row >= n_qatoms) return; + + dvel_t dQ; + + dQ.x = 0; + dQ.y = 0; + dQ.z = 0; + + for (int i = 0; i < n_waters; i++) { + dQ.x += MAT[i + n_waters * row].Q.x; + dQ.y += MAT[i + n_waters * row].Q.y; + dQ.z += MAT[i + n_waters * row].Q.z; + } + + int q = D_qatoms[row].a - 1; + + DV_X[q].x += dQ.x; + DV_X[q].y += dQ.y; + DV_X[q].z += dQ.z; + + __syncthreads(); +} + +__global__ void calc_qw_dvel_vector_column(int n_qatoms, int n_waters, dvel_t* DV_X, dvel_t* DV_W, calc_qw_t* MAT) { + int column = blockIdx.x * blockDim.x + threadIdx.x; + if (column >= n_waters) return; + + dvel_t dO, dH1, dH2; + + dO.x = 0; + dO.y = 0; + dO.z = 0; + dH1.x = 0; + dH1.y = 0; + dH1.z = 0; + dH2.x = 0; + dH2.y = 0; + dH2.z = 0; + + for (int i = 0; i < n_qatoms; i++) { + dO.x += MAT[column + n_waters * i].O.x; + dO.y += MAT[column + n_waters * i].O.y; + dO.z += MAT[column + n_waters * i].O.z; + dH1.x += MAT[column + n_waters * i].H1.x; + dH1.y += MAT[column + n_waters * i].H1.y; + dH1.z += MAT[column + n_waters * i].H1.z; + dH2.x += MAT[column + n_waters * i].H2.x; + dH2.y += MAT[column + n_waters * i].H2.y; + dH2.z += MAT[column + n_waters * i].H2.z; + } + + DV_W[3 * column].x += dO.x; + DV_W[3 * column].y += dO.y; + DV_W[3 * column].z += dO.z; + DV_W[3 * column + 1].x += dH1.x; + DV_W[3 * column + 1].y += dH1.y; + DV_W[3 * column + 1].z += dH1.z; + DV_W[3 * column + 2].x += dH2.x; + DV_W[3 * column + 2].y += dH2.y; + DV_W[3 * column + 2].z += dH2.z; + + __syncthreads(); +} + +__device__ void calc_qw_dvel_matrix_incr(int row, int qi, int column, int n_lambdas, int n_qatoms, double crg_ow, double crg_hw, double A_O, double B_O, + coord_t* Qs, coord_t* Ws, double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE], double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE], calc_qw_t* qw, + q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, q_atom_t* D_qatoms, double* D_lambdas, topo_t D_topo) { + int j; + coord_t dO, dH1, dH2; + double r2O, rH1, rH2, r6O, rO, r2H1, r2H2; + double dvO, dvH1, dvH2; + double V_a, V_b, VelO, VelH1, VelH2; + q_atype_t qa_type; + q_catype_t qi_type; + double ai_aii, ai_bii; + + j = 3 * column; + dO.x = Ws[j].x - Qs[row].x; + dO.y = Ws[j].y - Qs[row].y; + dO.z = Ws[j].z - Qs[row].z; + dH1.x = Ws[j + 1].x - Qs[row].x; + dH1.y = Ws[j + 1].y - Qs[row].y; + dH1.z = Ws[j + 1].z - Qs[row].z; + dH2.x = Ws[j + 2].x - Qs[row].x; + dH2.y = Ws[j + 2].y - Qs[row].y; + dH2.z = Ws[j + 2].z - Qs[row].z; + + r2O = pow(dO.x, 2) + pow(dO.y, 2) + pow(dO.z, 2); + rH1 = sqrt(1.0 / (pow(dH1.x, 2) + pow(dH1.y, 2) + pow(dH1.z, 2))); + rH2 = sqrt(1.0 / (pow(dH2.x, 2) + pow(dH2.y, 2) + pow(dH2.z, 2))); + r6O = r2O * r2O * r2O; + r2O = 1.0 / r2O; + rO = 
sqrt(r2O); + r2H1 = rH1 * rH1; + r2H2 = rH2 * rH2; + + // Reset potential + dvO = 0; + dvH1 = 0; + dvH2 = 0; + + for (int state = 0; state < n_lambdas; state++) { + qa_type = D_qatypes[qi + n_qatoms * state]; + qi_type = D_qcatypes[qa_type.code - 1]; + + ai_aii = qi_type.Ai; + ai_bii = qi_type.Bi; + + V_a = ai_aii * A_O / (r6O * r6O); + V_b = ai_bii * B_O / (r6O); + + VelO = D_topo.coulomb_constant * crg_ow * D_qcharges[qi + n_qatoms * state].q * rO; + VelH1 = D_topo.coulomb_constant * crg_hw * D_qcharges[qi + n_qatoms * state].q * rH1; + VelH2 = D_topo.coulomb_constant * crg_hw * D_qcharges[qi + n_qatoms * state].q * rH2; + + dvO += r2O * (-VelO - (12 * V_a - 6 * V_b)) * D_lambdas[state]; + dvH1 -= r2H1 * VelH1 * D_lambdas[state]; + dvH2 -= r2H2 * VelH2 * D_lambdas[state]; + + // Update Q totals + Ecoul_S[row][state * BLOCK_SIZE + column] += (VelO + VelH1 + VelH2); + Evdw_S[row][state * BLOCK_SIZE + column] += (V_a - V_b); + } + + // Note r6O is not the usual 1/rO^6, but rather rO^6. be careful!!! + + // Update forces on Q-atom + (*qw).Q.x -= (dvO * dO.x + dvH1 * dH1.x + dvH2 * dH2.x); + (*qw).Q.y -= (dvO * dO.y + dvH1 * dH1.y + dvH2 * dH2.y); + (*qw).Q.z -= (dvO * dO.z + dvH1 * dH1.z + dvH2 * dH2.z); + + // Update forces on water + (*qw).O.x += dvO * dO.x; + (*qw).O.y += dvO * dO.y; + (*qw).O.z += dvO * dO.z; + (*qw).H1.x += dvH1 * dH1.x; + (*qw).H1.y += dvH1 * dH1.y; + (*qw).H1.z += dvH1 * dH1.z; + (*qw).H2.x += dvH2 * dH2.x; + (*qw).H2.y += dvH2 * dH2.y; + (*qw).H2.z += dvH2 * dH2.z; +} + +__global__ void calc_qw_dvel_matrix(int n_qatoms, int n_waters, int n_lambdas, double crg_ow, double crg_hw, double A_O, double B_O, + coord_t* X, coord_t* W, double* Evdw, double* Ecoul, calc_qw_t* MAT, + q_catype_t* D_qcatypes, q_atype_t* D_qatypes, q_charge_t* D_qcharges, q_atom_t* D_qatoms, double* D_lambdas, topo_t D_topo) { + // Block index + int bx = blockIdx.x; + int by = blockIdx.y; + + // Thread index + int tx = threadIdx.x; + int ty = threadIdx.y; + + // TODO implement >2 states on GPU + __shared__ double Evdw_S[BLOCK_SIZE][2 * BLOCK_SIZE]; + __shared__ double Ecoul_S[BLOCK_SIZE][2 * BLOCK_SIZE]; + + Ecoul_S[ty][tx] = 0; + Evdw_S[ty][tx] = 0; + Ecoul_S[ty][tx + BLOCK_SIZE] = 0; + Evdw_S[ty][tx + BLOCK_SIZE] = 0; + + int aStart = BLOCK_SIZE * by; + int bStart = 3 * BLOCK_SIZE * bx; + + if (aStart + ty >= n_qatoms) return; + if (bStart + 3 * tx >= 3 * n_waters) return; + + __shared__ coord_t Qs[BLOCK_SIZE]; + __shared__ coord_t Ws[3 * BLOCK_SIZE]; + + if (tx == 0) { + Qs[ty] = X[D_qatoms[aStart + ty].a - 1]; + } + + if (ty == 0) { + Ws[3 * tx] = W[bStart + 3 * tx]; + Ws[3 * tx + 1] = W[bStart + 3 * tx + 1]; + Ws[3 * tx + 2] = W[bStart + 3 * tx + 2]; + } + + __syncthreads(); + + calc_qw_t qw; + memset(&qw, 0, sizeof(calc_qw_t)); + + int row = by * BLOCK_SIZE + ty; + int column = bx * BLOCK_SIZE + tx; + + calc_qw_dvel_matrix_incr(ty, aStart + ty, tx, n_lambdas, n_qatoms, crg_ow, crg_hw, A_O, B_O, Qs, Ws, Evdw_S, Ecoul_S, + &qw, D_qcatypes, D_qatypes, D_qcharges, D_qatoms, D_lambdas, D_topo); + + MAT[column + n_waters * row] = qw; + + __syncthreads(); + + int rowlen = (n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE; + int collen = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE; + + if (tx == 0 && ty == 0) { + // TODO implement >2 states on GPU + for (int state = 0; state < min(2, n_lambdas); state++) { + double tot_Evdw = 0; + double tot_Ecoul = 0; + for (int i = 0; i < BLOCK_SIZE; i++) { + for (int j = 0; j < BLOCK_SIZE; j++) { + tot_Evdw += Evdw_S[i][j + state * BLOCK_SIZE]; + tot_Ecoul += Ecoul_S[i][j + 
state * BLOCK_SIZE]; + } + } + Evdw[rowlen * collen * state + rowlen * by + bx] = tot_Evdw; + Ecoul[rowlen * collen * state + rowlen * by + bx] = tot_Ecoul; + } + } + + __syncthreads(); +} +} // namespace CudaNonbondedQWForce + +void calc_nonbonded_qw_forces_host_v2() { + using namespace CudaNonbondedQWForce; + + int mem_size_X = n_atoms_solute * sizeof(coord_t); + int mem_size_W = 3 * n_waters * sizeof(coord_t); + int mem_size_DV_X = n_atoms_solute * sizeof(dvel_t); + int mem_size_DV_W = 3 * n_waters * sizeof(dvel_t); + int mem_size_MAT = 3 * n_waters * n_qatoms * sizeof(calc_qw_t); + + int n_blocks_q = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE; + int n_blocks_w = (n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE; + // TODO make Evdw & Ecoul work for # of states > 2 + int mem_size_QW_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double); + int mem_size_QW_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double); + + CudaContext& ctx = CudaContext::instance(); + auto X = ctx.d_coords; + auto DV_X = ctx.d_dvelocities; + auto D_qcatypes = ctx.d_q_catypes; + auto D_qatypes = ctx.d_q_atypes; + auto D_qcharges = ctx.d_q_charges; + auto D_qatoms = ctx.d_q_atoms; + auto D_lambdas = ctx.d_lambdas; + + dim3 threads, grid; + + threads = dim3(BLOCK_SIZE, BLOCK_SIZE); + grid = dim3((n_waters + BLOCK_SIZE - 1) / threads.x, (n_qatoms + BLOCK_SIZE - 1) / threads.y); + + calc_qw_dvel_matrix<<<grid, threads>>>(n_qatoms, n_waters, n_lambdas, crg_ow, crg_hw, A_O, B_O, X, X + n_atoms_solute, D_QW_Evdw, D_QW_Ecoul, + QW_MAT, D_qcatypes, D_qatypes, D_qcharges, D_qatoms, D_lambdas, topo); + calc_qw_dvel_vector_column<<<((n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_waters, DV_X, DV_X + n_atoms_solute, QW_MAT); + calc_qw_dvel_vector_row<<<((n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE), BLOCK_SIZE>>>(n_qatoms, n_waters, DV_X, DV_X + n_atoms_solute, QW_MAT, D_qatoms); + +#ifdef DEBUG + cudaMemcpy(h_QW_MAT, QW_MAT, mem_size_MAT, cudaMemcpyDeviceToHost); +#endif + +#ifdef DEBUG + for (int i = 0; i < n_qatoms; i++) { + for (int j = 0; j < 3 * n_waters; j++) { + if (h_QW_MAT[3 * i * n_waters + j].Q.x > 100) + printf("QW_MAT[%d][%d].Q = %f %f %f\n", i, j, h_QW_MAT[i * 3 * n_waters + j].Q.x, h_QW_MAT[i * 3 * n_waters + j].Q.y, h_QW_MAT[i * 3 * n_waters + j].Q.z); + } + } +#endif + + cudaMemcpy(dvelocities, DV_X, mem_size_DV_X, cudaMemcpyDeviceToHost); + cudaMemcpy(&dvelocities[n_atoms_solute], DV_X + n_atoms_solute, mem_size_DV_W, cudaMemcpyDeviceToHost); + + // TODO make Evdw & Ecoul work for # of states > 2 + for (int state = 0; state < min(2, n_lambdas); state++) { + calc_energy_sum<<<1, threads>>>(n_blocks_q, n_blocks_w, D_QW_evdw_TOT, D_QW_ecoul_TOT, &D_QW_Evdw[state * n_blocks_w * n_blocks_q], + &D_QW_Ecoul[state * n_blocks_w * n_blocks_q], false); + + cudaMemcpy(&QW_evdw_TOT, D_QW_evdw_TOT, sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(&QW_ecoul_TOT, D_QW_ecoul_TOT, sizeof(double), cudaMemcpyDeviceToHost); + + EQ_nonbond_qw[state].Uvdw += QW_evdw_TOT; + EQ_nonbond_qw[state].Ucoul += QW_ecoul_TOT; + } +} + +void init_nonbonded_qw_force_kernel_data() { + using namespace CudaNonbondedQWForce; + if (!is_initialized) { + catype_t catype_ow; // Atom type of first O atom + + catype_ow = catypes[atypes[n_atoms_solute].code - 1]; + + A_O = catype_ow.aii_normal; + B_O = catype_ow.bii_normal; + + int n_blocks_q = (n_qatoms + BLOCK_SIZE - 1) / BLOCK_SIZE; + int n_blocks_w = (n_waters + BLOCK_SIZE - 1) / BLOCK_SIZE; + int mem_size_MAT = 3 * n_waters * n_qatoms *
sizeof(calc_qw_t); + + int mem_size_QW_Evdw = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double); + int mem_size_QW_Ecoul = min(n_lambdas, 2) * n_blocks_q * n_blocks_w * sizeof(double); +#ifdef DEBUG + printf("Allocating QW_MAT\n"); +#endif + check_cudaMalloc((void**)&QW_MAT, mem_size_MAT); + +#ifdef DEBUG + printf("Allocating D_QW_Evdw\n"); +#endif + check_cudaMalloc((void**)&D_QW_Evdw, mem_size_QW_Evdw); +#ifdef DEBUG + printf("Allocating D_QW_Ecoul\n"); +#endif + check_cudaMalloc((void**)&D_QW_Ecoul, mem_size_QW_Ecoul); + + check_cudaMalloc((void**)&D_QW_evdw_TOT, sizeof(double)); + check_cudaMalloc((void**)&D_QW_ecoul_TOT, sizeof(double)); + + h_QW_Evdw = (double*)malloc(mem_size_QW_Evdw); + h_QW_Ecoul = (double*)malloc(mem_size_QW_Ecoul); + + h_QW_MAT = (calc_qw_t*)malloc(mem_size_MAT); + is_initialized = true; + } +} + +void cleanup_nonbonded_qw_force() { + using namespace CudaNonbondedQWForce; + + if (is_initialized) { + cudaFree(QW_MAT); + cudaFree(D_QW_Evdw); + cudaFree(D_QW_Ecoul); + cudaFree(D_QW_evdw_TOT); + cudaFree(D_QW_ecoul_TOT); + + free(h_QW_Evdw); + free(h_QW_Ecoul); + free(h_QW_MAT); + + is_initialized = false; + } +} \ No newline at end of file diff --git a/src/core/cuda/src/CudaNonbondedWWForce.cu b/src/core/cuda/src/CudaNonbondedWWForce.cu new file mode 100644 index 00000000..84b6652c --- /dev/null +++ b/src/core/cuda/src/CudaNonbondedWWForce.cu @@ -0,0 +1,321 @@ +#include "cuda/include/CudaContext.cuh" +#include "cuda/include/CudaNonbondedWWForce.cuh" +namespace CudaNonbondedWWForce { + +bool is_initialized = false; +double *D_WW_evdw_TOT, *D_WW_ecoul_TOT, WW_evdw_TOT, WW_ecoul_TOT; + +__device__ __forceinline__ void calculate_unforce_bound( + const int y, const int x, const coord_t& q, const coord_t& p, + const topo_t& D_topo, const double& crg_ow, const double& crg_hw, + const double& A_OO, const double& B_OO, double& evdw, double& ecoul, + double& dv, double& tmpx, double& tmpy, double& tmpz) { + int belong_y = y / 3; + int belong_x = x / 3; + if (belong_y == belong_x) { + return; + } + + bool y_is_o = (y % 3 == 0); + bool x_is_o = (x % 3 == 0); + + // Compute distance components + tmpx = p.x - q.x; + tmpy = p.y - q.y; + tmpz = p.z - q.z; + // double inv_dis = 1.0 / sqrt(pow(tmpx, 2) + pow(tmpy, 2) + pow(tmpz, 2)); + double inv_dis = rsqrt(tmpx * tmpx + tmpy * tmpy + tmpz * tmpz); + double inv_dis2 = inv_dis * inv_dis; + + ecoul = inv_dis * D_topo.coulomb_constant * (y_is_o ? crg_ow : crg_hw) * + (x_is_o ? 
crg_ow : crg_hw); + double v_a = 0, v_b = 0; + if (y_is_o && x_is_o) { + double inv_dis6 = inv_dis2 * inv_dis2 * inv_dis2; + double inv_dis12 = inv_dis6 * inv_dis6; + v_a = A_OO * inv_dis12; + v_b = B_OO * inv_dis6; + evdw = v_a - v_b; + dv = inv_dis * inv_dis * (-ecoul - 12.0 * v_a + 6.0 * v_b); + } else { + dv = inv_dis * inv_dis * -ecoul; + } +} + +template <int Thread_x, int Thread_y, int Block_x, int Block_y> +__global__ void +calc_ww(const int N, const double crg_ow, const double crg_hw, + const double A_OO, const double B_OO, const topo_t D_topo, + coord_t* __restrict__ W, dvel_t* __restrict__ DV_W, + double* __restrict__ Evdw_TOT, double* __restrict__ ecoul_TOT) { + // Calculate block boundaries + int NX = N; + int NY = (N + 1) / 2; + int x_cal_num = blockDim.x * Block_x; + int y_cal_num = blockDim.y * Block_y; + int block_x_left_begin = 1 + blockIdx.x * x_cal_num; + int block_y_left_begin = blockIdx.y * y_cal_num; + int block_x_left_end = min(block_x_left_begin + x_cal_num - 1, NX - 1); + int block_y_left_end = min(block_y_left_begin + y_cal_num - 1, NY - 1); + + // Shared memory declarations with padding to avoid bank conflicts + __shared__ coord_t p[2 * Thread_x * Block_x + 1]; + __shared__ coord_t q[2 * Thread_y * Block_y + 1]; + __shared__ double sum_row_x[2 * Thread_y * Block_y + 1]; + __shared__ double sum_row_y[2 * Thread_y * Block_y + 1]; + __shared__ double sum_row_z[2 * Thread_y * Block_y + 1]; + + __shared__ double block_ecoul[(Thread_x * Thread_y + 31) / 32], + block_evdw[(Thread_x * Thread_y + 31) / 32]; + + // Thread indices + int thread_y_left_begin = block_y_left_begin + threadIdx.y * Block_y; + int thread_x_left_begin = block_x_left_begin + threadIdx.x * Block_x; + int cur_thread_num = blockDim.x * threadIdx.y + threadIdx.x; + int thread_num = blockDim.x * blockDim.y; + +// Optimized coordinate loading with coalesced memory access +#pragma unroll + for (int i = cur_thread_num; i < x_cal_num && block_x_left_begin + i < NX; i += thread_num) { + p[i] = W[block_x_left_begin + i]; + p[x_cal_num + i] = W[N - (block_x_left_begin + i)]; + } + +#pragma unroll + for (int i = cur_thread_num; i < y_cal_num && block_y_left_begin + i < NY; i += thread_num) { + q[i] = W[block_y_left_begin + i]; + q[y_cal_num + i] = W[N - 1 - (block_y_left_begin + i)]; + } + +// Initialize sum arrays +#pragma unroll + for (int i = cur_thread_num; i < y_cal_num && block_y_left_begin + i < NY; i += thread_num) { + sum_row_x[i] = sum_row_x[y_cal_num + i] = 0; + sum_row_y[i] = sum_row_y[y_cal_num + i] = 0; + sum_row_z[i] = sum_row_z[y_cal_num + i] = 0; + } + __syncthreads(); + // Initialize column sums + double sum_col_x[2 * Block_x], sum_col_y[2 * Block_x], sum_col_z[2 * Block_x]; +#pragma unroll + for (int i = 0; i < Block_x; i++) { + sum_col_x[i] = sum_col_x[Block_x + i] = 0; + sum_col_y[i] = sum_col_y[Block_x + i] = 0; + sum_col_z[i] = sum_col_z[Block_x + i] = 0; + } + + // Main computation loop with reduced thread divergence + double evdw_tot = 0, ecoul_tot = 0; +#pragma unroll + for (int i = 0; i < Block_x; i++) { + int i2 = i; + int x = thread_x_left_begin + i2; + if (x >= NX) continue; + + int offset_x = x - block_x_left_begin; + const coord_t& now_p0 = p[offset_x]; + const coord_t& now_p1 = p[x_cal_num + offset_x]; + +#pragma unroll + for (int j = 0; j < Block_y; j++) { + int j2 = (j + threadIdx.x) % Block_y; + int y = thread_y_left_begin + j2; + if (y >= NY) continue; + + // Optimized condition check + if (y >= x && (N % 2 == 0 || y != NY - 1)) { + int offset_y = y - block_y_left_begin; + double evdw = 0, ecoul = 0, dv = 0; + double tmpx = 0,
tmpy = 0, tmpz = 0; + + int y2 = N - 1 - y; + int x2 = N - x; + calculate_unforce_bound(y2, x2, q[y_cal_num + offset_y], now_p1, D_topo, + crg_ow, crg_hw, A_OO, B_OO, evdw, ecoul, dv, + tmpx, tmpy, tmpz); + + evdw_tot += evdw; + ecoul_tot += ecoul; + double v_x = dv * tmpx; + double v_y = dv * tmpy; + double v_z = dv * tmpz; + + sum_col_x[Block_x + i2] += v_x; + sum_col_y[Block_x + i2] += v_y; + sum_col_z[Block_x + i2] += v_z; + sum_row_x[y_cal_num + offset_y] -= v_x; + sum_row_y[y_cal_num + offset_y] -= v_y; + sum_row_z[y_cal_num + offset_y] -= v_z; + } else if (y < x) { + int offset_y = y - block_y_left_begin; + double evdw = 0, ecoul = 0, dv = 0; + double tmpx = 0, tmpy = 0, tmpz = 0; + + calculate_unforce_bound(y, x, q[offset_y], now_p0, D_topo, crg_ow, + crg_hw, A_OO, B_OO, evdw, ecoul, dv, tmpx, tmpy, + tmpz); + + evdw_tot += evdw; + ecoul_tot += ecoul; + double v_x = dv * tmpx; + double v_y = dv * tmpy; + double v_z = dv * tmpz; + + sum_col_x[i2] += v_x; + sum_col_y[i2] += v_y; + sum_col_z[i2] += v_z; + sum_row_x[offset_y] -= v_x; + sum_row_y[offset_y] -= v_y; + sum_row_z[offset_y] -= v_z; + } + } + } + +// Optimized reduction using warp-level primitives +#pragma unroll + for (unsigned w = 16; w >= 1; w /= 2) { + ecoul_tot += __shfl_down_sync(0xffffffff, ecoul_tot, w); + evdw_tot += __shfl_down_sync(0xffffffff, evdw_tot, w); + } + + // Store block results + int warp_id = cur_thread_num / warpSize; + int lane_id = cur_thread_num % warpSize; + if (lane_id == 0) { + block_evdw[warp_id] = evdw_tot; + block_ecoul[warp_id] = ecoul_tot; + } + __syncthreads(); + + // Final reduction + if (warp_id == 0) { + double val1 = (lane_id < (Thread_x * Thread_y + 31) / 32) ? block_ecoul[lane_id] : 0; + double val2 = (lane_id < (Thread_x * Thread_y + 31) / 32) ? 
block_evdw[lane_id] : 0; + +#pragma unroll + for (unsigned w = 16; w >= 1; w /= 2) { + val1 += __shfl_down_sync(0xffffffff, val1, w); + val2 += __shfl_down_sync(0xffffffff, val2, w); + } + + if (lane_id == 0) { + atomicAdd(ecoul_TOT, val1); + atomicAdd(Evdw_TOT, val2); + } + } + +// Optimized row reduction +#pragma unroll + for (int i = cur_thread_num; i < y_cal_num && block_y_left_begin + i < NY; i += thread_num) { + int idx = block_y_left_begin + i; + if (idx < block_x_left_end) { + atomicAdd(&DV_W[idx].x, sum_row_x[i]); + atomicAdd(&DV_W[idx].y, sum_row_y[i]); + atomicAdd(&DV_W[idx].z, sum_row_z[i]); + } + if (idx >= block_x_left_begin) { + idx = N - 1 - idx; + atomicAdd(&DV_W[idx].x, sum_row_x[i + y_cal_num]); + atomicAdd(&DV_W[idx].y, sum_row_y[i + y_cal_num]); + atomicAdd(&DV_W[idx].z, sum_row_z[i + y_cal_num]); + } + } + +// Optimized column reduction +#pragma unroll + for (int i = 0; i < Block_x; i++) { + int i2 = (i + threadIdx.y) % Block_x; + int idx = thread_x_left_begin + i2; + if (idx < N) { + if (block_y_left_begin < idx) { + atomicAdd(&DV_W[idx].x, sum_col_x[i2]); + atomicAdd(&DV_W[idx].y, sum_col_y[i2]); + atomicAdd(&DV_W[idx].z, sum_col_z[i2]); + } + if (block_y_left_end >= idx) { + atomicAdd(&DV_W[N - idx].x, sum_col_x[i2 + Block_x]); + atomicAdd(&DV_W[N - idx].y, sum_col_y[i2 + Block_x]); + atomicAdd(&DV_W[N - idx].z, sum_col_z[i2 + Block_x]); + } + } + } +} +} // namespace CudaNonbondedWWForce +void calc_nonbonded_ww_forces_host_v2() { + using namespace CudaNonbondedWWForce; + int N = 3 * n_waters; + int mem_size_W = N * sizeof(coord_t); + int mem_size_DV_W = N * sizeof(dvel_t); + + WW_evdw_TOT = 0; + WW_ecoul_TOT = 0; + cudaMemcpy(D_WW_evdw_TOT, &WW_evdw_TOT, sizeof(double), + cudaMemcpyHostToDevice); + cudaMemcpy(D_WW_ecoul_TOT, &WW_ecoul_TOT, sizeof(double), + cudaMemcpyHostToDevice); + + CudaContext& ctx = CudaContext::instance(); + auto W = ctx.d_coords + n_atoms_solute; + auto DV_W = ctx.d_dvelocities + n_atoms_solute; + + // Optimize thread block configuration + const int thread_num_x = 32; // Keep at 32 for better warp utilization + const int thread_num_y = 1; // Keep at 1 for better memory coalescing + dim3 block_sz = dim3(thread_num_x, thread_num_y); + + const int N_ITERATION_Y = 32; // Keep at 32 for better memory access pattern + const int N_ITERATION_X = 1; // Keep at 1 for better memory coalescing + + int block_element_sz_x = N_ITERATION_X * block_sz.x; + int block_element_sz_y = N_ITERATION_Y * block_sz.y; + + int grid_sz_x = (N - 1 + block_element_sz_x - 1) / (block_element_sz_x); + int grid_sz_y = ((N + 1) / 2 + block_element_sz_y - 1) / block_element_sz_y; + dim3 grid = dim3(grid_sz_x, grid_sz_y); + + calc_ww<thread_num_x, thread_num_y, N_ITERATION_X, N_ITERATION_Y> + <<<grid, block_sz>>>(N, crg_ow, crg_hw, A_OO, B_OO, topo, W, DV_W, + D_WW_evdw_TOT, D_WW_ecoul_TOT); + + cudaDeviceSynchronize(); + cudaMemcpy(&dvelocities[n_atoms_solute], DV_W, mem_size_DV_W, + cudaMemcpyDeviceToHost); + cudaMemcpy(&WW_evdw_TOT, D_WW_evdw_TOT, sizeof(double), + cudaMemcpyDeviceToHost); + cudaMemcpy(&WW_ecoul_TOT, D_WW_ecoul_TOT, sizeof(double), + cudaMemcpyDeviceToHost); + E_nonbond_ww.Uvdw += WW_evdw_TOT; + E_nonbond_ww.Ucoul += WW_ecoul_TOT; +}
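The two __shfl_down_sync loops above are a standard warp-level tree reduction: after five halving steps lane 0 of each warp holds that warp's partial sum, and warp 0 folds the per-warp partials once more before the single atomicAdd per block. The same primitive in isolation (a minimal sketch, not part of the patch; assumes a full 32-lane warp, hence the 0xffffffff mask):

__device__ double warp_reduce_sum(double v) {
    // Butterfly reduction over one warp; the result is valid in lane 0.
    for (unsigned w = 16; w >= 1; w /= 2)
        v += __shfl_down_sync(0xffffffff, v, w);
    return v;
}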
 + +void init_nonbonded_ww_force_kernel_data() { + using namespace CudaNonbondedWWForce; + if (!is_initialized) { + catype_t catype_ow; // Atom type of first water O atom + ccharge_t ccharge_ow, ccharge_hw; // Charges of first water O and H atoms + + catype_ow = catypes[atypes[n_atoms_solute].code - 1]; + ccharge_ow = ccharges[charges[n_atoms_solute].code - 1]; + ccharge_hw = ccharges[charges[n_atoms_solute + 1].code - 1]; + + A_OO = pow(catype_ow.aii_normal, 2); + B_OO = pow(catype_ow.bii_normal, 2); + + crg_ow = ccharge_ow.charge; + crg_hw = ccharge_hw.charge; + + check_cudaMalloc((void**)&D_WW_evdw_TOT, sizeof(double)); + check_cudaMalloc((void**)&D_WW_ecoul_TOT, sizeof(double)); + is_initialized = true; + } +} + +void cleanup_nonbonded_ww_force() { + using namespace CudaNonbondedWWForce; + if (is_initialized) { + cudaFree(D_WW_evdw_TOT); + cudaFree(D_WW_ecoul_TOT); + is_initialized = false; + } +} \ No newline at end of file
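calc_ww only loops y over [0, (N + 1) / 2) and recovers the other half of the N x N pair matrix by index folding (y2 = N - 1 - y, x2 = N - x), skipping the fold for the middle row when N is odd. A host-side sketch (hypothetical, not in the patch) that checks this enumeration visits every unordered pair exactly once; same-molecule pairs are filtered separately inside calculate_unforce_bound:

#include <cassert>
#include <vector>

void check_pair_folding(int N) {
    int NY = (N + 1) / 2;
    std::vector<int> hits(N * N, 0);
    for (int x = 1; x < N; x++)
        for (int y = 0; y < NY; y++) {
            if (y >= x && (N % 2 == 0 || y != NY - 1))
                hits[(N - 1 - y) * N + (N - x)]++;  // folded upper half
            else if (y < x)
                hits[y * N + x]++;                  // direct lower half
        }
    for (int i = 0; i < N; i++)
        for (int j = i + 1; j < N; j++)
            assert(hits[i * N + j] == 1);           // each pair counted once
}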
diff --git a/src/core/cuda/src/CudaPolxWaterForce.cu b/src/core/cuda/src/CudaPolxWaterForce.cu index ea4d280b..53546714 100644 --- a/src/core/cuda/src/CudaPolxWaterForce.cu +++ b/src/core/cuda/src/CudaPolxWaterForce.cu @@ -1,26 +1,24 @@ #include +#include "cuda/include/CudaContext.cuh" #include "cuda/include/CudaPolxWaterForce.cuh" #include "utils.h" namespace CudaPolxWaterForce { bool is_initialized = false; -shell_t* d_wshells = nullptr; -int* d_list_sh = nullptr; // use 1d array to simulate 2d array -coord_t* d_coords = nullptr; -dvel_t* d_velocities = nullptr; -double* d_theta = nullptr; -double* d_theta0 = nullptr; -double* d_tdum = nullptr; -double* d_energy = nullptr; - -int *d_water_shell = nullptr; -int *d_water_rank = nullptr; // in host int* water_shell = nullptr; int* water_rank = nullptr; -int* polx_list_sh = nullptr; // use 1d array to simulate 2d array +int* polx_list_sh = nullptr; // use 1d array to simulate 2d array + +double* d_energy; +int* d_list_sh = nullptr; +double* d_theta = nullptr; +double* d_theta0 = nullptr; +double* d_tdum = nullptr; +int* d_water_shell = nullptr; +int* d_water_rank = nullptr; } // namespace CudaPolxWaterForce @@ -88,7 +86,7 @@ __global__ void calc_polx_water_forces_kernel( int il = water_rank[idx]; int is = water_shell[idx]; - if (is < 0) return; // water not in any shell this step + if (is < 0) return; // water not in any shell this step int wi, ii; coord_t rmu, rcu, f1O, f1H1, f1H2, f2; @@ -180,7 +178,6 @@ void sort_waters() { for (int jl = 0; jl < wshells[is].n_inshell; jl++) { // printf("Searching water %d in shell %d, total number: %d\n", jl, is, wshells[is].n_inshell); jw = polx_list_sh[jl * n_shells + is]; - // printf("zzzzzzzzz %d\n", jw); // printf("Water %d in shell %d has theta %f\n", jw, is, tdum[jw] * 180 / M_PI); if (tdum[jw] < tmin) { jmin = jw; @@ -197,6 +194,8 @@ void sort_waters() { } void calc_polx_water_forces_host(int iteration) { + CudaContext& ctx = CudaContext::instance(); + for (int is = 0; is < n_shells; is++) { wshells[is].n_inshell = 0; if (iteration == 0) { @@ -204,31 +203,14 @@ } } using namespace CudaPolxWaterForce; - if (!is_initialized) { - check_cudaMalloc((void**)&d_wshells, n_shells * sizeof(shell_t)); - // todo: change the dimension - check_cudaMalloc((void**)&d_list_sh, n_max_inshell * n_shells * sizeof(int)); - - check_cudaMalloc((void**)&d_coords, n_atoms * sizeof(coord_t)); - check_cudaMalloc((void**)&d_velocities, n_atoms * sizeof(dvel_t)); - check_cudaMalloc((void**)&d_theta, n_waters * sizeof(double)); - check_cudaMalloc((void**)&d_theta0, n_waters * sizeof(double)); - check_cudaMalloc((void**)&d_tdum, n_waters * sizeof(double)); - check_cudaMalloc((void**)&d_energy, sizeof(double)); - check_cudaMalloc((void**)&d_water_rank, n_waters * sizeof(int)); - check_cudaMalloc((void**)&d_water_shell, n_waters * sizeof(int)); - water_rank = new int[n_waters]; - water_shell = new int[n_waters]; - polx_list_sh = new int[n_max_inshell * n_shells]; + // ctx.sync_all_to_device(); + cudaMemcpy(ctx.d_wshells, wshells, n_shells * sizeof(shell_t), cudaMemcpyHostToDevice); - is_initialized = true; - } - cudaMemcpy(d_wshells, wshells, n_shells * sizeof(shell_t), cudaMemcpyHostToDevice); + coord_t* d_coords = ctx.d_coords; + dvel_t* d_dvelocities = ctx.d_dvelocities; + shell_t* d_wshells = ctx.d_wshells; - cudaMemset(d_energy, 0, sizeof(double)); - cudaMemcpy(d_coords, coords, n_atoms * sizeof(coord_t), cudaMemcpyHostToDevice); - cudaMemcpy(d_velocities, dvelocities, n_atoms * sizeof(dvel_t), cudaMemcpyHostToDevice); int blockSize = 256; int numBlocks = (n_waters + blockSize - 1) / blockSize; // printf("Calculated theta for %d waters in %d shells\n", n_waters, n_shells); @@ -270,38 +252,51 @@ } // Calculate energy and force - - check_cudaMalloc((void**)&d_energy, sizeof(double)); + cudaMemset(d_energy, 0, sizeof(double)); calc_polx_water_forces_kernel<<<numBlocks, blockSize>>>( - n_waters, n_atoms_solute, d_wshells, d_coords, d_velocities, topo, + n_waters, n_atoms_solute, d_wshells, d_coords, d_dvelocities, topo, d_theta, md, d_energy, d_water_rank, d_water_shell); double energy; cudaMemcpy(&energy, d_energy, sizeof(double), cudaMemcpyDeviceToHost); E_restraint.Upolx += energy; cudaMemcpy(wshells, d_wshells, n_shells * sizeof(shell_t), cudaMemcpyDeviceToHost); // Copy back forces for all atoms (solute + solvent); water forces were being dropped. - cudaMemcpy(dvelocities, d_velocities, n_atoms * sizeof(dvel_t), cudaMemcpyDeviceToHost); + cudaMemcpy(dvelocities, d_dvelocities, n_atoms * sizeof(dvel_t), cudaMemcpyDeviceToHost); } -void cleanup_polx_water_force( +void init_polx_water_force_kernel_data() { + using namespace CudaPolxWaterForce; + if (!is_initialized) { + water_rank = new int[n_waters]; + water_shell = new int[n_waters]; + polx_list_sh = new int[n_max_inshell * n_shells]; + + check_cudaMalloc((void**)&d_energy, sizeof(double)); + check_cudaMalloc((void**)&d_list_sh, n_max_inshell * n_shells * sizeof(int)); + check_cudaMalloc((void**)&d_theta, n_waters * sizeof(double)); + check_cudaMalloc((void**)&d_theta0, n_waters * sizeof(double)); + check_cudaMalloc((void**)&d_tdum, n_waters * sizeof(double)); + check_cudaMalloc((void**)&d_water_rank, n_waters * sizeof(int)); + check_cudaMalloc((void**)&d_water_shell, n_waters * sizeof(int)); -) { + is_initialized = true; + } +} + +void cleanup_polx_water_force() { + using namespace CudaPolxWaterForce; if (is_initialized) { - cudaFree(d_wshells); + delete[] water_rank; + delete[] water_shell; + delete[] polx_list_sh; + + cudaFree(d_energy); cudaFree(d_list_sh); - cudaFree(d_coords); - cudaFree(d_velocities); cudaFree(d_theta); cudaFree(d_theta0); cudaFree(d_tdum); - cudaFree(d_energy); cudaFree(d_water_rank); cudaFree(d_water_shell); - - delete[] water_rank; - delete[] water_shell; - is_initialized = false; } }
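The polx wrapper above uses the accumulator pattern shared by most of these hosts: zero a device-side double, let the kernel atomicAdd into it, copy it back, and fold it into the host energy record (E_restraint.Upolx here). The same pattern in isolation (a sketch with hypothetical names; atomicAdd on double requires compute capability 6.0 or newer):

__global__ void sum_kernel(const double* in, int n, double* acc) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) atomicAdd(acc, in[i]);
}

double sum_on_device(const double* d_in, int n) {
    double* d_acc = nullptr;
    cudaMalloc((void**)&d_acc, sizeof(double));
    cudaMemset(d_acc, 0, sizeof(double));  // reset before every launch
    int blockSize = 256;
    int numBlocks = (n + blockSize - 1) / blockSize;
    sum_kernel<<<numBlocks, blockSize>>>(d_in, n, d_acc);
    double acc = 0;
    cudaMemcpy(&acc, d_acc, sizeof(double), cudaMemcpyDeviceToHost);
    cudaFree(d_acc);
    return acc;
}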
diff --git a/src/core/cuda/src/CudaPshellForce.cu b/src/core/cuda/src/CudaPshellForce.cu index c17ed4ab..61c63bfb 100644 --- a/src/core/cuda/src/CudaPshellForce.cu +++ b/src/core/cuda/src/CudaPshellForce.cu @@ -1,16 +1,13 @@ +#include "cuda/include/CudaContext.cuh" #include "cuda/include/CudaPshellForce.cuh" #include "utils.h" +#include namespace CudaPshellForce { bool is_initialized = false; -bool* d_shell; -bool* d_excluded; -coord_t* d_coords; -coord_t* d_coords_top; double* d_ufix_energy; double* d_ushell_energy; -dvel_t* d_dvelocities; -} // namespace CudaPshellForce +} // namespace CudaPshellForce __global__ void calc_pshell_force_kernel( int n_atoms_solute, bool* shell, @@ -50,27 +47,18 @@ } void calc_pshell_forces_host() { + CudaContext& ctx = CudaContext::instance(); using namespace CudaPshellForce; - if (!is_initialized) { - check_cudaMalloc((void**)&d_shell, sizeof(bool) * n_atoms); - check_cudaMalloc((void**)&d_excluded, sizeof(bool) * n_atoms); - check_cudaMalloc((void**)&d_coords, sizeof(coord_t) * n_atoms); - check_cudaMalloc((void**)&d_coords_top, sizeof(coord_t) * n_atoms); - check_cudaMalloc((void**)&d_ufix_energy, sizeof(double)); - check_cudaMalloc((void**)&d_ushell_energy, sizeof(double)); - check_cudaMalloc((void**)&d_dvelocities, sizeof(dvel_t) * n_atoms); - cudaMemcpy(d_shell, shell, sizeof(bool) * n_atoms, cudaMemcpyHostToDevice); - cudaMemcpy(d_excluded, excluded, sizeof(bool) * n_atoms, cudaMemcpyHostToDevice); - is_initialized = true; - } - - cudaMemcpy(d_coords, coords, sizeof(coord_t) * n_atoms, cudaMemcpyHostToDevice); - cudaMemcpy(d_coords_top, coords_top, sizeof(coord_t) * n_atoms, cudaMemcpyHostToDevice); - cudaMemcpy(d_dvelocities, dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyHostToDevice); + auto d_shell = ctx.d_shell; + auto d_excluded = ctx.d_excluded; + auto d_coords = ctx.d_coords; + auto d_coords_top = ctx.d_coords_top; + auto d_dvelocities = ctx.d_dvelocities; cudaMemset(d_ufix_energy, 0, sizeof(double)); cudaMemset(d_ushell_energy, 0, sizeof(double)); + int blockSize = 256; int numBlocks = (n_atoms_solute + blockSize - 1) / blockSize; calc_pshell_force_kernel<<<numBlocks, blockSize>>>( @@ -89,22 +77,25 @@ void calc_pshell_forces_host() { cudaMemcpy(&ushell_energy, d_ushell_energy, sizeof(double), cudaMemcpyDeviceToHost); cudaMemcpy(dvelocities, d_dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyDeviceToHost); - E_restraint.Ufix += ufix_energy; E_restraint.Ushell += ushell_energy; + // ctx.sync_all_to_host(); +} + +void init_pshell_force_kernel_data() { + using namespace CudaPshellForce; + if (!is_initialized) { + check_cudaMalloc((void**)&d_ufix_energy, sizeof(double)); + check_cudaMalloc((void**)&d_ushell_energy, sizeof(double)); + is_initialized = true; + } } void cleanup_pshell_force() { using namespace CudaPshellForce; if (is_initialized) { - cudaFree(d_shell); - cudaFree(d_excluded); - cudaFree(d_coords); - cudaFree(d_coords_top); cudaFree(d_ufix_energy); cudaFree(d_ushell_energy); - cudaFree(d_dvelocities); is_initialized = false; } - } \ No newline at end of file
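After the refactor each force file owns only its private scratch buffers; everything shared comes from CudaContext. The init/cleanup convention the files now follow, reduced to a minimal sketch (illustrative names; check_cudaMalloc as used throughout this diff):

namespace ExampleForce {
bool is_initialized = false;
double* d_energy = nullptr;
}  // namespace ExampleForce

void init_example_force_kernel_data() {
    using namespace ExampleForce;
    if (!is_initialized) {  // guarded, so callers may invoke it every step
        check_cudaMalloc((void**)&d_energy, sizeof(double));
        is_initialized = true;
    }
}

void cleanup_example_force() {
    using namespace ExampleForce;
    if (is_initialized) {  // guarded, so a second call is a no-op
        cudaFree(d_energy);
        d_energy = nullptr;
        is_initialized = false;
    }
}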
diff --git a/src/core/cuda/src/CudaRadixWaterForce.cu b/src/core/cuda/src/CudaRadixWaterForce.cu index 92fb93f7..26406dc4 100644 --- a/src/core/cuda/src/CudaRadixWaterForce.cu +++ b/src/core/cuda/src/CudaRadixWaterForce.cu @@ -1,25 +1,24 @@ #include +#include "cuda/include/CudaContext.cuh" #include "cuda/include/CudaRadixWaterForce.cuh" #include "cuda/include/CudaUtility.cuh" #include "utils.h" - namespace CudaRadixWaterForce { bool is_initialized = false; -coord_t* d_coords = nullptr; -dvel_t* d_dvelocities = nullptr; -double* d_energy = nullptr; +double* d_energy; } // namespace CudaRadixWaterForce + __global__ void calc_radix_water_forces_kernel( - coord_t* coords, - double shift, + coord_t* coords, + double shift, int n_atoms_solute, int n_atoms, topo_t topo, md_t md, double Dwmz, double awmz, - dvel_t* dvelocities, + dvel_t* dvelocities, double* energy) { int i = blockIdx.x * blockDim.x + threadIdx.x; i = n_atoms_solute + i * 3; // Process only oxygen atoms of water molecules @@ -60,6 +59,7 @@ void calc_radix_water_forces_host() { if (water_atoms == 0) { return; } + using namespace CudaRadixWaterForce; int blockSize = 256; if (water_atoms % 3 != 0) { throw std::runtime_error("Number of water atoms is not a multiple of 3"); @@ -67,16 +67,11 @@ } int oxygen_atoms = water_atoms / 3; int numBlocks = (oxygen_atoms + blockSize - 1) / blockSize; - using namespace CudaRadixWaterForce; - if (!is_initialized) { - check_cudaMalloc((void**)&d_coords, sizeof(coord_t) * n_atoms); - check_cudaMalloc((void**)&d_dvelocities, sizeof(dvel_t) * n_atoms); - check_cudaMalloc((void**)&d_energy, sizeof(double)); - is_initialized = true; - } + CudaContext& ctx = CudaContext::instance(); + // ctx.sync_all_to_device(); - cudaMemcpy(d_coords, coords, sizeof(coord_t) * n_atoms, cudaMemcpyHostToDevice); - cudaMemcpy(d_dvelocities, dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyHostToDevice); + auto d_coords = ctx.d_coords; + auto d_dvelocities = ctx.d_dvelocities; double energy = 0.0; cudaMemcpy(d_energy, &energy, sizeof(double), cudaMemcpyHostToDevice); @@ -86,28 +81,34 @@ } else { shift = 0; } - calc_radix_water_forces_kernel<<<numBlocks, blockSize>>>(d_coords, - shift, - n_atoms_solute, - n_atoms, - topo, - md, - Dwmz, - awmz, - d_dvelocities, - d_energy); + calc_radix_water_forces_kernel<<<numBlocks, blockSize>>>(d_coords, + shift, + n_atoms_solute, + n_atoms, + topo, + md, + Dwmz, + awmz, + d_dvelocities, + d_energy); cudaDeviceSynchronize(); cudaMemcpy(dvelocities, d_dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyDeviceToHost); cudaMemcpy(&energy, d_energy, sizeof(double), cudaMemcpyDeviceToHost); E_restraint.Uradx += energy; } +void init_radix_water_force_kernel_data() { + using namespace CudaRadixWaterForce; + if (!is_initialized) { + check_cudaMalloc((void**)&d_energy, sizeof(double)); + is_initialized = true; + } +} + void cleanup_radix_water_force() { using namespace CudaRadixWaterForce; if (is_initialized) { - cudaFree(d_coords); - cudaFree(d_dvelocities); cudaFree(d_energy); is_initialized = false; } -} +} \ No newline at end of file
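calc_radix_water_forces_kernel maps one thread to each water molecule and strides by three to land on its oxygen, since waters are stored O, H1, H2 after the solute block. The same mapping in a self-contained example (hypothetical kernel, not in the patch):

__global__ void count_outer_oxygens(int n_atoms_solute, int n_waters,
                                    const coord_t* coords, double r2_cut,
                                    int* n_outside) {
    int t = blockIdx.x * blockDim.x + threadIdx.x;
    if (t >= n_waters) return;
    int i = n_atoms_solute + 3 * t;  // O of water t; H1 = i + 1, H2 = i + 2
    coord_t c = coords[i];
    if (c.x * c.x + c.y * c.y + c.z * c.z > r2_cut)
        atomicAdd(n_outside, 1);
}

It would be launched, like the real kernel, with numBlocks = (n_waters + blockSize - 1) / blockSize.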
diff --git a/src/core/cuda/src/CudaRestrangForce.cu b/src/core/cuda/src/CudaRestrangForce.cu index fd568687..310336e5 100644 --- a/src/core/cuda/src/CudaRestrangForce.cu +++ b/src/core/cuda/src/CudaRestrangForce.cu @@ -1,17 +1,11 @@ +#include "cuda/include/CudaContext.cuh" #include "cuda/include/CudaRestrangForce.cuh" #include "cuda/include/CudaUtility.cuh" #include "utils.h" - -namespace CudaRestrangeForce { +namespace CudaRestrangForce { bool is_initialized = false; -restrang_t* d_restrangs = nullptr; -coord_t* d_coords = nullptr; -double* d_lambdas = nullptr; -dvel_t* d_velocities = nullptr; -E_restraint_t* d_EQ_restraint = nullptr; -double* d_E_restraint = nullptr; -restrdis_t* d_restrdists = nullptr; -} // namespace CudaRestrangeForce +double* d_E_restraint; +} // namespace CudaRestrangForce __global__ void calc_restrang_force_kernel( restrang_t* restrangs, @@ -86,7 +80,6 @@ __global__ void calc_restrang_force_kernel( dk.y = f1 * (dr.y / (rij * rjk) - cos_th * dr2.y / r2jk); dk.z = f1 * (dr.z / (rij * rjk) - cos_th * dr2.z / r2jk); - atomicAdd(&dvelocities[i].x, dv * di.x); atomicAdd(&dvelocities[i].y, dv * di.y); atomicAdd(&dvelocities[i].z, dv * di.z); @@ -110,26 +103,20 @@ } void calc_restrang_force_host() { - using namespace CudaRestrangeForce; - if (!is_initialized) { - check_cudaMalloc((void**)&d_restrangs, sizeof(restrang_t) * n_restrangs); - check_cudaMalloc((void**)&d_coords, sizeof(coord_t) * n_atoms); - check_cudaMalloc((void**)&d_lambdas, sizeof(double) * n_lambdas); - check_cudaMalloc((void**)&d_velocities, sizeof(dvel_t) * n_atoms); - check_cudaMalloc((void**)&d_EQ_restraint, sizeof(E_restraint_t) * n_lambdas); - check_cudaMalloc((void**)&d_E_restraint, sizeof(double)); - check_cudaMalloc((void**)&d_restrdists, sizeof(restrdis_t) * n_restrdists); + if (n_restrangs == 0) return; + using namespace CudaRestrangForce; + CudaContext& ctx = CudaContext::instance(); + + auto d_restrangs = ctx.d_restrangs; + auto d_coords = ctx.d_coords; + auto d_lambdas = ctx.d_lambdas; + auto d_dvelocities = ctx.d_dvelocities; + auto d_EQ_restraint = ctx.d_EQ_restraint; + auto d_restrdists = ctx.d_restrdists; - cudaMemcpy(d_restrangs, restrangs, sizeof(restrang_t) * n_restrangs, cudaMemcpyHostToDevice); - cudaMemcpy(d_lambdas, lambdas, sizeof(double) * n_lambdas, cudaMemcpyHostToDevice); - cudaMemcpy(d_restrdists, restrdists, sizeof(restrdis_t) * n_restrdists, cudaMemcpyHostToDevice); - is_initialized = true; - } - cudaMemcpy(d_coords, coords, sizeof(coord_t) * n_atoms, cudaMemcpyHostToDevice); - cudaMemcpy(d_velocities, dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyHostToDevice); - cudaMemcpy(d_EQ_restraint, EQ_restraint, sizeof(E_restraint_t) * n_lambdas, cudaMemcpyHostToDevice); double val = 0; cudaMemcpy(d_E_restraint, &val, sizeof(double), cudaMemcpyHostToDevice); + int blockSize = 256; int numBlocks = (n_restrangs + blockSize - 1) / blockSize; calc_restrang_force_kernel<<<numBlocks, blockSize>>>( @@ -139,28 +126,30 @@ void calc_restrang_force_host() { n_atoms, d_lambdas, n_lambdas, - d_velocities, + d_dvelocities, d_EQ_restraint, d_E_restraint, d_restrdists); cudaDeviceSynchronize(); cudaMemcpy(coords, d_coords, sizeof(coord_t) * n_atoms, cudaMemcpyDeviceToHost); - cudaMemcpy(dvelocities, d_velocities, sizeof(dvel_t) * n_atoms, cudaMemcpyDeviceToHost); + cudaMemcpy(dvelocities, d_dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyDeviceToHost); cudaMemcpy(EQ_restraint, d_EQ_restraint, sizeof(E_restraint_t) * n_lambdas, cudaMemcpyDeviceToHost); cudaMemcpy(&val, d_E_restraint, sizeof(double), cudaMemcpyDeviceToHost); E_restraint.Urestr += val; } +void init_restrang_force_kernel_data() { + using namespace CudaRestrangForce; + if (!is_initialized) { + check_cudaMalloc((void**)&d_E_restraint, sizeof(double)); + is_initialized = true; + } +} + void cleanup_restrang_force() { - using namespace CudaRestrangeForce; + using namespace CudaRestrangForce; if (is_initialized) { - cudaFree(d_restrangs); - cudaFree(d_coords); - cudaFree(d_lambdas); - cudaFree(d_velocities); - cudaFree(d_EQ_restraint); cudaFree(d_E_restraint); - cudaFree(d_restrdists); is_initialized = false; } } \ No newline at end of file
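Several restraints can touch the same atom, which is why calc_restrang_force_kernel accumulates into dvelocities with atomicAdd instead of plain stores. The per-atom scatter, factored out as a sketch (double-precision atomicAdd needs sm_60 or newer):

__device__ void scatter_force(dvel_t* dvel, int atom,
                              double fx, double fy, double fz) {
    atomicAdd(&dvel[atom].x, fx);
    atomicAdd(&dvel[atom].y, fy);
    atomicAdd(&dvel[atom].z, fz);
}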
diff --git a/src/core/cuda/src/CudaRestrdisForce.cu b/src/core/cuda/src/CudaRestrdisForce.cu index b95ee4f5..af92e6d4 100644 --- a/src/core/cuda/src/CudaRestrdisForce.cu +++ b/src/core/cuda/src/CudaRestrdisForce.cu @@ -1,17 +1,12 @@ +#include + +#include "cuda/include/CudaContext.cuh" #include "cuda/include/CudaRestrdisForce.cuh" #include "cuda/include/CudaUtility.cuh" #include "utils.h" -#include - namespace CudaRestrdisForce { bool is_initialized = false; -restrdis_t* d_restrdists = nullptr; -coord_t* d_coords = nullptr; -double* d_lambdas = nullptr; -dvel_t* d_dvelocities = nullptr; -E_restraint_t* d_EQ_restraint = nullptr; -double* d_E_restraint = nullptr; - +double* d_E_restraint; } // namespace CudaRestrdisForce __global__ void calc_restrdis_forces_kernel( @@ -80,22 +75,15 @@ void calc_restrdis_forces_host() { using namespace CudaRestrdisForce; - if (!is_initialized) { - check_cudaMalloc((void**)&d_restrdists, n_restrdists * sizeof(restrdis_t)); - check_cudaMalloc((void**)&d_coords, n_atoms * sizeof(coord_t)); - check_cudaMalloc((void**)&d_lambdas, n_lambdas * sizeof(double)); - check_cudaMalloc((void**)&d_dvelocities, n_atoms * sizeof(dvel_t)); - check_cudaMalloc((void**)&d_EQ_restraint, sizeof(E_restraint_t) * n_lambdas); - check_cudaMalloc((void**)&d_E_restraint, sizeof(double) * n_lambdas); - - cudaMemcpy(d_restrdists, restrdists, n_restrdists * sizeof(restrdis_t), cudaMemcpyHostToDevice); - cudaMemcpy(d_lambdas, lambdas, n_lambdas * sizeof(double), cudaMemcpyHostToDevice); - is_initialized = true; - } - cudaMemcpy(d_coords, coords, n_atoms * sizeof(coord_t), cudaMemcpyHostToDevice); - cudaMemcpy(d_dvelocities, dvelocities, n_atoms * sizeof(dvel_t), cudaMemcpyHostToDevice); - cudaMemcpy(d_EQ_restraint, EQ_restraint, sizeof(E_restraint_t) * n_lambdas, cudaMemcpyHostToDevice); - cudaMemset(d_E_restraint, 0, sizeof(double) * n_lambdas); + CudaContext& ctx = CudaContext::instance(); + auto d_restrdists = ctx.d_restrdists; + auto d_coords = ctx.d_coords; + auto d_lambdas = ctx.d_lambdas; + auto d_dvelocities = ctx.d_dvelocities; + auto d_EQ_restraint = ctx.d_EQ_restraint; + + cudaMemset(d_E_restraint, 0, sizeof(double)); + int blockSize = 256; int numBlocks = (n_restrdists + blockSize - 1) / blockSize; calc_restrdis_forces_kernel<<<numBlocks, blockSize>>>( @@ -116,15 +104,18 @@ void calc_restrdis_forces_host() { E_restraint.Urestr += ener; } +void init_restrdis_force_kernel_data() { + using namespace CudaRestrdisForce; + if (!is_initialized) { + check_cudaMalloc((void**)&d_E_restraint, sizeof(double)); + is_initialized = true; + } +} + void cleanup_restrdis_force() { using namespace CudaRestrdisForce; if (is_initialized) { - cudaFree(d_restrdists); - cudaFree(d_coords); - cudaFree(d_lambdas); - cudaFree(d_dvelocities); - cudaFree(d_EQ_restraint); cudaFree(d_E_restraint); is_initialized = false; } -} +} \ No newline at end of file
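These wrappers still spell out raw cudaMemcpy calls for every transfer. With the templated helpers declared on CudaContext, the same transfers could be written more uniformly; an illustrative usage, not applied in this patch:

CudaContext& ctx = CudaContext::instance();
ctx.sync_array_to_device(ctx.d_lambdas, lambdas, n_lambdas);          // host -> device
ctx.sync_array_to_host(EQ_restraint, ctx.d_EQ_restraint, n_lambdas);  // device -> host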
-    is_initialized = true;
-  }
-  cudaMemcpy(d_coords, coords, sizeof(coord_t) * n_atoms, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_dvelocities, dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_EQ_restraint, EQ_restraint, sizeof(E_restraint_t) * n_lambdas, cudaMemcpyHostToDevice);
   double val = 0.0;
   cudaMemcpy(d_E_restraint, &val, sizeof(double), cudaMemcpyHostToDevice);
+  CudaContext& ctx = CudaContext::instance();
+  auto d_restrspos = ctx.d_restrspos;
+  auto d_coords = ctx.d_coords;
+  auto d_lambdas = ctx.d_lambdas;
+  auto d_EQ_restraint = ctx.d_EQ_restraint;
+  auto d_dvelocities = ctx.d_dvelocities;
+
   int blockSize = 256;
   int numBlocks = (n_restrspos + blockSize - 1) / blockSize;
   calc_restrpos_forces_kernel<<<numBlocks, blockSize>>>(
-    d_restrpos,
+    d_restrspos,
     n_restrspos,
     d_coords,
     d_lambdas,
@@ -101,15 +91,19 @@ void calc_restrpos_forces_host() {
   E_restraint.Urestr += val;
   cudaMemcpy(EQ_restraint, d_EQ_restraint, sizeof(E_restraint_t) * n_lambdas, cudaMemcpyDeviceToHost);
 }
+
+void init_restrpos_force_kernel_data() {
+  using namespace CudaRestrposForce;
+  if (!is_initialized) {
+    check_cudaMalloc((void**)&d_E_restraint, sizeof(double));
+    is_initialized = true;
+  }
+}
+
 void cleanup_restrpos_force() {
   using namespace CudaRestrposForce;
   if (is_initialized) {
-    cudaFree(d_restrpos);
-    cudaFree(d_coords);
-    cudaFree(d_lambdas);
-    cudaFree(d_EQ_restraint);
     cudaFree(d_E_restraint);
-    cudaFree(d_dvelocities);
     is_initialized = false;
   }
-}
+}
\ No newline at end of file
diff --git a/src/core/cuda/src/CudaRestrseqForce.cu b/src/core/cuda/src/CudaRestrseqForce.cu
index 4e167357..ff805df7 100644
--- a/src/core/cuda/src/CudaRestrseqForce.cu
+++ b/src/core/cuda/src/CudaRestrseqForce.cu
@@ -1,20 +1,12 @@
+#include "cuda/include/CudaContext.cuh"
 #include "cuda/include/CudaRestrseqForce.cuh"
-#include "utils.h"
 #include "iostream"
+#include "utils.h"

 namespace CudaRestrseqForce {
 bool is_initialized = false;
-restrseq_t* d_restrseq = nullptr;
-coord_t* d_coords = nullptr;
-coord_t* d_coords_top = nullptr;
-double* d_upres_energy = nullptr;
-atype_t* d_atypes = nullptr;
-catype_t* d_catypes = nullptr;
-bool* d_heavy = nullptr;
-dvel_t* d_velocities = nullptr;
-
+double* d_upres_energy;
 }  // namespace CudaRestrseqForce
-
 __global__ void calc_restrseq_forces_kernel(
   int n_restrseqs,
   restrseq_t* restrseqs,
@@ -112,7 +104,7 @@ __global__ void calc_restrseq_forces_kernel(
       r2 = pow(dr.x, 2) + pow(dr.y, 2) + pow(dr.z, 2);
       ener = .5 * k * r2;
       atomicAdd(upres_energy, ener);
-
+
       atomicAdd(&dvelocities[i].x, k * dr.x);
       atomicAdd(&dvelocities[i].y, k * dr.y);
       atomicAdd(&dvelocities[i].z, k * dr.z);
@@ -123,28 +115,17 @@ __global__ void calc_restrseq_forces_kernel(

 void calc_restrseq_forces_host() {
   using namespace CudaRestrseqForce;
-  if (!is_initialized) {
-    check_cudaMalloc((void**)&d_restrseq, sizeof(restrseq_t) * n_restrseqs);
-    check_cudaMalloc((void**)&d_coords, sizeof(coord_t) * n_atoms);
-    check_cudaMalloc((void**)&d_coords_top, sizeof(coord_t) * n_atoms);
-    check_cudaMalloc((void**)&d_upres_energy, sizeof(double));
-    check_cudaMalloc((void**)&d_atypes, sizeof(atype_t) * n_atypes);
-    check_cudaMalloc((void**)&d_catypes, sizeof(catype_t) * n_catypes);
-    check_cudaMalloc((void**)&d_heavy, sizeof(bool) * n_atoms);
-    check_cudaMalloc((void**)&d_velocities, sizeof(dvel_t) * n_atoms);
-
-    cudaMemcpy(d_restrseq, restrseqs, sizeof(restrseq_t) * n_restrseqs, cudaMemcpyHostToDevice);
-    cudaMemcpy(d_atypes, atypes, sizeof(atype_t) * n_atypes, cudaMemcpyHostToDevice);
-    cudaMemcpy(d_catypes, catypes, sizeof(catype_t) * n_catypes, cudaMemcpyHostToDevice);
-    cudaMemcpy(d_heavy, heavy, sizeof(bool) * n_atoms, cudaMemcpyHostToDevice);
-
-    is_initialized = true;
-  }
-
-  cudaMemcpy(d_coords, coords, sizeof(coord_t) * n_atoms, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_coords_top, coords_top, sizeof(coord_t) * n_atoms, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_velocities, dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyHostToDevice);
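+  // Borrow the persistent device buffers from the shared context instead of
+  // re-allocating and re-uploading them on every call.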
+  CudaContext& ctx = CudaContext::instance();
+  auto d_restrseq = ctx.d_restrseqs;
+  auto d_coords = ctx.d_coords;
+  auto d_coords_top = ctx.d_coords_top;
+  auto d_atypes = ctx.d_atypes;
+  auto d_catypes = ctx.d_catypes;
+  auto d_heavy = ctx.d_heavy;
+  auto d_dvelocities = ctx.d_dvelocities;
   cudaMemset(d_upres_energy, 0, sizeof(double));
+  // ctx.sync_all_to_device();
+
   int blockSize = 256;
   int numBlocks = (n_restrseqs + blockSize - 1) / blockSize;
   calc_restrseq_forces_kernel<<<numBlocks, blockSize>>>(
@@ -155,28 +136,28 @@ void calc_restrseq_forces_host() {
     d_atypes,
     d_catypes,
     d_heavy,
-    d_velocities,
-    d_upres_energy
-  );
+    d_dvelocities,
+    d_upres_energy);
   cudaDeviceSynchronize();

   double upres_energy;
   cudaMemcpy(&upres_energy, d_upres_energy, sizeof(double), cudaMemcpyDeviceToHost);
   E_restraint.Upres = upres_energy;
   printf("Restrseq U_upres: %f\n", upres_energy);
-  cudaMemcpy(dvelocities, d_velocities, sizeof(dvel_t) * n_atoms, cudaMemcpyDeviceToHost);
+  cudaMemcpy(dvelocities, d_dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyDeviceToHost);
+}
+
+void init_restrseq_force_kernel_data() {
+  using namespace CudaRestrseqForce;
+  if (!is_initialized) {
+    check_cudaMalloc((void**)&d_upres_energy, sizeof(double));
+    is_initialized = true;
+  }
 }

 void cleanup_restrseq_force() {
   using namespace CudaRestrseqForce;
   if (is_initialized) {
-    cudaFree(d_restrseq);
-    cudaFree(d_coords);
-    cudaFree(d_coords_top);
     cudaFree(d_upres_energy);
-    cudaFree(d_atypes);
-    cudaFree(d_catypes);
-    cudaFree(d_heavy);
-    cudaFree(d_velocities);
     is_initialized = false;
   }
 }
diff --git a/src/core/cuda/src/CudaRestrwallForce.cu b/src/core/cuda/src/CudaRestrwallForce.cu
index 08c187f3..d26ce34c 100644
--- a/src/core/cuda/src/CudaRestrwallForce.cu
+++ b/src/core/cuda/src/CudaRestrwallForce.cu
@@ -1,15 +1,12 @@
+#include
+
+#include "cuda/include/CudaContext.cuh"
 #include "cuda/include/CudaRestrwallForce.cuh"
 #include "utils.h"
-#include

 namespace CudaRestrwallForce {
 bool is_initialized = false;
-restrwall_t* d_restrwalls = nullptr;
-coord_t* d_coords = nullptr;
-
-double* d_energies = nullptr;
-dvel_t* d_dvelocities = nullptr;
-bool* d_heavy = nullptr;
+double* d_energies;
 }  // namespace CudaRestrwallForce

 __global__ void calc_restrwall_forces_kernel(
@@ -54,21 +51,15 @@ __global__ void calc_restrwall_forces_kernel(
 }

 void calc_restrwall_forces_host() {
+  if (n_restrwalls == 0) return;
   using namespace CudaRestrwallForce;
-  if (!is_initialized) {
-    check_cudaMalloc((void**)&d_restrwalls, sizeof(restrwall_t) * n_restrwalls);
-    check_cudaMalloc((void**)&d_coords, sizeof(coord_t) * n_atoms);
-    check_cudaMalloc((void**)&d_energies, sizeof(double));
-    check_cudaMalloc((void**)&d_dvelocities, sizeof(dvel_t) * n_atoms);
-    check_cudaMalloc((void**)&d_heavy, sizeof(bool) * n_atoms);
+  CudaContext& ctx = CudaContext::instance();
+  auto d_restrwalls = ctx.d_restrwalls;
+  auto d_coords = ctx.d_coords;
+  auto d_dvelocities = ctx.d_dvelocities;
+  auto d_heavy = ctx.d_heavy;
+  cudaMemset(d_energies, 0, sizeof(double));

-    cudaMemcpy(d_restrwalls, restrwalls, sizeof(restrwall_t) * n_restrwalls, cudaMemcpyHostToDevice);
-    cudaMemcpy(d_heavy, heavy, sizeof(bool) * n_atoms, cudaMemcpyHostToDevice);
-    is_initialized = true;
-  }
-  cudaMemcpy(d_coords, coords, sizeof(coord_t) * n_atoms, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_energies, 0, sizeof(double), cudaMemcpyHostToDevice);
-  cudaMemcpy(d_dvelocities, dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyHostToDevice);
   int blockSize = 256;
   int numBlocks = (n_restrwalls + blockSize - 1) / blockSize;
   calc_restrwall_forces_kernel<<<numBlocks, blockSize>>>(
@@ -84,14 +75,19 @@ void calc_restrwall_forces_host() {
   printf("Restrwall energy: %f\n", h_energy);
   E_restraint.Upres += h_energy;
 }
+
+void init_restrwall_force_kernel_data() {
+  using namespace CudaRestrwallForce;
+  if (!is_initialized) {
+    check_cudaMalloc((void**)&d_energies, sizeof(double));
+    is_initialized = true;
+  }
+}
+
 void cleanup_restrwall_force() {
   using namespace CudaRestrwallForce;
   if (is_initialized) {
-    cudaFree(d_restrwalls);
-    cudaFree(d_coords);
     cudaFree(d_energies);
-    cudaFree(d_dvelocities);
-    cudaFree(d_heavy);
     is_initialized = false;
   }
-}
+}
\ No newline at end of file
diff --git a/src/core/cuda/src/CudaShakeConstraints.cu b/src/core/cuda/src/CudaShakeConstraints.cu
index 799291dc..1c446735 100644
--- a/src/core/cuda/src/CudaShakeConstraints.cu
+++ b/src/core/cuda/src/CudaShakeConstraints.cu
@@ -1,17 +1,14 @@
 #include

+#include "cuda/include/CudaContext.cuh"
 #include "cuda/include/CudaShakeConstraints.cuh"
 #include "utils.h"

 namespace CudaShakeConstraints {
+
 bool is_initialized = false;
-int* d_mol_n_shakes = nullptr;
-shake_bond_t* d_shake_bonds = nullptr;
-coord_t* d_coords = nullptr;
-coord_t* d_xcoords = nullptr;
-double* d_winv = nullptr;
-int* d_total_iterations = nullptr;
-int* d_mol_shake_offset = nullptr;
+int* d_mol_shake_offset;
+int* d_total_iterations;
 }  // namespace CudaShakeConstraints

 __global__ void calc_shake_constraints_kernel(
@@ -99,41 +96,48 @@ __global__ void calc_shake_constraints_kernel(
   }
 }

-int calc_shake_constraints_host() {
+void init_shake_constraints_kernel_data() {
   using namespace CudaShakeConstraints;
-  if (!is_initialized) {
-    check_cudaMalloc((void**)&d_mol_n_shakes, sizeof(int) * n_molecules);
-    check_cudaMalloc((void**)&d_shake_bonds, sizeof(shake_bond_t) * n_shake_constraints);
-    check_cudaMalloc((void**)&d_coords, sizeof(coord_t) * n_atoms);
-    check_cudaMalloc((void**)&d_xcoords, sizeof(coord_t) * n_atoms);
-    check_cudaMalloc((void**)&d_winv, sizeof(double) * n_atoms);
-    check_cudaMalloc((void**)&d_total_iterations, sizeof(int));
-    check_cudaMalloc((void**)&d_mol_shake_offset, sizeof(int) * n_molecules);
-
-    cudaMemcpy(d_mol_n_shakes, mol_n_shakes, sizeof(int) * n_molecules, cudaMemcpyHostToDevice);
-    cudaMemcpy(d_shake_bonds, shake_bonds, sizeof(shake_bond_t) * n_shake_constraints, cudaMemcpyHostToDevice);
-    cudaMemcpy(d_winv, winv, sizeof(double) * n_atoms, cudaMemcpyHostToDevice);
-
   int* mol_shake_offset_host = (int*)malloc(sizeof(int) * n_molecules);
   mol_shake_offset_host[0] = 0;
   for (int i = 1; i < n_molecules; i++) {
     mol_shake_offset_host[i] = mol_shake_offset_host[i - 1] + mol_n_shakes[i - 1];
   }
+  check_cudaMalloc((void**)&d_mol_shake_offset, sizeof(int) * n_molecules);  // derived indexing data, built on the host and uploaded once at startup
   cudaMemcpy(d_mol_shake_offset, mol_shake_offset_host, sizeof(int) * n_molecules, cudaMemcpyHostToDevice);
   free(mol_shake_offset_host);
+  check_cudaMalloc((void**)&d_total_iterations, sizeof(int));
+  is_initialized = true;
-  }
+}
+
+void cleanup_shake_constraints() {
+  using namespace CudaShakeConstraints;
+  if (is_initialized) {
+    cudaFree(d_mol_shake_offset);
+    cudaFree(d_total_iterations);
+    is_initialized = false;
+  }
+}

-  cudaMemcpy(d_coords, coords, sizeof(coord_t) * n_atoms, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_xcoords, xcoords, sizeof(coord_t) * n_atoms, cudaMemcpyHostToDevice);
+int calc_shake_constraints_host() {
+  using namespace CudaShakeConstraints;
   int total_iterations_host = 0;
   cudaMemcpy(d_total_iterations, &total_iterations_host, sizeof(int), cudaMemcpyHostToDevice);

   int blocks = n_molecules;
   int threads = 32;
+
+  CudaContext& ctx = CudaContext::instance();
+  auto d_mol_n_shakes = ctx.d_mol_n_shakes;
+  auto d_shake_bonds = ctx.d_shake_bonds;
+  auto d_coords = ctx.d_coords;
+  auto d_xcoords = ctx.d_xcoords;
+  auto d_winv = ctx.d_winv;
+
   calc_shake_constraints_kernel<<<blocks, threads>>>(
     n_molecules,
     d_mol_n_shakes,
@@ -148,18 +152,3 @@ int calc_shake_constraints_host() {
   cudaMemcpy(coords, d_coords, sizeof(coord_t) * n_atoms, cudaMemcpyDeviceToHost);
   return total_iterations_host;
 }
-
-void cleanup_shake_constraints() {
-  using namespace CudaShakeConstraints;
-
-  if (is_initialized) {
-    cudaFree(d_mol_n_shakes);
-    cudaFree(d_shake_bonds);
-    cudaFree(d_coords);
-    cudaFree(d_xcoords);
-    cudaFree(d_winv);
-    cudaFree(d_total_iterations);
-    cudaFree(d_mol_shake_offset);
-    is_initialized = false;
-  }
-}
\ No newline at end of file
diff --git a/src/core/cuda/src/CudaTemperature.cu b/src/core/cuda/src/CudaTemperature.cu
index c55e1870..f28157f6 100644
--- a/src/core/cuda/src/CudaTemperature.cu
+++ b/src/core/cuda/src/CudaTemperature.cu
@@ -1,3 +1,4 @@
+#include "cuda/include/CudaContext.cuh"
 #include "cuda/include/CudaTemperature.cuh"
 #include "cuda/include/CudaUtility.cuh"
 #include "iostream"
@@ -5,10 +6,6 @@
 namespace CudaTemperature {
 bool is_initialized = false;
-atype_t* d_atypes;
-catype_t* d_catypes;
-vel_t* d_velocities;
-bool* d_excluded;
 double* d_Temp_solute;
 double* d_Tfree_solute;
 double* d_Texcl_solute;
@@ -49,26 +46,6 @@ __global__ void calc_temperature_kernel(int n_atoms, int n_atoms_solute, atype_t

 void calc_temperature_host() {
   printf("Ndegf = %f, Ndegfree = %f, n_excluded = %d, Ndegfree_solvent = %f, Ndegfree_solute = %f\n", Ndegf, Ndegfree, n_excluded, Ndegfree_solvent, Ndegfree_solute);
   using namespace CudaTemperature;
-  if (!is_initialized) {
-    check_cudaMalloc((void**)&d_atypes, sizeof(atype_t) * n_atypes);
-    check_cudaMalloc((void**)&d_catypes, sizeof(catype_t) * n_catypes);
-    check_cudaMalloc((void**)&d_velocities, sizeof(vel_t) * n_atoms);
-    check_cudaMalloc((void**)&d_excluded, sizeof(bool) * n_atoms);
-    check_cudaMalloc((void**)&d_Temp_solute, sizeof(double));
-    check_cudaMalloc((void**)&d_Tfree_solute, sizeof(double));
-    check_cudaMalloc((void**)&d_Texcl_solute, sizeof(double));
-    check_cudaMalloc((void**)&d_Temp_solvent, sizeof(double));
-    check_cudaMalloc((void**)&d_Tfree_solvent, sizeof(double));
-    check_cudaMalloc((void**)&d_Texcl_solvent, sizeof(double));
-
-    cudaMemcpy(d_atypes, atypes, sizeof(atype_t) * n_atypes, cudaMemcpyHostToDevice);
-    cudaMemcpy(d_catypes, catypes, sizeof(catype_t) * n_catypes, cudaMemcpyHostToDevice);
-    cudaMemcpy(d_excluded, excluded, sizeof(bool) * n_atoms, cudaMemcpyHostToDevice);
-
-    is_initialized = true;
-  }
-
-  cudaMemcpy(d_velocities, velocities, sizeof(vel_t) * n_atoms, cudaMemcpyHostToDevice);
-
   double h_Temp_solute = 0.0, h_Tfree_solute = 0.0, h_Texcl_solute = 0.0, h_Temp_solvent = 0.0, h_Tfree_solvent = 0.0, h_Texcl_solvent = 0.0;
   cudaMemcpy(d_Temp_solute, &h_Temp_solute, sizeof(double), cudaMemcpyHostToDevice);
@@ -78,6 +55,11 @@ void calc_temperature_host() {
   cudaMemcpy(d_Tfree_solvent, &h_Tfree_solvent, sizeof(double), cudaMemcpyHostToDevice);
   cudaMemcpy(d_Texcl_solvent, &h_Texcl_solvent, sizeof(double), cudaMemcpyHostToDevice);

+  CudaContext& ctx = CudaContext::instance();
+  atype_t* d_atypes = ctx.d_atypes;
+  catype_t* d_catypes = ctx.d_catypes;
+  vel_t* d_velocities = ctx.d_velocities;
+  bool* d_excluded = ctx.d_excluded;

   int blockSize = 256;
   int numBlocks = (n_atoms + blockSize - 1) / blockSize;
@@ -111,13 +93,22 @@ void calc_temperature_host() {
   printf("Tscale = %f, tau_T = %f, Temp = %f, Tfree = %f\n", Tscale_solvent, tau_T, Temp, Tfree);
 }

+void init_temperature_kernel_data() {
+  using namespace CudaTemperature;
+  if (!is_initialized) {
+    check_cudaMalloc((void**)&d_Temp_solute, sizeof(double));
+    check_cudaMalloc((void**)&d_Tfree_solute, sizeof(double));
+    check_cudaMalloc((void**)&d_Texcl_solute, sizeof(double));
+    check_cudaMalloc((void**)&d_Temp_solvent, sizeof(double));
+    check_cudaMalloc((void**)&d_Tfree_solvent, sizeof(double));
+    check_cudaMalloc((void**)&d_Texcl_solvent, sizeof(double));
+    is_initialized = true;
+  }
+}
+
 void cleanup_temperature() {
   using namespace CudaTemperature;
   if (is_initialized) {
-    cudaFree(d_atypes);
-    cudaFree(d_catypes);
-    cudaFree(d_velocities);
-    cudaFree(d_excluded);
     cudaFree(d_Temp_solute);
     cudaFree(d_Tfree_solute);
     cudaFree(d_Texcl_solute);
@@ -125,5 +116,5 @@ void cleanup_temperature() {
     cudaFree(d_Tfree_solvent);
     cudaFree(d_Texcl_solvent);
     is_initialized = false;
-  }
+  }
 }
diff --git a/src/core/cuda/src/CudaTorsionForce.cu b/src/core/cuda/src/CudaTorsionForce.cu
index 4c06adc5..6f56fedf 100644
--- a/src/core/cuda/src/CudaTorsionForce.cu
+++ b/src/core/cuda/src/CudaTorsionForce.cu
@@ -1,13 +1,10 @@
+#include "cuda/include/CudaContext.cuh"
 #include "cuda/include/CudaTorsionForce.cuh"
 #include "cuda/include/CudaUtility.cuh"
 #include "utils.h"

 namespace CudaTorsionForce {
 bool is_initialized = false;
-torsion_t* d_torsions = nullptr;
-ctorsion_t* d_ctorsions = nullptr;
-coord_t* d_coords = nullptr;
-dvel_t* d_dvelocities = nullptr;
 double* d_energy_sum = nullptr;
 }  // namespace CudaTorsionForce

@@ -133,21 +130,16 @@ double calc_torsion_forces_host(int start, int end) {
   if (N <= 0) return 0.0;
   int blockSize = 256;
   int numBlocks = (N + blockSize - 1) / blockSize;
-  if (!is_initialized) {
-    check_cudaMalloc((void**)&d_torsions, sizeof(torsion_t) * n_torsions);
-    check_cudaMalloc((void**)&d_ctorsions, sizeof(ctorsion_t) * n_ctorsions);
-    check_cudaMalloc((void**)&d_coords, sizeof(coord_t) * n_atoms);
-    check_cudaMalloc((void**)&d_dvelocities, sizeof(dvel_t) * n_atoms);
-    check_cudaMalloc((void**)&d_energy_sum, sizeof(double));
-    is_initialized = true;
-  }
-  cudaMemcpy(d_torsions, torsions, sizeof(torsion_t) * n_torsions, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_ctorsions, ctorsions, sizeof(ctorsion_t) * n_ctorsions, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_coords, coords, sizeof(coord_t) * n_atoms, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_dvelocities, dvelocities, sizeof(dvel_t) * n_atoms, cudaMemcpyHostToDevice);
   double zero = 0.0;
   cudaMemcpy(d_energy_sum, &zero, sizeof(double), cudaMemcpyHostToDevice);
+
+  CudaContext& ctx = CudaContext::instance();
+  coord_t* d_coords = ctx.d_coords;
+  dvel_t* d_dvelocities = ctx.d_dvelocities;
+  torsion_t* d_torsions = ctx.d_torsions;
+  ctorsion_t* d_ctorsions = ctx.d_ctorsions;
+
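+  // d_energy_sum was zeroed above; the kernel accumulates the torsion energy
+  // for [start, end) and the total is copied back into `zero` below.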
   calc_torsion_forces_kernel<<<numBlocks, blockSize>>>(start, end, d_torsions, d_ctorsions, d_coords, d_dvelocities, d_energy_sum);
   cudaDeviceSynchronize();
   cudaMemcpy(&zero, d_energy_sum, sizeof(double), cudaMemcpyDeviceToHost);
@@ -155,13 +147,19 @@ double calc_torsion_forces_host(int start, int end) {
   return zero;
 }

+void init_torsion_force_kernel_data() {
+  using namespace CudaTorsionForce;
+  if (!is_initialized) {
+    check_cudaMalloc((void**)&d_energy_sum, sizeof(double));
+    is_initialized = true;
+  }
+}
+
 void cleanup_torsion_force() {
   using namespace CudaTorsionForce;
   if (is_initialized) {
-    cudaFree(d_torsions);
-    cudaFree(d_ctorsions);
-    cudaFree(d_coords);
-    cudaFree(d_dvelocities);
     cudaFree(d_energy_sum);
     is_initialized = false;
   }
diff --git a/src/core/system.cu b/src/core/system.cu
index 65dbc209..afb999cf 100644
--- a/src/core/system.cu
+++ b/src/core/system.cu
@@ -22,6 +22,13 @@
 #include "cuda/include/CudaPshellForce.cuh"
 #include "cuda/include/CudaRadixWaterForce.cuh"
 #include "cuda/include/CudaTemperature.cuh"
+#include "cuda/include/CudaContext.cuh"
+#include "cuda/include/CudaShakeConstraints.cuh"
+#include "cuda/include/CudaNonbondedPPForce.cuh"
+#include "cuda/include/CudaNonbondedQPForce.cuh"
+#include "cuda/include/CudaNonbondedQWForce.cuh"
+#include "cuda/include/CudaNonbondedPWForce.cuh"
+#include "cuda/include/CudaNonbondedWWForce.cuh"

 #include
 #include
@@ -1068,6 +1075,10 @@ void calc_integration_step(int iteration) {

   // Reset derivatives & energies
   reset_energies();
+  if (run_gpu) {
+    CudaContext& ctx = CudaContext::instance();
+    ctx.reset_energies();
+  }

   // Determine temperature and kinetic energy
   if (run_gpu) {
@@ -1087,10 +1098,10 @@ void calc_integration_step(int iteration) {
   clock_t start_pp, end_pp, start_qp, end_qp;
   if (run_gpu) {
     start_qp = clock();
-    calc_nonbonded_qp_forces_host();
+    calc_nonbonded_qp_forces_host_v2();
     end_qp = clock();
     start_pp = clock();
-    calc_nonbonded_pp_forces_host();
+    calc_nonbonded_pp_forces_host_v2();
     end_pp = clock();
   }
   else {
@@ -1107,12 +1118,12 @@ void calc_integration_step(int iteration) {
   if (n_waters > 0) {
     if (run_gpu) {
       start_ww = clock();
-      calc_nonbonded_ww_forces_host();
+      calc_nonbonded_ww_forces_host_v2();
       end_ww = clock();
       start_pw = clock();
-      calc_nonbonded_pw_forces_host();
+      calc_nonbonded_pw_forces_host_v2();
       end_pw = clock();
-      calc_nonbonded_qw_forces_host();
+      calc_nonbonded_qw_forces_host_v2();
     }
     else {
       start_ww = clock();
@@ -1138,6 +1149,7 @@ void calc_integration_step(int iteration) {
   if (md.polarisation) {
     if (run_gpu) {
       calc_polx_water_forces_host(iteration);
+      // calc_polx_w_forces(iteration);  // TODO: the GPU polarisable-water path still has a bug
     }
     else {
       calc_polx_w_forces(iteration);
     }
@@ -1196,7 +1208,6 @@ void calc_integration_step(int iteration) {
   }
   else {
     calc_temperature();
   }
-
   // Update total potential energies with an average of all states
   for (int state = 0; state < n_lambdas; state++) {
     if (lambdas[state] == 0) {
@@ -1304,6 +1315,30 @@
 }

+void init_cuda_kernel_data() {
+  init_angle_force_kernel_data();
+  init_bond_force_kernel_data();
+  init_improper2_force_kernel_data();
+  init_leapfrog_kernel_data();
+  init_nonbonded_pp_force_kernel_data();
+  init_nonbonded_pw_force_kernel_data();
+  init_nonbonded_qp_force_kernel_data();
+  init_nonbonded_qq_force_kernel_data();
+  init_nonbonded_qw_force_kernel_data();
+  init_nonbonded_ww_force_kernel_data();
+  init_polx_water_force_kernel_data();
+  init_pshell_force_kernel_data();
+  init_radix_water_force_kernel_data();
+  init_restrang_force_kernel_data();
+  init_restrdis_force_kernel_data();
+  init_restrpos_force_kernel_data();
+  init_restrseq_force_kernel_data();
+  init_restrwall_force_kernel_data();
+  init_shake_constraints_kernel_data();
+  init_temperature_kernel_data();
+  init_torsion_force_kernel_data();
+}
+
 void init_variables() {
   // From MD file
   init_md("md.csv");
@@ -1423,6 +1458,9 @@ void init_variables() {
   write_header("coords.csv");
   write_header("velocities.csv");
   write_energy_header();
+
+  CudaContext::instance().init();
+  init_cuda_kernel_data();
 }

 void clean_variables() {
@@ -1519,17 +1557,24 @@ void clean_variables() {
   cleanup_angle_force();
   cleanup_bond_force();
   cleanup_improper2_force();
-  cleanup_torsion_force();
+  cleanup_leapfrog();
+  cleanup_nonbonded_pp_force();
+  cleanup_nonbonded_pw_force();
+  cleanup_nonbonded_qp_force();
   cleanup_nonbonded_qq_force();
-  cleanup_restrwall_force();
+  cleanup_nonbonded_qw_force();
+  cleanup_nonbonded_ww_force();
+  cleanup_polx_water_force();
+  cleanup_pshell_force();
+  cleanup_radix_water_force();
   cleanup_restrang_force();
-  cleanup_restrpos_force();
   cleanup_restrdis_force();
+  cleanup_restrpos_force();
   cleanup_restrseq_force();
-  cleanup_pshell_force();
+  cleanup_restrwall_force();
+  cleanup_shake_constraints();
   cleanup_temperature();
-  cleanup_radix_water_force();
-  cleanup_leapfrog();
-  cleanup_polx_water_force();
+  cleanup_torsion_force();
+
 }
}
diff --git a/src/core/utils.cu b/src/core/utils.cu
index a0fe2729..4e269a86 100644
--- a/src/core/utils.cu
+++ b/src/core/utils.cu
@@ -1,3 +1,4 @@
+#include <cuda_runtime.h>
 #include
 #include
 #include
@@ -29,4 +30,11 @@ void check_cudaMalloc(void** devPtr, size_t size) {
     printf(">>> FATAL: cudaMalloc failed with error code %d: %s\n", error, cudaGetErrorString(error));
     exit(EXIT_FAILURE);
   }
-}
\ No newline at end of file
+}
+
+void check_cuda(cudaError_t status) {
+  if (status != cudaSuccess) {
+    printf(">>> FATAL: CUDA call failed with error code %d: %s\n", status, cudaGetErrorString(status));
+    exit(EXIT_FAILURE);
+  }
+}
diff --git a/src/core/utils.h b/src/core/utils.h
index 32d50900..96e224e8 100644
--- a/src/core/utils.h
+++ b/src/core/utils.h
@@ -15,6 +15,9 @@ double to_radians(double degrees);
  * =============================================
  */

+#include <cuda_runtime.h>
+
+void check_cuda(cudaError_t status);
 void check_cudaMalloc(void** devPtr, size_t size);

-#endif /* __UTILS_H__ */
\ No newline at end of file
+#endif /* __UTILS_H__ */
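Usage sketch (illustration only, not part of the patch): check_cuda is the
general-purpose companion to check_cudaMalloc and can wrap any runtime call
that returns a cudaError_t. The helper name upload_coords_example below is
hypothetical; coord_t and the fail-fast behaviour come from the sources above.

    #include <cuda_runtime.h>

    #include "system.h"
    #include "utils.h"

    // Hypothetical example: upload one coordinate array, aborting with the
    // CUDA error string on failure, exactly as check_cuda does above.
    void upload_coords_example(coord_t* d_coords, const coord_t* coords, int n_atoms) {
      check_cuda(cudaMemcpy(d_coords, coords, sizeof(coord_t) * n_atoms,
                            cudaMemcpyHostToDevice));
      check_cuda(cudaDeviceSynchronize());
    }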