Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
712b326
Added OpenMP target offloading support
meifeng Sep 27, 2021
229ce57
Added example config-command
meifeng Sep 27, 2021
c2f8ba1
Working simple OpenMP offloading with cudaMallocManaged; cshift not w…
meifeng Sep 29, 2021
40ee605
Merge pull request #1 from paboyle/develop
meifeng Dec 13, 2022
c33a3b3
Fixed --accelerator-threads input to omp target thread_limit()
meifeng Dec 13, 2022
9b3ac3c
Added stdout for number of GPU threads;
meifeng Dec 13, 2022
f6661ce
Merged openmp offload implementation with develop
meifeng Dec 13, 2022
1df8669
change loop counts to local variables so clang compiler doesn't complain
meifeng Dec 20, 2022
36ffe79
Add simple HOWTO instructions and module load script for Cori GPU
meifeng Dec 20, 2022
ed72390
Update HOWTO with an example config-command
meifeng Dec 20, 2022
26ad759
bug fix in HOWTO
meifeng Dec 20, 2022
a9df27f
Merge branch 'develop' of https://www.github.com/paboyle/Grid into fe…
meifeng Jan 10, 2023
3671ace
added omp allocators and dev copies
atif4461 Apr 4, 2023
5b50eaa
Merge pull request #5 from atif4461/omp-offload-develop
meifeng Apr 5, 2023
2b6b98b
Merge branch 'feature/omp-offload' of github.com:BNL-HPC/Grid into fe…
meifeng May 5, 2023
cb277ae
added file line traces
atif4461 Jul 3, 2023
ef8af7b
Merge branch 'develop' of https://github.com/paboyle/Grid into develop
atif4461 Jul 3, 2023
2100cc6
fixed conflicts after merging pabyle develop
atif4461 Jul 3, 2023
1bda8c4
fixed conflicts after merging pabyle develop
atif4461 Jul 3, 2023
59dade8
added steps to reproduce amd omp gpu bug
atif4461 Aug 27, 2023
5a5c481
added objdump files
atif4461 Aug 27, 2023
ec2ddda
included pragma map in Lattice_reduction.h
atif4461 Aug 27, 2023
157368e
Merge branch 'omp-offload-develop' of https://github.com/atif4461/Gri…
atif4461 Aug 27, 2023
e5bc517
edited readme amd stack err
atif4461 Sep 9, 2023
867abea
removed print flags
atif4461 Dec 4, 2023
7a7aa61
cleaned up
atif4461 Dec 4, 2023
f516acd
fixed conflicts; su3 working
atif4461 Dec 4, 2023
505cc69
Merge pull request #6 from atif4461/omp-offload-develop
meifeng Dec 14, 2023
7264121
Merge branch 'paboyle:develop' into feature/omp-offload
meifeng Dec 14, 2023
cc5ab62
Merge branch 'feature/omp-offload' of github.com:BNL-HPC/Grid into fe…
meifeng Dec 14, 2023
1381dbc
Revert back to Grid develop version since new LLVM compilers now do n…
meifeng Dec 15, 2023
37d1d87
bug fix for Intel GPUs
Dec 19, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions Grid/allocator/MemoryManager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,9 @@ void MemoryManager::InitMessage(void) {
#ifdef GRID_SYCL
std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_shared"<<std::endl;
#endif
#ifdef GRID_OMPTARGET
std::cout << GridLogMessage<< "MemoryManager::Init() Using OMPTARGET managed memory"<<std::endl;
#endif
#else
std::cout << GridLogMessage<< "MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory"<<std::endl;
#ifdef GRID_CUDA
Expand All @@ -233,6 +236,9 @@ void MemoryManager::InitMessage(void) {
#ifdef GRID_SYCL
std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_device"<<std::endl;
#endif
#ifdef GRID_OMPTARGET
std::cout << GridLogMessage<< "MemoryManager::Init() Using OMPTARGET omp_alloc_device"<<std::endl;
#endif
#endif

}
Expand Down
3 changes: 2 additions & 1 deletion Grid/communicator/SharedMemoryNone.cc
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
ShmCommBuf = acceleratorAllocDevice(bytes);
ShmCommBuf = acceleratorAllocShared(bytes);
//ShmCommBuf = acceleratorAllocDevice(bytes);

if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryNone.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
Expand Down
25 changes: 16 additions & 9 deletions Grid/lattice/Lattice_reduction.h
Original file line number Diff line number Diff line change
Expand Up @@ -251,10 +251,10 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
autoView( right_v,right, AcceleratorRead);
// This code could read coalesce
// GPU - SIMT lane compliance...
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v(ss);
auto y_l = right_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProductD(x_l,y_l));
accelerator_for( ss, sites, 1,{
auto x_l = left_v[ss];
auto y_l = right_v[ss];
inner_tmp_v[ss]=innerProductD(x_l,y_l);
});
}
#else
Expand All @@ -267,11 +267,18 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
autoView( right_v,right, AcceleratorRead);

// GPU - SIMT lane compliance...
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v(ss);
auto y_l = right_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
});
//accelerator_for( ss, sites, nsimd,{
// auto x_l = left_v(ss);
// auto y_l = right_v(ss);
// coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
//});
#pragma omp target map ( to:left_v, right_v ) map ( tofrom:inner_tmp_v )
#pragma omp teams distribute parallel for thread_limit(THREAD_LIMIT) //nowait
for ( uint64_t ss=0;ss<sites;ss++) {
auto x_l = left_v[ss];
auto y_l = right_v[ss];
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
}
}
#endif
// This is in single precision and fails some tests
Expand Down
5 changes: 4 additions & 1 deletion Grid/lattice/Lattice_reduction_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,10 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &
#ifdef GRID_HIP
auto r=hipGetDevice(&device);
#endif

#ifdef GRID_OMPTARGET
device = omp_get_device_num();
#endif

Iterator warpSize = gpu_props[device].warpSize;
Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock;
Iterator maxThreadsPerBlock = gpu_props[device].maxThreadsPerBlock;
Expand Down
2 changes: 1 addition & 1 deletion Grid/lattice/Lattice_view.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ class LatticeView : public LatticeAccelerator<vobj>
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
accelerator_inline uint64_t size(void) const { return this->_odata_size; };

LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me){}
LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me){ }
LatticeView(const LatticeView<vobj> &refer_to_me) = default; // Trivially copyable
LatticeView(const LatticeAccelerator<vobj> &refer_to_me,ViewMode mode) : LatticeAccelerator<vobj> (refer_to_me)
{
Expand Down
11 changes: 7 additions & 4 deletions Grid/threads/Accelerator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@ void acceleratorThreads(uint32_t t) {accelerator_threads = t;};
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"

#ifdef GRID_CUDA

// fold omptarget into device specific acceleratorInit()
#if defined(GRID_CUDA) || (defined(GRID_OMPTARGET) && defined(__CUDA_ARCH__))
#include <cuda_runtime_api.h>
cudaDeviceProp *gpu_props;
cudaStream_t copyStream;
cudaStream_t computeStream;
Expand Down Expand Up @@ -113,7 +116,7 @@ void acceleratorInit(void)
}
#endif

#ifdef GRID_HIP
#if defined(GRID_HIP) || (defined(GRID_OMPTARGET) && defined(__HIP_DEVICE_COMPILE__))
hipDeviceProp_t *gpu_props;
hipStream_t copyStream;
hipStream_t computeStream;
Expand Down Expand Up @@ -198,7 +201,7 @@ void acceleratorInit(void)
#endif


#ifdef GRID_SYCL
#if defined(GRID_SYCL) //|| (defined(GRID_OMPTARGET) && defined(__SYCL_DEVICE_ONLY__))

cl::sycl::queue *theGridAccelerator;
cl::sycl::queue *theCopyAccelerator;
Expand Down Expand Up @@ -270,7 +273,7 @@ void acceleratorInit(void)
}
#endif

#if (!defined(GRID_CUDA)) && (!defined(GRID_SYCL))&& (!defined(GRID_HIP))
#if (!defined(GRID_CUDA)) && (!defined(GRID_SYCL))&& (!defined(GRID_HIP))// && (!defined(GRID_OMPTARGET))
void acceleratorInit(void){}
#endif

Expand Down
161 changes: 157 additions & 4 deletions Grid/threads/Accelerator.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once

#ifndef ACCELERATOR_H
#define ACCELERATOR_H

#pragma once
#include <string.h>

#ifdef HAVE_MALLOC_MALLOC_H
Expand Down Expand Up @@ -474,14 +477,155 @@ inline void acceleratorCopySynchronise(void) { auto r=hipStreamSynchronize(copyS
#endif

//////////////////////////////////////////////
// CPU Target - No accelerator just thread instead
// OpenMP Target acceleration
//////////////////////////////////////////////
#ifdef GRID_OMPTARGET
//TODO GRID_SIMT for OMPTARGET
#define GRID_ACCELERATED
#include<omp.h>
#ifdef __CUDA_ARCH__
#include <cuda_runtime_api.h>
#elif defined __HIP_DEVICE_COMPILE__
#include <hip/hip_runtime.h>
#elif defined __SYCL_DEVICE_ONLY__
#include <CL/sycl.hpp>
#include <CL/sycl/usm.hpp>
#endif
extern "C" void *llvm_omp_target_alloc_host (size_t Size, int DeviceNum);
extern "C" void *llvm_omp_target_alloc_device(size_t Size, int DeviceNum);
extern "C" void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
//TODO: Dynamic Shared Memory

#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) )
#define THREAD_LIMIT acceleratorThreads()

#undef GRID_SIMT
#define accelerator
#define accelerator_inline strong_inline
// Loop-offload macros for the OMPTARGET backend. Each accelerator_for maps the
// body onto an `omp target teams distribute parallel for` region; `nsimd` is
// accepted for interface compatibility with the CUDA/HIP/SYCL backends but is
// unused here (no SIMT lane splitting in this backend).
// NOTE(review): THREAD_LIMIT is #defined unconditionally a few lines above, so
// the #else branch below is unreachable in this configuration — confirm whether
// the fallback (no thread_limit clause) is still wanted.
// NOTE(review): thread_limit(THREAD_LIMIT) inside the _Pragma string relies on
// macro expansion of tokens in the `#pragma omp` directive (mandated by the
// OpenMP spec) so that THREAD_LIMIT becomes acceleratorThreads() — verify with
// the target compilers.
#ifdef THREAD_LIMIT
// Blocking offloaded loop over [0, num).
#define accelerator_for(i,num,nsimd, ... ) \
_Pragma("omp target teams distribute parallel for thread_limit(THREAD_LIMIT)") \
for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
// Non-blocking variant: `nowait` lets the host continue past the target region.
#define accelerator_forNB(i,num,nsimd, ... ) \
_Pragma("omp target teams distribute parallel for thread_limit(THREAD_LIMIT) nowait") \
for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
// NOTE(review): `omp barrier` outside a parallel region is a no-op and does not
// wait for the `nowait` target tasks launched by accelerator_forNB; presumably
// `omp taskwait` is intended here — confirm.
#define accelerator_barrier(dummy) _Pragma("omp barrier")
// 2-D offloaded loop; collapse(2) fuses both loops into one iteration space.
#define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) \
_Pragma("omp target teams distribute parallel for thread_limit(THREAD_LIMIT) collapse(2)") \
for ( uint64_t iter1=0;iter1<num1;iter1++) \
for ( uint64_t iter2=0;iter2<num2;iter2++) { __VA_ARGS__ } ;
#else
// Fallback versions without a thread_limit clause (currently unreachable, see NOTE above).
#define accelerator_for(i,num,nsimd, ... ) \
_Pragma("omp target teams distribute parallel for") \
for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
#define accelerator_forNB(i,num,nsimd, ... ) \
_Pragma("omp target teams distribute parallel for nowait") \
for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
#define accelerator_barrier(dummy) _Pragma("omp barrier")
#define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) \
_Pragma("omp target teams distribute parallel for collapse(2)") \
for ( uint64_t iter1=0;iter1<num1;iter1++) \
for ( uint64_t iter2=0;iter2<num2;iter2++) { __VA_ARGS__ } ;
#endif

accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
// Blocking copy of `bytes` bytes from host buffer `from` to device buffer `to`.
// omp_target_memcpy(dst, src, length, dst_offset, src_offset, dst_device, src_device)
// returns zero on success, non-zero on failure.
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)
{
  int devc = omp_get_default_device();
  int host = omp_get_initial_device();
  if( omp_target_memcpy( to, from, bytes, 0, 0, devc, host ) ) {
    // %zu is the correct conversion for size_t (was %ld: UB on platforms where long != size_t)
    printf(" omp_target_memcpy host to device failed for %zu in device %d \n",bytes,devc);
  }
};
// Blocking copy of `bytes` bytes from device buffer `from` back to host buffer `to`.
// Note the argument order: omp_target_memcpy takes (dst, src, ...), so destination
// device is the host and source device is the default (accelerator) device.
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes)
{
  int devc = omp_get_default_device();
  int host = omp_get_initial_device();
  if( omp_target_memcpy( to, from, bytes, 0, 0, host, devc ) ) {
    // %zu is the correct conversion for size_t (was %ld: UB on platforms where long != size_t)
    printf(" omp_target_memcpy device to host failed for %zu in device %d \n",bytes,devc);
  }
};
// Start an asynchronous device-to-device copy on the dedicated copy stream/queue.
// NOTE(review): __CUDA_ARCH__, __HIP_DEVICE_COMPILE__ and __SYCL_DEVICE_ONLY__ are
// *device-compilation* macros; in a host-only OMPTARGET compile none of them is
// defined and this function compiles to an empty body — confirm that a no-op copy
// is intended in that configuration.
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)
{
#ifdef __CUDA_ARCH__
// CUDA backend: enqueue on copyStream (declared in Accelerator.cc) so the copy can
// overlap with compute work on other streams.
extern cudaStream_t copyStream;
cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
#elif defined __HIP_DEVICE_COMPILE__
extern hipStream_t copyStream;
hipMemcpyDtoDAsync(to,from,bytes, copyStream);
#elif defined __SYCL_DEVICE_ONLY__
// SYCL backend: theCopyAccelerator is the dedicated copy queue.
theCopyAccelerator->memcpy(to,from,bytes);
#endif
};
// Block until all copies previously enqueued on the copy stream/queue complete.
// Pairs with acceleratorCopyDeviceToDeviceAsynch above.
// NOTE(review): same caveat as the Asynch routine — these are device-compilation
// macros, so a host-only OMPTARGET build gets an empty body; confirm intended.
inline void acceleratorCopySynchronise(void)
{
//#pragma omp barrier
#ifdef __CUDA_ARCH__
extern cudaStream_t copyStream;
cudaStreamSynchronize(copyStream);
#elif defined __HIP_DEVICE_COMPILE__
extern hipStream_t copyStream;
hipStreamSynchronize(copyStream);
#elif defined __SYCL_DEVICE_ONLY__
theCopyAccelerator->wait();
#endif
};
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
// Fill `bytes` bytes of device memory at `base` with `value`.
// There is no omp_target_memset, so stage the pattern in a host buffer and copy it over.
inline void acceleratorMemSet(void *base,int value,size_t bytes)
{
  void *base_host = memalign(GRID_ALLOC_ALIGN,bytes);
  if( base_host == NULL ) {           // guard: memset/memcpy on NULL would be UB
    printf(" memalign failed in MemSet for %zu bytes \n",bytes);
    return;
  }
  memset(base_host,value,bytes);
  int devc = omp_get_default_device();
  int host = omp_get_initial_device();
  // dst = device buffer `base`, src = staged host buffer (this is a host-to-device copy)
  if( omp_target_memcpy( base, base_host, bytes, 0, 0, devc, host ) ) {
    printf(" omp_target_memcpy host to device failed in MemSet for %zu in device %d \n",bytes,devc);
  }
  free(base_host);  // fix: staging buffer was leaked on every call
};
// Allocate `bytes` bytes of host/device-shared (managed) memory.
// Returns NULL on failure after printing a diagnostic.
inline void *acceleratorAllocShared(size_t bytes)
{
#ifdef __CUDA_ARCH__
  void *ptr=NULL;
  auto err = cudaMallocManaged((void **)&ptr,bytes);
  if( err != cudaSuccess ) {
    ptr = (void *) NULL;
    // %zu for size_t (was %d)
    printf(" cudaMallocManaged failed for %zu %s \n",bytes,cudaGetErrorString(err));
  }
  return ptr;
#elif defined __HIP_DEVICE_COMPILE__
  void *ptr=NULL;
  auto err = hipMallocManaged((void **)&ptr,bytes);
  if( err != hipSuccess ) {
    ptr = (void *) NULL;
    // fix: decode the HIP error with hipGetErrorString (was cudaGetErrorString on a hipError_t)
    printf(" hipMallocManaged failed for %zu %s \n",bytes,hipGetErrorString(err));
  }
  return ptr;
#elif defined __SYCL_DEVICE_ONLY__
  queue q;
  // fix: `ptr` was never declared in this branch (compile error); use the SYCL 2020
  // byte-wise USM allocator. NOTE(review): confirm the intended queue — presumably
  // theGridAccelerator rather than a fresh default queue.
  void *ptr = malloc_shared(bytes, q);
  return ptr;
#else
  int devc = omp_get_default_device();
  void *ptr=NULL;
  ptr = (void *) llvm_omp_target_alloc_shared(bytes, devc);
  if( ptr == NULL ) {
    // %zu for size_t (was %ld)
    printf(" llvm_omp_target_alloc_shared failed for %zu in device %d \n",bytes,devc);
  }
  return ptr;
#endif
};
// Allocate `bytes` bytes of device-only memory on the default OpenMP device.
// Returns NULL on failure after printing a diagnostic.
inline void *acceleratorAllocDevice(size_t bytes)
{
  int devc = omp_get_default_device();
  void *ptr=NULL;
  ptr = (void *) omp_target_alloc(bytes, devc);
  if( ptr == NULL ) {
    // %zu is the correct conversion for size_t (was %ld)
    printf(" omp_target_alloc failed for %zu in device %d \n",bytes,devc);
  }
  return ptr;
};
// Release memory obtained from acceleratorAllocShared (llvm_omp_target_alloc_shared).
inline void acceleratorFreeShared(void *ptr){omp_target_free(ptr, omp_get_default_device());};
// Release memory obtained from acceleratorAllocDevice (omp_target_alloc).
inline void acceleratorFreeDevice(void *ptr){omp_target_free(ptr, omp_get_default_device());};

//OpenMP CPU threads
#else

#define accelerator
#define accelerator_inline strong_inline
Expand Down Expand Up @@ -510,7 +654,14 @@ inline void *acceleratorAllocDevice(size_t bytes){return memalign(GRID_ALLOC_ALI
inline void acceleratorFreeShared(void *ptr){free(ptr);};
inline void acceleratorFreeDevice(void *ptr){free(ptr);};
#endif
#endif

//////////////////////////////////////////////
// CPU Target - No accelerator just thread instead
//////////////////////////////////////////////

#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) ) && (!defined(GRID_OMPTARGET))
#undef GRID_SIMT
#endif // CPU target

#ifdef HAVE_MM_MALLOC_H
Expand Down Expand Up @@ -583,3 +734,5 @@ inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)


NAMESPACE_END(Grid);
#endif

2 changes: 1 addition & 1 deletion Grid/threads/Threads.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#endif

#ifdef GRID_OMP
// Stringize the argument so DO_PRAGMA(omp parallel) emits `#pragma omp parallel`.
// _Pragma("x") would discard the argument and emit the literal pragma `x`,
// silently disabling every pragma issued through this macro.
#define DO_PRAGMA_(x) _Pragma (#x)
#define DO_PRAGMA(x) DO_PRAGMA_(x)
#define thread_num(a) omp_get_thread_num()
#define thread_max(a) omp_get_max_threads()
Expand Down
25 changes: 25 additions & 0 deletions HOWTO
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
1. on Cori GPU, load necessary modules
source ./load_cgpu_modules.sh

2. run the bootstrap script
./bootstrap.sh

3. Create a build directory, for example,
mkdir build-cgpu

4. run the configure script in the build directory
cd build-cgpu
sh config-command

Example config-command for single-GPU omp offload:

../configure \
--enable-comms=none \
--enable-simd=GEN \
--enable-gen-simd-width=16 \
CXX=clang++ \
LDFLAGS="-L${CUDA_ROOT}/lib64 -lcudart" \
CXXFLAGS="-Wno-unknown-cuda-version -I${CUDA_ROOT}/include -fopenmp -std=c++14 -fopenmp-cuda-mode -O3 -g -fopenmp-targets=nvptx64-nvidia-cuda -Wformat -DOMPTARGET -DOMPTARGET_MANAGED"

5. compile
make -j8
27 changes: 27 additions & 0 deletions amd-omp-stack-err/README
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
module load rocm/5.5.1

mkdir build-amd-err && cd build-amd-err

cp ../amd-omp-stack-err/Test.cc ../amd-omp-stack-err/WilsonFermionInstantiationWilsonImplD.cc .

../configure CXX=amdclang++ --enable-comms=none --enable-simd=GEN --enable-accelerator-cshift=no --enable-shm=no --disable-unified --enable-unified=no --enable-fermion-reps=no --enable-gen-simd-width=16 CXXFLAGS="-Wno-unknown-cuda-version -fopenmp --offload-arch=gfx90a -std=c++14 -fopenmp-cuda-mode -O3 -g -Wformat -DEIGEN_NO_CUDA -DEIGEN_DONT_VECTORIZE -DOMPTARGET"

amdclang++ -c Test.cc -o Test.o -I/autofs/nccs-svm1_home1/atif/Grid -I/autofs/nccs-svm1_home1/atif/Grid/build-amd-err/Grid/ -O3 -Wno-unknown-cuda-version -fopenmp --offload-arch=gfx90a -std=c++14 -fopenmp-cuda-mode -O3 -Wformat -DEIGEN_NO_CUDA -DOMPTARGET -fno-strict-aliasing

amdclang++ -c WilsonFermionInstantiationWilsonImplD.cc -o WilsonFails.o -I/autofs/nccs-svm1_home1/atif/Grid -I/autofs/nccs-svm1_home1/atif/Grid/build-amd-err/Grid/ -O3 -Wno-unknown-cuda-version -fopenmp --offload-arch=gfx90a -std=c++14 -fopenmp-cuda-mode -O3 -Wformat -DEIGEN_NO_CUDA -DOMPTARGET -fno-strict-aliasing

ar cru libWilsonFails.a WilsonFails.o

ranlib libWilsonFails.a

amdclang++ -o Test -I/autofs/nccs-svm1_home1/atif/Grid -I/autofs/nccs-svm1_home1/atif/Grid/build-amd-err/Grid/ -O3 -Wno-unknown-cuda-version -fopenmp --offload-arch=gfx90a -std=c++14 -fopenmp-cuda-mode -O3 -Wformat -DEIGEN_NO_CUDA -DOMPTARGET -fno-strict-aliasing Test.o -L./ -lWilsonFails

error: stack frame size (149840) exceeds limit (131056) in function '__omp_offloading_72_1e118ab9__ZN4Grid7LatticeINS_7iScalarINS_7iMatrixINS2_INS_9Grid_simdISt7complexIdENS_12Optimization3vecIdEEEELi3EEELi4EEEEEEaSINS_12TrinaryWhereENS0_INS1_INS3_IjNS7_IjEEEEEEEESD_SD_EERSD_RKNS_24LatticeTrinaryExpressionIT_T0_T1_T2_EE_l190'
error: stack frame size (149840) exceeds limit (131056) in function '__omp_offloading_72_1e118ab9__ZN4Grid7LatticeINS_7iScalarINS_7iMatrixINS2_INS_9Grid_simdISt7complexIdENS_12Optimization3vecIdEEEELi3EEELi4EEEEEEaSINS_12TrinaryWhereENS_23LatticeBinaryExpressionINS_10BinaryOrOrENS0_INS1_INS3_IjNS7_IjEEEEEEEESL_EESD_SD_EERSD_RKNS_24LatticeTrinaryExpressionIT_T0_T1_T2_EE_l190'
error: stack frame size (149840) exceeds limit (131056) in function '__omp_offloading_72_1e118ab9__ZN4Grid7LatticeINS_7iScalarINS_7iMatrixINS2_INS_9Grid_simdISt7complexIdENS_12Optimization3vecIdEEEELi3EEELi4EEEEEEaSINS_9BinaryAddESD_NS_24LatticeTrinaryExpressionINS_12TrinaryWhereENS0_INS1_INS3_IjNS7_IjEEEEEEEESD_SD_EEEERSD_RKNS_23LatticeBinaryExpressionIT_T0_T1_EE_l166'
clang-16: error: amdgcn-link command failed with exit code 1 (use -v to see invocation)


llvm-objdump -t libWilsonFermionWorks2.a > objdump_works2.txt
llvm-cxxfilt < objdump_works2.txt > cxxfilt_works2.txt

Loading