Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
712b326
Added OpenMP target offloading support
meifeng Sep 27, 2021
229ce57
Added example config-command
meifeng Sep 27, 2021
c2f8ba1
Working simple OpenMP offloading with cudaMallocManaged; cshift not w…
meifeng Sep 29, 2021
40ee605
Merge pull request #1 from paboyle/develop
meifeng Dec 13, 2022
c33a3b3
Fixed --accelerator-threads input to omp target thread_limit()
meifeng Dec 13, 2022
9b3ac3c
Added stdout for number of GPU threads;
meifeng Dec 13, 2022
f6661ce
Merged openmp offload implementation with develop
meifeng Dec 13, 2022
1df8669
change loop counts to local variables so clang compiler doesn't complain
meifeng Dec 20, 2022
36ffe79
Add simple HOWTO instructions and module load script for Cori GPU
meifeng Dec 20, 2022
ed72390
Update HOWTO with an example config-command
meifeng Dec 20, 2022
26ad759
bug fix in HOWTO
meifeng Dec 20, 2022
a9df27f
Merge branch 'develop' of https://www.github.com/paboyle/Grid into fe…
meifeng Jan 10, 2023
3671ace
added omp allocators and dev copies
atif4461 Apr 4, 2023
5b50eaa
Merge pull request #5 from atif4461/omp-offload-develop
meifeng Apr 5, 2023
2b6b98b
Merge branch 'feature/omp-offload' of github.com:BNL-HPC/Grid into fe…
meifeng May 5, 2023
cb277ae
added file line traces
atif4461 Jul 3, 2023
ef8af7b
Merge branch 'develop' of https://github.com/paboyle/Grid into develop
atif4461 Jul 3, 2023
2100cc6
fixed conflicts after merging pabyle develop
atif4461 Jul 3, 2023
1bda8c4
fixed conflicts after merging pabyle develop
atif4461 Jul 3, 2023
59dade8
added steps to reproduce amd omp gpu bug
atif4461 Aug 27, 2023
5a5c481
added objdump files
atif4461 Aug 27, 2023
ec2ddda
included pragma map in Lattice_reduction.h
atif4461 Aug 27, 2023
157368e
Merge branch 'omp-offload-develop' of https://github.com/atif4461/Gri…
atif4461 Aug 27, 2023
e5bc517
edited readme amd stack err
atif4461 Sep 9, 2023
867abea
removed print flags
atif4461 Dec 4, 2023
7a7aa61
cleaned up
atif4461 Dec 4, 2023
f516acd
fixed conflicts; su3 working
atif4461 Dec 4, 2023
505cc69
Merge pull request #6 from atif4461/omp-offload-develop
meifeng Dec 14, 2023
7264121
Merge branch 'paboyle:develop' into feature/omp-offload
meifeng Dec 14, 2023
cc5ab62
Merge branch 'feature/omp-offload' of github.com:BNL-HPC/Grid into fe…
meifeng Dec 14, 2023
1381dbc
Revert back to Grid develop version since new LLVM compilers now do n…
meifeng Dec 15, 2023
37d1d87
bug fix for Intel GPUs
Dec 19, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions Grid/allocator/MemoryManager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,9 @@ void MemoryManager::InitMessage(void) {
#ifdef GRID_SYCL
std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_shared"<<std::endl;
#endif
#ifdef GRID_OMPTARGET
std::cout << GridLogMessage<< "MemoryManager::Init() Using OMPTARGET managed memory"<<std::endl;
#endif
#else
std::cout << GridLogMessage<< "MemoryManager::Init() Non unified: Caching accelerator data in dedicated memory"<<std::endl;
#ifdef GRID_CUDA
Expand All @@ -233,6 +236,9 @@ void MemoryManager::InitMessage(void) {
#ifdef GRID_SYCL
std::cout << GridLogMessage<< "MemoryManager::Init() Using SYCL malloc_device"<<std::endl;
#endif
#ifdef GRID_OMPTARGET
std::cout << GridLogMessage<< "MemoryManager::Init() Using OMPTARGET omp_alloc_device"<<std::endl;
#endif
#endif

}
Expand Down
3 changes: 2 additions & 1 deletion Grid/communicator/SharedMemoryNone.cc
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
///////////////////////////////////////////////////////////////////////////////////////////////////////////
// Each MPI rank should allocate our own buffer
///////////////////////////////////////////////////////////////////////////////////////////////////////////
ShmCommBuf = acceleratorAllocDevice(bytes);
ShmCommBuf = acceleratorAllocShared(bytes);
//ShmCommBuf = acceleratorAllocDevice(bytes);

if (ShmCommBuf == (void *)NULL ) {
std::cerr << " SharedMemoryNone.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
Expand Down
25 changes: 16 additions & 9 deletions Grid/lattice/Lattice_reduction.h
Original file line number Diff line number Diff line change
Expand Up @@ -251,10 +251,10 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
autoView( right_v,right, AcceleratorRead);
// This code could read coalesce
// GPU - SIMT lane compliance...
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v(ss);
auto y_l = right_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProductD(x_l,y_l));
accelerator_for( ss, sites, 1,{
auto x_l = left_v[ss];
auto y_l = right_v[ss];
inner_tmp_v[ss]=innerProductD(x_l,y_l);
});
}
#else
Expand All @@ -267,11 +267,18 @@ inline ComplexD rankInnerProduct(const Lattice<vobj> &left,const Lattice<vobj> &
autoView( right_v,right, AcceleratorRead);

// GPU - SIMT lane compliance...
accelerator_for( ss, sites, nsimd,{
auto x_l = left_v(ss);
auto y_l = right_v(ss);
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
});
//accelerator_for( ss, sites, nsimd,{
// auto x_l = left_v(ss);
// auto y_l = right_v(ss);
// coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
//});
#pragma omp target map ( to:left_v, right_v ) map ( tofrom:inner_tmp_v )
#pragma omp teams distribute parallel for thread_limit(THREAD_LIMIT) //nowait
for ( uint64_t ss=0;ss<sites;ss++) {
auto x_l = left_v[ss];
auto y_l = right_v[ss];
coalescedWrite(inner_tmp_v[ss],innerProduct(x_l,y_l));
}
}
#endif
// This is in single precision and fails some tests
Expand Down
5 changes: 4 additions & 1 deletion Grid/lattice/Lattice_reduction_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,10 @@ int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &
#ifdef GRID_HIP
auto r=hipGetDevice(&device);
#endif

#ifdef GRID_OMPTARGET
device = omp_get_device_num();
#endif

Iterator warpSize = gpu_props[device].warpSize;
Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock;
Iterator maxThreadsPerBlock = gpu_props[device].maxThreadsPerBlock;
Expand Down
2 changes: 1 addition & 1 deletion Grid/lattice/Lattice_view.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ class LatticeView : public LatticeAccelerator<vobj>
accelerator_inline uint64_t end(void) const { return this->_odata_size; };
accelerator_inline uint64_t size(void) const { return this->_odata_size; };

LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me){}
LatticeView(const LatticeAccelerator<vobj> &refer_to_me) : LatticeAccelerator<vobj> (refer_to_me){ }
LatticeView(const LatticeView<vobj> &refer_to_me) = default; // Trivially copyable
LatticeView(const LatticeAccelerator<vobj> &refer_to_me,ViewMode mode) : LatticeAccelerator<vobj> (refer_to_me)
{
Expand Down
11 changes: 7 additions & 4 deletions Grid/threads/Accelerator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@ void acceleratorThreads(uint32_t t) {accelerator_threads = t;};
#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"

#ifdef GRID_CUDA

// fold omptarget into device specific acceleratorInit()
#if defined(GRID_CUDA) || (defined(GRID_OMPTARGET) && defined(__CUDA_ARCH__))
#include <cuda_runtime_api.h>
cudaDeviceProp *gpu_props;
cudaStream_t copyStream;
cudaStream_t computeStream;
Expand Down Expand Up @@ -113,7 +116,7 @@ void acceleratorInit(void)
}
#endif

#ifdef GRID_HIP
#if defined(GRID_HIP) || (defined(GRID_OMPTARGET) && defined(__HIP_DEVICE_COMPILE__))
hipDeviceProp_t *gpu_props;
hipStream_t copyStream;
hipStream_t computeStream;
Expand Down Expand Up @@ -198,7 +201,7 @@ void acceleratorInit(void)
#endif


#ifdef GRID_SYCL
#if defined(GRID_SYCL) //|| (defined(GRID_OMPTARGET) && defined(__SYCL_DEVICE_ONLY__))

cl::sycl::queue *theGridAccelerator;
cl::sycl::queue *theCopyAccelerator;
Expand Down Expand Up @@ -270,7 +273,7 @@ void acceleratorInit(void)
}
#endif

#if (!defined(GRID_CUDA)) && (!defined(GRID_SYCL))&& (!defined(GRID_HIP))
#if (!defined(GRID_CUDA)) && (!defined(GRID_SYCL))&& (!defined(GRID_HIP))// && (!defined(GRID_OMPTARGET))
void acceleratorInit(void){}
#endif

Expand Down
161 changes: 157 additions & 4 deletions Grid/threads/Accelerator.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,11 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
See the full license in the file "LICENSE" in the top level distribution directory
*************************************************************************************/
/* END LEGAL */
#pragma once

#ifndef ACCELERATOR_H
#define ACCELERATOR_H

#pragma once
#include <string.h>

#ifdef HAVE_MALLOC_MALLOC_H
Expand Down Expand Up @@ -474,14 +477,155 @@ inline void acceleratorCopySynchronise(void) { auto r=hipStreamSynchronize(copyS
#endif

//////////////////////////////////////////////
// CPU Target - No accelerator just thread instead
// OpenMP Target acceleration
//////////////////////////////////////////////
#ifdef GRID_OMPTARGET
//TODO GRID_SIMT for OMPTARGET
#define GRID_ACCELERATED
#include<omp.h>
#ifdef __CUDA_ARCH__
#include <cuda_runtime_api.h>
#elif defined __HIP_DEVICE_COMPILE__
#include <hip/hip_runtime.h>
#elif defined __SYCL_DEVICE_ONLY__
#include <CL/sycl.hpp>
#include <CL/sycl/usm.hpp>
#endif
extern "C" void *llvm_omp_target_alloc_host (size_t Size, int DeviceNum);
extern "C" void *llvm_omp_target_alloc_device(size_t Size, int DeviceNum);
extern "C" void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum);
//TODO: Dynamic Shared Memory

#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) )
#define THREAD_LIMIT acceleratorThreads()

#undef GRID_SIMT
#define accelerator
#define accelerator_inline strong_inline
// Loop-offload macros for the OMPTARGET backend. Each accelerator_for maps the
// body onto an `omp target teams distribute parallel for` region; `nsimd` is
// accepted for interface compatibility with the CUDA/HIP/SYCL backends but is
// unused here (no SIMT lane splitting in this backend).
// NOTE(review): THREAD_LIMIT is #defined unconditionally a few lines above, so
// the #else branch below is unreachable in this configuration — confirm whether
// the fallback (no thread_limit clause) is still wanted.
// NOTE(review): thread_limit(THREAD_LIMIT) inside the _Pragma string relies on
// macro expansion of tokens in the `#pragma omp` directive (mandated by the
// OpenMP spec) so that THREAD_LIMIT becomes acceleratorThreads() — verify with
// the target compilers.
#ifdef THREAD_LIMIT
// Blocking offloaded loop over [0, num).
#define accelerator_for(i,num,nsimd, ... ) \
_Pragma("omp target teams distribute parallel for thread_limit(THREAD_LIMIT)") \
for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
// Non-blocking variant: `nowait` lets the host continue past the target region.
#define accelerator_forNB(i,num,nsimd, ... ) \
_Pragma("omp target teams distribute parallel for thread_limit(THREAD_LIMIT) nowait") \
for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
// NOTE(review): `omp barrier` outside a parallel region is a no-op and does not
// wait for the `nowait` target tasks launched by accelerator_forNB; presumably
// `omp taskwait` is intended here — confirm.
#define accelerator_barrier(dummy) _Pragma("omp barrier")
// 2-D offloaded loop; collapse(2) fuses both loops into one iteration space.
#define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) \
_Pragma("omp target teams distribute parallel for thread_limit(THREAD_LIMIT) collapse(2)") \
for ( uint64_t iter1=0;iter1<num1;iter1++) \
for ( uint64_t iter2=0;iter2<num2;iter2++) { __VA_ARGS__ } ;
#else
// Fallback versions without a thread_limit clause (currently unreachable, see NOTE above).
#define accelerator_for(i,num,nsimd, ... ) \
_Pragma("omp target teams distribute parallel for") \
for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
#define accelerator_forNB(i,num,nsimd, ... ) \
_Pragma("omp target teams distribute parallel for nowait") \
for ( uint64_t i=0;i<num;i++) { __VA_ARGS__ } ;
#define accelerator_barrier(dummy) _Pragma("omp barrier")
#define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) \
_Pragma("omp target teams distribute parallel for collapse(2)") \
for ( uint64_t iter1=0;iter1<num1;iter1++) \
for ( uint64_t iter2=0;iter2<num2;iter2++) { __VA_ARGS__ } ;
#endif

accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // CUDA specific
// Blocking copy of `bytes` bytes from host buffer `from` to device buffer `to`.
// omp_target_memcpy(dst, src, length, dst_offset, src_offset, dst_device, src_device)
// returns zero on success, non-zero on failure.
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)
{
  int devc = omp_get_default_device();
  int host = omp_get_initial_device();
  if( omp_target_memcpy( to, from, bytes, 0, 0, devc, host ) ) {
    // %zu is the correct conversion for size_t (was %ld: UB on platforms where long != size_t)
    printf(" omp_target_memcpy host to device failed for %zu in device %d \n",bytes,devc);
  }
};
// Blocking copy of `bytes` bytes from device buffer `from` back to host buffer `to`.
// Note the argument order: omp_target_memcpy takes (dst, src, ...), so destination
// device is the host and source device is the default (accelerator) device.
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes)
{
  int devc = omp_get_default_device();
  int host = omp_get_initial_device();
  if( omp_target_memcpy( to, from, bytes, 0, 0, host, devc ) ) {
    // %zu is the correct conversion for size_t (was %ld: UB on platforms where long != size_t)
    printf(" omp_target_memcpy device to host failed for %zu in device %d \n",bytes,devc);
  }
};
// Start an asynchronous device-to-device copy on the dedicated copy stream/queue.
// NOTE(review): __CUDA_ARCH__, __HIP_DEVICE_COMPILE__ and __SYCL_DEVICE_ONLY__ are
// *device-compilation* macros; in a host-only OMPTARGET compile none of them is
// defined and this function compiles to an empty body — confirm that a no-op copy
// is intended in that configuration.
inline void acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes)
{
#ifdef __CUDA_ARCH__
// CUDA backend: enqueue on copyStream (declared in Accelerator.cc) so the copy can
// overlap with compute work on other streams.
extern cudaStream_t copyStream;
cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
#elif defined __HIP_DEVICE_COMPILE__
extern hipStream_t copyStream;
hipMemcpyDtoDAsync(to,from,bytes, copyStream);
#elif defined __SYCL_DEVICE_ONLY__
// SYCL backend: theCopyAccelerator is the dedicated copy queue.
theCopyAccelerator->memcpy(to,from,bytes);
#endif
};
// Block until all copies previously enqueued on the copy stream/queue complete.
// Pairs with acceleratorCopyDeviceToDeviceAsynch above.
// NOTE(review): same caveat as the Asynch routine — these are device-compilation
// macros, so a host-only OMPTARGET build gets an empty body; confirm intended.
inline void acceleratorCopySynchronise(void)
{
//#pragma omp barrier
#ifdef __CUDA_ARCH__
extern cudaStream_t copyStream;
cudaStreamSynchronize(copyStream);
#elif defined __HIP_DEVICE_COMPILE__
extern hipStream_t copyStream;
hipStreamSynchronize(copyStream);
#elif defined __SYCL_DEVICE_ONLY__
theCopyAccelerator->wait();
#endif
};
inline int acceleratorIsCommunicable(void *ptr){ return 1; }
// Fill `bytes` bytes of device memory at `base` with `value`.
// There is no omp_target_memset, so stage the pattern in a host buffer and copy it over.
inline void acceleratorMemSet(void *base,int value,size_t bytes)
{
  void *base_host = memalign(GRID_ALLOC_ALIGN,bytes);
  if( base_host == NULL ) {           // guard: memset/memcpy on NULL would be UB
    printf(" memalign failed in MemSet for %zu bytes \n",bytes);
    return;
  }
  memset(base_host,value,bytes);
  int devc = omp_get_default_device();
  int host = omp_get_initial_device();
  // dst = device buffer `base`, src = staged host buffer (this is a host-to-device copy)
  if( omp_target_memcpy( base, base_host, bytes, 0, 0, devc, host ) ) {
    printf(" omp_target_memcpy host to device failed in MemSet for %zu in device %d \n",bytes,devc);
  }
  free(base_host);  // fix: staging buffer was leaked on every call
};
// Allocate `bytes` bytes of host/device-shared (managed) memory.
// Returns NULL on failure after printing a diagnostic.
inline void *acceleratorAllocShared(size_t bytes)
{
#ifdef __CUDA_ARCH__
  void *ptr=NULL;
  auto err = cudaMallocManaged((void **)&ptr,bytes);
  if( err != cudaSuccess ) {
    ptr = (void *) NULL;
    // %zu for size_t (was %d)
    printf(" cudaMallocManaged failed for %zu %s \n",bytes,cudaGetErrorString(err));
  }
  return ptr;
#elif defined __HIP_DEVICE_COMPILE__
  void *ptr=NULL;
  auto err = hipMallocManaged((void **)&ptr,bytes);
  if( err != hipSuccess ) {
    ptr = (void *) NULL;
    // fix: decode the HIP error with hipGetErrorString (was cudaGetErrorString on a hipError_t)
    printf(" hipMallocManaged failed for %zu %s \n",bytes,hipGetErrorString(err));
  }
  return ptr;
#elif defined __SYCL_DEVICE_ONLY__
  queue q;
  // fix: `ptr` was never declared in this branch (compile error); use the SYCL 2020
  // byte-wise USM allocator. NOTE(review): confirm the intended queue — presumably
  // theGridAccelerator rather than a fresh default queue.
  void *ptr = malloc_shared(bytes, q);
  return ptr;
#else
  int devc = omp_get_default_device();
  void *ptr=NULL;
  ptr = (void *) llvm_omp_target_alloc_shared(bytes, devc);
  if( ptr == NULL ) {
    // %zu for size_t (was %ld)
    printf(" llvm_omp_target_alloc_shared failed for %zu in device %d \n",bytes,devc);
  }
  return ptr;
#endif
};
// Allocate `bytes` bytes of device-only memory on the default OpenMP device.
// Returns NULL on failure after printing a diagnostic.
inline void *acceleratorAllocDevice(size_t bytes)
{
  int devc = omp_get_default_device();
  void *ptr=NULL;
  ptr = (void *) omp_target_alloc(bytes, devc);
  if( ptr == NULL ) {
    // %zu is the correct conversion for size_t (was %ld)
    printf(" omp_target_alloc failed for %zu in device %d \n",bytes,devc);
  }
  return ptr;
};
// Release memory obtained from acceleratorAllocShared (llvm_omp_target_alloc_shared).
inline void acceleratorFreeShared(void *ptr){omp_target_free(ptr, omp_get_default_device());};
// Release memory obtained from acceleratorAllocDevice (omp_target_alloc).
inline void acceleratorFreeDevice(void *ptr){omp_target_free(ptr, omp_get_default_device());};

//OpenMP CPU threads
#else

#define accelerator
#define accelerator_inline strong_inline
Expand Down Expand Up @@ -510,7 +654,14 @@ inline void *acceleratorAllocDevice(size_t bytes){return memalign(GRID_ALLOC_ALI
inline void acceleratorFreeShared(void *ptr){free(ptr);};
inline void acceleratorFreeDevice(void *ptr){free(ptr);};
#endif
#endif

//////////////////////////////////////////////
// CPU Target - No accelerator just thread instead
//////////////////////////////////////////////

#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) ) && (!defined(GRID_OMPTARGET))
#undef GRID_SIMT
#endif // CPU target

#ifdef HAVE_MM_MALLOC_H
Expand Down Expand Up @@ -583,3 +734,5 @@ inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)


NAMESPACE_END(Grid);
#endif

2 changes: 1 addition & 1 deletion Grid/threads/Threads.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ Author: paboyle <paboyle@ph.ed.ac.uk>
#endif

#ifdef GRID_OMP
// Stringize the argument so DO_PRAGMA(omp parallel) emits `#pragma omp parallel`.
// _Pragma("x") would discard the argument and emit the literal pragma `x`,
// silently disabling every pragma issued through this macro.
#define DO_PRAGMA_(x) _Pragma (#x)
#define DO_PRAGMA(x) DO_PRAGMA_(x)
#define thread_num(a) omp_get_thread_num()
#define thread_max(a) omp_get_max_threads()
Expand Down
25 changes: 25 additions & 0 deletions HOWTO
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
1. on Cori GPU, load necessary modules
source ./load_cgpu_modules.sh

2. run the bootstrap script
./bootstrap.sh

3. Create a build directory, for example,
mkdir build-cgpu

4. run the configure script in the build directory
cd build-cgpu
sh config-command

Example config-command for single-GPU omp offload:

../configure \
--enable-comms=none \
--enable-simd=GEN \
--enable-gen-simd-width=16 \
CXX=clang++ \
LDFLAGS="-L${CUDA_ROOT}/lib64 -lcudart" \
CXXFLAGS="-Wno-unknown-cuda-version -I${CUDA_ROOT}/include -fopenmp -std=c++14 -fopenmp-cuda-mode -O3 -g -fopenmp-targets=nvptx64-nvidia-cuda -Wformat -DOMPTARGET -DOMPTARGET_MANAGED"

5. compile
make -j8
27 changes: 27 additions & 0 deletions amd-omp-stack-err/README
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
module load rocm/5.5.1

mkdir build-amd-err && cd build-amd-err

cp ../amd-omp-stack-err/Test.cc ../amd-omp-stack-err/WilsonFermionInstantiationWilsonImplD.cc .

../configure CXX=amdclang++ --enable-comms=none --enable-simd=GEN --enable-accelerator-cshift=no --enable-shm=no --disable-unified --enable-unified=no --enable-fermion-reps=no --enable-gen-simd-width=16 CXXFLAGS="-Wno-unknown-cuda-version -fopenmp --offload-arch=gfx90a -std=c++14 -fopenmp-cuda-mode -O3 -g -Wformat -DEIGEN_NO_CUDA -DEIGEN_DONT_VECTORIZE -DOMPTARGET"

amdclang++ -c Test.cc -o Test.o -I/autofs/nccs-svm1_home1/atif/Grid -I/autofs/nccs-svm1_home1/atif/Grid/build-amd-err/Grid/ -O3 -Wno-unknown-cuda-version -fopenmp --offload-arch=gfx90a -std=c++14 -fopenmp-cuda-mode -O3 -Wformat -DEIGEN_NO_CUDA -DOMPTARGET -fno-strict-aliasing

amdclang++ -c WilsonFermionInstantiationWilsonImplD.cc -o WilsonFails.o -I/autofs/nccs-svm1_home1/atif/Grid -I/autofs/nccs-svm1_home1/atif/Grid/build-amd-err/Grid/ -O3 -Wno-unknown-cuda-version -fopenmp --offload-arch=gfx90a -std=c++14 -fopenmp-cuda-mode -O3 -Wformat -DEIGEN_NO_CUDA -DOMPTARGET -fno-strict-aliasing

ar cru libWilsonFails.a WilsonFails.o

ranlib libWilsonFails.a

amdclang++ -o Test -I/autofs/nccs-svm1_home1/atif/Grid -I/autofs/nccs-svm1_home1/atif/Grid/build-amd-err/Grid/ -O3 -Wno-unknown-cuda-version -fopenmp --offload-arch=gfx90a -std=c++14 -fopenmp-cuda-mode -O3 -Wformat -DEIGEN_NO_CUDA -DOMPTARGET -fno-strict-aliasing Test.o -L./ -lWilsonFails

error: stack frame size (149840) exceeds limit (131056) in function '__omp_offloading_72_1e118ab9__ZN4Grid7LatticeINS_7iScalarINS_7iMatrixINS2_INS_9Grid_simdISt7complexIdENS_12Optimization3vecIdEEEELi3EEELi4EEEEEEaSINS_12TrinaryWhereENS0_INS1_INS3_IjNS7_IjEEEEEEEESD_SD_EERSD_RKNS_24LatticeTrinaryExpressionIT_T0_T1_T2_EE_l190'
error: stack frame size (149840) exceeds limit (131056) in function '__omp_offloading_72_1e118ab9__ZN4Grid7LatticeINS_7iScalarINS_7iMatrixINS2_INS_9Grid_simdISt7complexIdENS_12Optimization3vecIdEEEELi3EEELi4EEEEEEaSINS_12TrinaryWhereENS_23LatticeBinaryExpressionINS_10BinaryOrOrENS0_INS1_INS3_IjNS7_IjEEEEEEEESL_EESD_SD_EERSD_RKNS_24LatticeTrinaryExpressionIT_T0_T1_T2_EE_l190'
error: stack frame size (149840) exceeds limit (131056) in function '__omp_offloading_72_1e118ab9__ZN4Grid7LatticeINS_7iScalarINS_7iMatrixINS2_INS_9Grid_simdISt7complexIdENS_12Optimization3vecIdEEEELi3EEELi4EEEEEEaSINS_9BinaryAddESD_NS_24LatticeTrinaryExpressionINS_12TrinaryWhereENS0_INS1_INS3_IjNS7_IjEEEEEEEESD_SD_EEEERSD_RKNS_23LatticeBinaryExpressionIT_T0_T1_EE_l166'
clang-16: error: amdgcn-link command failed with exit code 1 (use -v to see invocation)


llvm-objdump -t libWilsonFermionWorks2.a > objdump_works2.txt
llvm-cxxfilt < objdump_works2.txt > cxxfilt_works2.txt

Loading