diff --git a/CMakeLists.txt b/CMakeLists.txt index 7f805145..a1d2b2cc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,11 +14,29 @@ set(DEME_VERSION_MAJOR 2) set(DEME_VERSION_MINOR 1) set(DEME_VERSION_PATCH 0) -project( - Chrono-DEM-Engine - VERSION ${DEME_VERSION_MAJOR}.${DEME_VERSION_MINOR}.${DEME_VERSION_PATCH} - LANGUAGES CXX CUDA -) +# HIP/ROCm support option (must be set before project() to select the right language) +option(USE_HIP "Build with HIP for AMD GPUs" OFF) + +if(USE_HIP) + project( + Chrono-DEM-Engine + VERSION ${DEME_VERSION_MAJOR}.${DEME_VERSION_MINOR}.${DEME_VERSION_PATCH} + LANGUAGES CXX HIP + ) + # Set default HIP architecture if not specified + if(NOT DEFINED CMAKE_HIP_ARCHITECTURES OR CMAKE_HIP_ARCHITECTURES STREQUAL "") + set(CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "HIP architectures") + endif() + add_compile_definitions(USE_HIP) + # CXX files that include hip_runtime.h need this defined (HIP compiler defines it automatically) + add_compile_definitions(__HIP_PLATFORM_AMD__) +else() + project( + Chrono-DEM-Engine + VERSION ${DEME_VERSION_MAJOR}.${DEME_VERSION_MINOR}.${DEME_VERSION_PATCH} + LANGUAGES CXX CUDA + ) +endif() if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) message(STATUS "Setting CMAKE_BUILD_TYPE to 'Release' (default)") @@ -41,20 +59,29 @@ include(cmake/FixNinjaColors.cmake) fix_ninja_colors() -find_package(CUDAToolkit REQUIRED) - -# Find CUB library (this might need to be done in source-level config) -find_package( - CUB REQUIRED - HINTS ${CUDAToolkit_ROOT}/lib64/cmake/cub -) +if(USE_HIP) + # Find hipCUB for HIP builds + find_package(hipcub REQUIRED) + find_package(hip REQUIRED) +else() + find_package(CUDAToolkit REQUIRED) + # Find CUB library (this might need to be done in source-level config) + find_package( + CUB REQUIRED + HINTS ${CUDAToolkit_ROOT}/lib64/cmake/cub + ) +endif() -# Find NVIDIA's Jitify library -find_path( - NVIDIAJitifyPath - NAMES jitify.hpp - PATHS "${CMAKE_CURRENT_LIST_DIR}/thirdparty/jitify" -) +# Find Jitify library (NVIDIA version only; HIP uses hiprtc directly via JitKernel abstraction) +if(NOT USE_HIP) + find_path( + JitifyPath + NAMES jitify.hpp + PATHS "${CMAKE_CURRENT_LIST_DIR}/thirdparty/jitify" + ) + # Keep backward compat alias + set(NVIDIAJitifyPath ${JitifyPath}) +endif() # Let the user decide if they want to use ChPF option(USE_CHPF "Toggle the use of ChPF for outputting" OFF) @@ -119,8 +146,13 @@ cxx_std_autodetect() set(ProjectIncludeSource "${CMAKE_CURRENT_SOURCE_DIR}/src") set(ProjectIncludeGenerated "${CMAKE_BINARY_DIR}/src") -# Global fix for CUDA language bug -include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) +# Global fix for CUDA/HIP language bug +if(USE_HIP) + # Add HIP include directories for all targets (both CXX and HIP) + include_directories(${HIP_INCLUDE_DIR}) +else() + include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) +endif() #------------------------------------------------------------ @@ -164,22 +196,41 @@ if(USE_CHPF) target_compile_definitions(simulator_multi_gpu PUBLIC DEME_USE_CHPF) set(USE_CHPF_STR "ON") - target_link_libraries(simulator_multi_gpu - PUBLIC CUDA::cudart - PUBLIC CUDA::nvrtc - PUBLIC CUDA::cuda_driver - PUBLIC ${ChPF_IMPORTED_NAME} - PUBLIC DEMERuntimeDataHelper - ) + if(USE_HIP) + find_library(HIPRTC_LIB hiprtc HINTS ${ROCM_PATH}/lib /opt/rocm/lib REQUIRED) + target_link_libraries(simulator_multi_gpu + PUBLIC hip::host + PUBLIC ${HIPRTC_LIB} + PUBLIC ${ChPF_IMPORTED_NAME} + PUBLIC DEMERuntimeDataHelper + ) + else() + target_link_libraries(simulator_multi_gpu + PUBLIC CUDA::cudart + PUBLIC CUDA::nvrtc + PUBLIC CUDA::cuda_driver + PUBLIC ${ChPF_IMPORTED_NAME} + PUBLIC DEMERuntimeDataHelper + ) + endif() else() set(USE_CHPF_STR "OFF") - target_link_libraries(simulator_multi_gpu - PUBLIC CUDA::cudart - PUBLIC CUDA::nvrtc - PUBLIC CUDA::cuda_driver - PUBLIC DEMERuntimeDataHelper - ) + if(USE_HIP) + find_library(HIPRTC_LIB hiprtc HINTS ${ROCM_PATH}/lib /opt/rocm/lib REQUIRED) + target_link_libraries(simulator_multi_gpu + PUBLIC hip::host + PUBLIC ${HIPRTC_LIB} + PUBLIC DEMERuntimeDataHelper + ) + else() + target_link_libraries(simulator_multi_gpu + PUBLIC CUDA::cudart + PUBLIC CUDA::nvrtc + PUBLIC CUDA::cuda_driver + PUBLIC DEMERuntimeDataHelper + ) + endif() endif() # If use managed arrays, define a macro @@ -194,10 +245,17 @@ if(WIN32) endif() # Attach include directories to the top-level library target -set_target_properties(simulator_multi_gpu - PROPERTIES - LINKER_LANGUAGE CUDA -) +if(USE_HIP) + set_target_properties(simulator_multi_gpu + PROPERTIES + LINKER_LANGUAGE HIP + ) +else() + set_target_properties(simulator_multi_gpu + PROPERTIES + LINKER_LANGUAGE CUDA + ) +endif() # ---------------------------------------------------------------------------- # # Export and Install The Generated Targets diff --git a/README.md b/README.md index f6d3f9b0..45011764 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ __A dual-GPU DEM solver with complex grain geometry support__ DEM-Engine, nicknamed _DEME_, does Discrete Element Method simulations: -- Using up to two GPUs at the same time (works great on consumer _and_ data center GPUs). +- Using up to two GPUs at the same time (works great on consumer _and_ data center GPUs, on both NVIDIA CUDA and AMD ROCm). - With the particles having complex shapes represented by clumped spheres. - With support for customizable contact force models (want to add a non-standard cohesive force, or an electrostatic repulsive force? You got this). - With an emphasis on computational efficiency. As a rule of thumb, using 3-sphere clump elements, simulating 1 million elements for 1 million time steps takes around 1 hour on two RTX 3080s. @@ -139,6 +139,21 @@ Some additional troubleshooting tips for building the project: - If CUB is not found, then you may manually set it in the `ccmake` GUI as `/usr/local/cuda/lib64/cmake/cub`. It may be a slightly different path on your machine or cluster. - If `libcudacxx` is not found, then you may manually set it in the `ccmake` GUI as `/usr/local/cuda-12.8/targets/x86_64-linux/lib/cmake/libcudacxx`. Depending on your CUDA version it may be a slightly different path on your machine or cluster. You may also try to find these packages using `find`. +### AMD GPUs (ROCm) + +_DEME_ also builds and runs on AMD GPUs through ROCm/HIP. Instead of CUDA, install a recent [ROCm](https://rocm.docs.amd.com/) release (the HIP runtime, `hiprtc`, and `hipCUB` are required). You do not need the NVIDIA/jitify submodule for an AMD build; runtime kernel compilation is handled by `hiprtc`. + +Configure the project the same way as in the **Linux and WSL** section, but pass `-DUSE_HIP=ON`, and set `CMAKE_HIP_ARCHITECTURES` to your GPU's architecture (for example `gfx90a` for MI200-series, `gfx1100` for RDNA3 desktop, or `gfx1201` for RDNA4). An example: + +``` +mkdir build +cd build +cmake -DCMAKE_BUILD_TYPE=Release -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx90a -DCMAKE_PREFIX_PATH=/opt/rocm .. +ninja +``` + +If `CMAKE_HIP_ARCHITECTURES` is left unset it defaults to `gfx90a`. You can list the architecture of an installed device with `rocminfo`. If the ROCm install is not on CMake's default search path, point `-DCMAKE_PREFIX_PATH` at it (e.g. `/opt/rocm`) so `find_package` can locate hip and hipCUB. The demos are then run exactly as in the **Numerical examples** section. + ### Windows The process is similar to [the installation of Chrono](https://api.projectchrono.org/tutorial_install_chrono.html), which you can use as reference. The steps depend on your choice of tools, and what listed here are our recommendation. diff --git a/src/DEM/API.h b/src/DEM/API.h index c1ff1665..67575c3d 100644 --- a/src/DEM/API.h +++ b/src/DEM/API.h @@ -1370,12 +1370,12 @@ class DEMSolver { void PrintKinematicScratchSpaceUsage() const { kT->printScratchSpaceUsage(); } /// Let dT do this call and return the reduce value of the inspected quantity. - float dTInspectReduce(const std::shared_ptr& inspection_kernel, + float dTInspectReduce(const std::shared_ptr& inspection_kernel, const std::string& kernel_name, INSPECT_ENTITY_TYPE thing_to_insp, CUB_REDUCE_FLAVOR reduce_flavor, bool all_domain); - float* dTInspectNoReduce(const std::shared_ptr& inspection_kernel, + float* dTInspectNoReduce(const std::shared_ptr& inspection_kernel, const std::string& kernel_name, INSPECT_ENTITY_TYPE thing_to_insp, CUB_REDUCE_FLAVOR reduce_flavor, @@ -1509,7 +1509,9 @@ class DEMSolver { int m_updateFreq = 20; // The extra libs that the kernels need to include. - std::string kernel_includes = "#include \n"; + // Default: none (curand/hiprand not used by built-in kernels). + // Users can add custom includes via SetKernelInclude() if needed. + std::string kernel_includes = ""; // If and how we should add boundaries to the simulation world upon initialization. Choose between none, all and // top_open. diff --git a/src/DEM/APIPrivate.cpp b/src/DEM/APIPrivate.cpp index 08250829..eebab753 100644 --- a/src/DEM/APIPrivate.cpp +++ b/src/DEM/APIPrivate.cpp @@ -2121,7 +2121,20 @@ inline void DEMSolver::equipSimParams(std::unordered_map( + DEME_MIN(DEME_MIN(runtimeWarpSize, DEME_KT_CD_NTHREADS_PER_BLOCK), DEME_NUM_BODIES_PER_BLOCK)); + strMap["_nActiveLoadingThreads_"] = std::to_string(nActiveLoadingThreads); // nTotalBodyTopologies includes clump topologies and ext obj topologies strMap["_nDistinctMassProperties_"] = std::to_string(nDistinctMassProperties); strMap["_nJitifiableClumpComponents_"] = std::to_string(nJitifiableClumpComponents); diff --git a/src/DEM/APIPublic.cpp b/src/DEM/APIPublic.cpp index 41b5c006..93b37abe 100644 --- a/src/DEM/APIPublic.cpp +++ b/src/DEM/APIPublic.cpp @@ -2526,7 +2526,7 @@ void DEMSolver::ClearThreadCollaborationStats() { dT->nTotalSteps = 0; } -float DEMSolver::dTInspectReduce(const std::shared_ptr& inspection_kernel, +float DEMSolver::dTInspectReduce(const std::shared_ptr& inspection_kernel, const std::string& kernel_name, INSPECT_ENTITY_TYPE thing_to_insp, CUB_REDUCE_FLAVOR reduce_flavor, @@ -2537,7 +2537,7 @@ float DEMSolver::dTInspectReduce(const std::shared_ptr& inspect return (float)(*pRes); } -float* DEMSolver::dTInspectNoReduce(const std::shared_ptr& inspection_kernel, +float* DEMSolver::dTInspectNoReduce(const std::shared_ptr& inspection_kernel, const std::string& kernel_name, INSPECT_ENTITY_TYPE thing_to_insp, CUB_REDUCE_FLAVOR reduce_flavor, diff --git a/src/DEM/AuxClasses.cpp b/src/DEM/AuxClasses.cpp index 5e2bb13b..a000f38a 100644 --- a/src/DEM/AuxClasses.cpp +++ b/src/DEM/AuxClasses.cpp @@ -229,10 +229,10 @@ void DEMInspector::Initialize(const std::unordered_map my_subs["_inRegionPolicy_"] = in_region_specifier; my_subs["_quantityQueryProcess_"] = inspection_code; if (thing_to_insp == INSPECT_ENTITY_TYPE::SPHERE) { - inspection_kernel = std::make_shared(std::move(JitHelper::buildProgram( + inspection_kernel = std::make_shared(std::move(JitHelper::buildProgram( "DEMSphereQueryKernels", JitHelper::KERNEL_DIR / "DEMSphereQueryKernels.cu", my_subs, options))); } else if (thing_to_insp == INSPECT_ENTITY_TYPE::CLUMP || thing_to_insp == INSPECT_ENTITY_TYPE::EVERYTHING) { - inspection_kernel = std::make_shared(std::move(JitHelper::buildProgram( + inspection_kernel = std::make_shared(std::move(JitHelper::buildProgram( "DEMOwnerQueryKernels", JitHelper::KERNEL_DIR / "DEMOwnerQueryKernels.cu", my_subs, options))); } else { std::stringstream ss; diff --git a/src/DEM/AuxClasses.h b/src/DEM/AuxClasses.h index 310d1b4b..09baada1 100644 --- a/src/DEM/AuxClasses.h +++ b/src/DEM/AuxClasses.h @@ -11,11 +11,6 @@ #include "../core/utils/JitHelper.h" #include "Defines.h" -// Forward declare jitify::Program to avoid downstream dependency -namespace jitify { -class Program; -} - namespace deme { class DEMSolver; @@ -25,7 +20,7 @@ class DEMDynamicThread; /// their simulation entites, in a given region. class DEMInspector { private: - std::shared_ptr inspection_kernel; + std::shared_ptr inspection_kernel; std::string inspection_code; std::string in_region_code; diff --git a/src/DEM/CMakeLists.txt b/src/DEM/CMakeLists.txt index 43787a51..7cb96d09 100644 --- a/src/DEM/CMakeLists.txt +++ b/src/DEM/CMakeLists.txt @@ -13,17 +13,13 @@ target_include_directories( PUBLIC ${ProjectIncludeGenerated} ) +# DEM is pure CXX, no GPU library dependencies. +# CUB/hipcub is only needed by algorithms target. if(USE_CHPF) target_link_libraries( DEM - PUBLIC CUB::CUB PUBLIC ${ChPF_IMPORTED_NAME} ) -else() - target_link_libraries( - DEM - PUBLIC CUB::CUB - ) endif() # if(WIN32) diff --git a/src/DEM/Defines.h b/src/DEM/Defines.h index c2a8de8e..68ffdfb8 100644 --- a/src/DEM/Defines.h +++ b/src/DEM/Defines.h @@ -12,7 +12,7 @@ #include #include "VariableTypes.h" -#include "cuda_runtime.h" +#include #define DEME_MIN(a, b) ((a < b) ? a : b) #define DEME_MAX(a, b) ((a > b) ? a : b) @@ -29,7 +29,17 @@ namespace deme { #define DEME_TINY_FLOAT 1e-12 ///< Appears to be very sensitive to even smaller values... #define DEME_HUGE_FLOAT 1e15 #define DEME_BITS_PER_BYTE 8 -#define DEME_CUDA_WARP_SIZE 32 +// Wavefront/warp size: 64 only on AMD wave64 device code (gfx8/gfx9 GCN/CDNA); 32 everywhere +// else (AMD RDNA, NVIDIA, and host code). ROCm 7.2.x does not provide __AMDGCN_WAVEFRONT_SIZE__, +// so device code keys off the __GFX*__ macros. For JIT kernels the actual device warp size is +// queried at runtime and substituted. +#if defined(__HIP_DEVICE_COMPILE__) && (defined(__GFX8__) || defined(__GFX9__)) + #define DEME_WARP_SIZE 64 +#else + #define DEME_WARP_SIZE 32 +#endif +// Legacy macro for compatibility +#define DEME_CUDA_WARP_SIZE DEME_WARP_SIZE #define DEME_MAX_WILDCARD_NUM 16 // In bin--triangle intersection scan, all bins are enlarged by a factor of this following constant, so that no triangle // lies in between bins and not picked up by any bins. diff --git a/src/DEM/dT.cpp b/src/DEM/dT.cpp index 0d047bb5..09bfc364 100644 --- a/src/DEM/dT.cpp +++ b/src/DEM/dT.cpp @@ -2519,41 +2519,41 @@ void DEMDynamicThread::jitifyKernels(const std::unordered_map& JitifyOptions) { // First one is force array preparation kernels { - prep_force_kernels = std::make_shared(std::move(JitHelper::buildProgram( + prep_force_kernels = std::make_shared(std::move(JitHelper::buildProgram( "DEMPrepForceKernels", JitHelper::KERNEL_DIR / "DEMPrepForceKernels.cu", Subs, JitifyOptions))); } // Then force calculation kernels { - cal_force_kernels = std::make_shared(std::move(JitHelper::buildProgram( + cal_force_kernels = std::make_shared(std::move(JitHelper::buildProgram( "DEMCalcForceKernels", JitHelper::KERNEL_DIR / "DEMCalcForceKernels.cu", Subs, JitifyOptions))); } // Then force accumulation kernels if (solverFlags.useCubForceCollect) { - collect_force_kernels = std::make_shared(std::move(JitHelper::buildProgram( + collect_force_kernels = std::make_shared(std::move(JitHelper::buildProgram( "DEMCollectForceKernels", JitHelper::KERNEL_DIR / "DEMCollectForceKernels.cu", Subs, JitifyOptions))); } else { - collect_force_kernels = std::make_shared(std::move( + collect_force_kernels = std::make_shared(std::move( JitHelper::buildProgram("DEMCollectForceKernels_Compact", JitHelper::KERNEL_DIR / "DEMCollectForceKernels_Compact.cu", Subs, JitifyOptions))); } // Then integration kernels { - integrator_kernels = std::make_shared(std::move(JitHelper::buildProgram( + integrator_kernels = std::make_shared(std::move(JitHelper::buildProgram( "DEMIntegrationKernels", JitHelper::KERNEL_DIR / "DEMIntegrationKernels.cu", Subs, JitifyOptions))); } // Then kernels that are... wildcards, which make on-the-fly changes to solver data if (solverFlags.canFamilyChangeOnDevice) { - mod_kernels = std::make_shared(std::move(JitHelper::buildProgram( + mod_kernels = std::make_shared(std::move(JitHelper::buildProgram( "DEMModeratorKernels", JitHelper::KERNEL_DIR / "DEMModeratorKernels.cu", Subs, JitifyOptions))); } // Then misc kernels { - misc_kernels = std::make_shared(std::move(JitHelper::buildProgram( + misc_kernels = std::make_shared(std::move(JitHelper::buildProgram( "DEMMiscKernels", JitHelper::KERNEL_DIR / "DEMMiscKernels.cu", Subs, JitifyOptions))); } } -float* DEMDynamicThread::inspectCall(const std::shared_ptr& inspection_kernel, +float* DEMDynamicThread::inspectCall(const std::shared_ptr& inspection_kernel, const std::string& kernel_name, INSPECT_ENTITY_TYPE thing_to_insp, CUB_REDUCE_FLAVOR reduce_flavor, diff --git a/src/DEM/dT.h b/src/DEM/dT.h index 3222088d..80df1866 100644 --- a/src/DEM/dT.h +++ b/src/DEM/dT.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -23,10 +24,12 @@ #include "Structs.h" #include "AuxClasses.h" -// Forward declare jitify::Program to avoid downstream dependency -namespace jitify { +// Forward declare deme::jit::Program +namespace deme { +namespace jit { class Program; } +} namespace deme { @@ -631,7 +634,7 @@ class DEMDynamicThread { const std::vector& JitifyOptions); // Execute this kernel, then return the reduced value - float* inspectCall(const std::shared_ptr& inspection_kernel, + float* inspectCall(const std::shared_ptr& inspection_kernel, const std::string& kernel_name, INSPECT_ENTITY_TYPE thing_to_insp, CUB_REDUCE_FLAVOR reduce_flavor, @@ -709,13 +712,13 @@ class DEMDynamicThread { const std::function& condition); // Just-in-time compiled kernels - std::shared_ptr prep_force_kernels; - std::shared_ptr cal_force_kernels; - std::shared_ptr collect_force_kernels; - std::shared_ptr integrator_kernels; - // std::shared_ptr quarry_stats_kernels; - std::shared_ptr mod_kernels; - std::shared_ptr misc_kernels; + std::shared_ptr prep_force_kernels; + std::shared_ptr cal_force_kernels; + std::shared_ptr collect_force_kernels; + std::shared_ptr integrator_kernels; + // std::shared_ptr quarry_stats_kernels; + std::shared_ptr mod_kernels; + std::shared_ptr misc_kernels; // Adjuster for update freq class AccumStepUpdater { diff --git a/src/DEM/kT.cpp b/src/DEM/kT.cpp index c56815df..2d140d68 100644 --- a/src/DEM/kT.cpp +++ b/src/DEM/kT.cpp @@ -880,34 +880,34 @@ void DEMKinematicThread::jitifyKernels(const std::unordered_map& JitifyOptions) { // First one is bin_sphere_kernels kernels, which figure out the bin--sphere touch pairs { - bin_sphere_kernels = std::make_shared(std::move(JitHelper::buildProgram( + bin_sphere_kernels = std::make_shared(std::move(JitHelper::buildProgram( "DEMBinSphereKernels", JitHelper::KERNEL_DIR / "DEMBinSphereKernels.cu", Subs, JitifyOptions))); } // Then CD kernels { - sphere_contact_kernels = std::make_shared(std::move( + sphere_contact_kernels = std::make_shared(std::move( JitHelper::buildProgram("DEMContactKernels_SphereSphere", JitHelper::KERNEL_DIR / "DEMContactKernels_SphereSphere.cu", Subs, JitifyOptions))); } // Then triangle--bin intersection-related kernels { - bin_triangle_kernels = std::make_shared(std::move(JitHelper::buildProgram( + bin_triangle_kernels = std::make_shared(std::move(JitHelper::buildProgram( "DEMBinTriangleKernels", JitHelper::KERNEL_DIR / "DEMBinTriangleKernels.cu", Subs, JitifyOptions))); } // Then sphere--triangle contact detection-related kernels { - sphTri_contact_kernels = std::make_shared(std::move(JitHelper::buildProgram( + sphTri_contact_kernels = std::make_shared(std::move(JitHelper::buildProgram( "DEMContactKernels_SphereTriangle", JitHelper::KERNEL_DIR / "DEMContactKernels_SphereTriangle.cu", Subs, JitifyOptions))); } // Then contact history mapping kernels { - history_kernels = std::make_shared(std::move(JitHelper::buildProgram( + history_kernels = std::make_shared(std::move(JitHelper::buildProgram( "DEMHistoryMappingKernels", JitHelper::KERNEL_DIR / "DEMHistoryMappingKernels.cu", Subs, JitifyOptions))); } // Then misc kernels { - misc_kernels = std::make_shared(std::move(JitHelper::buildProgram( + misc_kernels = std::make_shared(std::move(JitHelper::buildProgram( "DEMMiscKernels", JitHelper::KERNEL_DIR / "DEMMiscKernels.cu", Subs, JitifyOptions))); } } diff --git a/src/DEM/kT.h b/src/DEM/kT.h index 54c8ed93..6032b10e 100644 --- a/src/DEM/kT.h +++ b/src/DEM/kT.h @@ -23,10 +23,12 @@ #include "Defines.h" #include "Structs.h" -// Forward declare jitify::Program to avoid downstream dependency -namespace jitify { +// Forward declare deme::jit::Program +namespace deme { +namespace jit { class Program; } +} namespace deme { @@ -408,12 +410,12 @@ class DEMKinematicThread { // Just-in-time compiled kernels // jitify::Program bin_sphere_kernels = JitHelper::buildProgram("bin_sphere_kernels", " "); - std::shared_ptr bin_sphere_kernels; - std::shared_ptr bin_triangle_kernels; - std::shared_ptr sphTri_contact_kernels; - std::shared_ptr sphere_contact_kernels; - std::shared_ptr history_kernels; - std::shared_ptr misc_kernels; + std::shared_ptr bin_sphere_kernels; + std::shared_ptr bin_triangle_kernels; + std::shared_ptr sphTri_contact_kernels; + std::shared_ptr sphere_contact_kernels; + std::shared_ptr history_kernels; + std::shared_ptr misc_kernels; // Adjuster for bin size class AccumTimer { diff --git a/src/algorithms/CMakeLists.txt b/src/algorithms/CMakeLists.txt index 796d48f0..ff9a88ca 100644 --- a/src/algorithms/CMakeLists.txt +++ b/src/algorithms/CMakeLists.txt @@ -15,10 +15,17 @@ target_include_directories( PUBLIC ${ProjectIncludeGenerated} ) -target_link_libraries( - algorithms - PUBLIC CUB::CUB -) +if(USE_HIP) + target_link_libraries( + algorithms + PUBLIC hip::hipcub + ) +else() + target_link_libraries( + algorithms + PUBLIC CUB::CUB + ) +endif() ### HOST HEADERS ONLY (.h, .hpp) ### set(algorithms_interface @@ -45,23 +52,36 @@ target_sources( PRIVATE ${algorithms_sources} ) -# Get a list of supported architectures -include(../../cmake/CudaSupportedArchitectures.cmake) -cuda_supported_architectures() +if(USE_HIP) + # Mark .cu sources as HIP language + set_source_files_properties(${algorithms_sources} PROPERTIES LANGUAGE HIP) -# Convert the cuda supported architectures into their respective CMake properties -set(virtual_architectures "") -foreach(arch ${CUDASUP_ARCHITECTURES}) - list(APPEND virtual_architectures "${arch}-virtual") -endforeach() + set_target_properties( + algorithms PROPERTIES + POSITION_INDEPENDENT_CODE True + CXX_STANDARD ${CXXSTD_SUPPORTED} + PUBLIC_HEADER "${algorithms_interface}" + HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}" + ) +else() + # Get a list of supported architectures + include(../../cmake/CudaSupportedArchitectures.cmake) + cuda_supported_architectures() -set_target_properties( - algorithms PROPERTIES - POSITION_INDEPENDENT_CODE True - CXX_STANDARD ${CXXSTD_SUPPORTED} - PUBLIC_HEADER "${algorithms_interface}" - CUDA_ARCHITECTURES "${virtual_architectures}" -) + # Convert the cuda supported architectures into their respective CMake properties + set(virtual_architectures "") + foreach(arch ${CUDASUP_ARCHITECTURES}) + list(APPEND virtual_architectures "${arch}-virtual") + endforeach() + + set_target_properties( + algorithms PROPERTIES + POSITION_INDEPENDENT_CODE True + CXX_STANDARD ${CXXSTD_SUPPORTED} + PUBLIC_HEADER "${algorithms_interface}" + CUDA_ARCHITECTURES "${virtual_architectures}" + ) +endif() install( TARGETS algorithms diff --git a/src/algorithms/DEMCubContactDetection.cu b/src/algorithms/DEMCubContactDetection.cu index dccc4702..7158b6df 100644 --- a/src/algorithms/DEMCubContactDetection.cu +++ b/src/algorithms/DEMCubContactDetection.cu @@ -3,7 +3,12 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#if defined(USE_HIP) +#include +#else #include +#endif // #include #include @@ -35,11 +40,11 @@ inline void contactEventArraysResize(size_t nContactPairs, granData.toDevice(); } -void contactDetection(std::shared_ptr& bin_sphere_kernels, - std::shared_ptr& bin_triangle_kernels, - std::shared_ptr& sphere_contact_kernels, - std::shared_ptr& sphTri_contact_kernels, - std::shared_ptr& history_kernels, +void contactDetection(std::shared_ptr& bin_sphere_kernels, + std::shared_ptr& bin_triangle_kernels, + std::shared_ptr& sphere_contact_kernels, + std::shared_ptr& sphTri_contact_kernels, + std::shared_ptr& history_kernels, DualStruct& granData, DualStruct& simParams, SolverFlags& solverFlags, diff --git a/src/algorithms/DEMCubForceCollection.cu b/src/algorithms/DEMCubForceCollection.cu index 44c1052f..bf07dc65 100644 --- a/src/algorithms/DEMCubForceCollection.cu +++ b/src/algorithms/DEMCubForceCollection.cu @@ -3,7 +3,12 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#if defined(USE_HIP) +#include +#else #include +#endif // #include #include @@ -16,7 +21,7 @@ namespace deme { -void collectContactForcesThruCub(std::shared_ptr& collect_force_kernels, +void collectContactForcesThruCub(std::shared_ptr& collect_force_kernels, DualStruct& granData, const size_t nContactPairs, const size_t nClumps, diff --git a/src/algorithms/DEMCubInstantiations.cu b/src/algorithms/DEMCubInstantiations.cu index 25735e9d..0e9f0873 100644 --- a/src/algorithms/DEMCubInstantiations.cu +++ b/src/algorithms/DEMCubInstantiations.cu @@ -3,7 +3,12 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#if defined(USE_HIP) +#include +#else #include +#endif #include #include diff --git a/src/algorithms/DEMCubWrappers.cu b/src/algorithms/DEMCubWrappers.cu index 4ecfcbe5..aae5c9ea 100644 --- a/src/algorithms/DEMCubWrappers.cu +++ b/src/algorithms/DEMCubWrappers.cu @@ -3,7 +3,16 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include +#if defined(USE_HIP) +#include +// CUB_RUNTIME_FUNCTION is a CUB macro not defined in hipCUB +#ifndef CUB_RUNTIME_FUNCTION +#define CUB_RUNTIME_FUNCTION +#endif +#else #include +#endif #include #include #include diff --git a/src/algorithms/DEMStaticDeviceSubroutines.h b/src/algorithms/DEMStaticDeviceSubroutines.h index acd0cb4f..d2de6ef7 100644 --- a/src/algorithms/DEMStaticDeviceSubroutines.h +++ b/src/algorithms/DEMStaticDeviceSubroutines.h @@ -76,11 +76,11 @@ void cubSortByKey(T1* d_keys_in, // For kT and dT's private usage //////////////////////////////////////////////////////////////////////////////// -void contactDetection(std::shared_ptr& bin_sphere_kernels, - std::shared_ptr& bin_triangle_kernels, - std::shared_ptr& sphere_contact_kernels, - std::shared_ptr& sphTri_contact_kernels, - std::shared_ptr& history_kernels, +void contactDetection(std::shared_ptr& bin_sphere_kernels, + std::shared_ptr& bin_triangle_kernels, + std::shared_ptr& sphere_contact_kernels, + std::shared_ptr& sphTri_contact_kernels, + std::shared_ptr& history_kernels, DualStruct& granData, DualStruct& simParams, SolverFlags& solverFlags, @@ -99,7 +99,7 @@ void contactDetection(std::shared_ptr& bin_sphere_kernels, SolverTimers& timers, kTStateParams& stateParams); -void collectContactForcesThruCub(std::shared_ptr& collect_force_kernels, +void collectContactForcesThruCub(std::shared_ptr& collect_force_kernels, DualStruct& granData, const size_t nContactPairs, const size_t nClumps, diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index c01c748e..a0b5dae9 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -14,12 +14,15 @@ configure_file( ) -message(STATUS "${core_message} Extracting NVIDIA Jitify header...") -configure_file( - ${NVIDIAJitifyPath}/jitify.hpp - ${CMAKE_BINARY_DIR}/src/jitify/jitify.hpp - COPYONLY -) +# Jitify is only needed for CUDA builds (HIP uses hiprtc directly via JitKernel abstraction) +if(NOT USE_HIP) + message(STATUS "${core_message} Extracting Jitify header...") + configure_file( + ${JitifyPath}/jitify.hpp + ${CMAKE_BINARY_DIR}/src/jitify/jitify.hpp + COPYONLY + ) +endif() add_library(core OBJECT) @@ -31,17 +34,13 @@ target_include_directories( ) +# Core is pure CXX, no GPU library dependencies. +# CUB/hipcub is only needed by algorithms target. if(USE_CHPF) target_link_libraries( core - PUBLIC CUB::CUB INTERFACE ${ChPF_IMPORTED_NAME} ) -else() - target_link_libraries( - core - PUBLIC CUB::CUB - ) endif() @@ -50,6 +49,8 @@ set(core_headers ${CMAKE_CURRENT_SOURCE_DIR}/utils/CudaAllocator.hpp ${CMAKE_CURRENT_SOURCE_DIR}/utils/ManagedMemory.hpp ${CMAKE_CURRENT_SOURCE_DIR}/utils/JitHelper.h + ${CMAKE_CURRENT_SOURCE_DIR}/utils/JitKernel.h + ${CMAKE_CURRENT_SOURCE_DIR}/utils/JitKernel.inl ${CMAKE_CURRENT_SOURCE_DIR}/utils/ThreadManager.h ${CMAKE_CURRENT_SOURCE_DIR}/utils/GpuError.h ${CMAKE_CURRENT_SOURCE_DIR}/utils/GpuManager.h @@ -68,6 +69,13 @@ set(core_sources ${CMAKE_CURRENT_SOURCE_DIR}/utils/DEMEPaths.cpp ) +# JitKernel abstraction: select backend based on USE_HIP +if(USE_HIP) + list(APPEND core_sources ${CMAKE_CURRENT_SOURCE_DIR}/utils/JitKernel_hip.cpp) +else() + list(APPEND core_sources ${CMAKE_CURRENT_SOURCE_DIR}/utils/JitKernel_cuda.cpp) +endif() + target_sources( core PUBLIC ${core_headers} ${core_utils_headers} @@ -98,13 +106,15 @@ install( ${CMAKE_INSTALL_INCLUDEDIR}/core ) -# Install Third-party Headers -install( - FILES - "${NVIDIAJitifyPath}/jitify.hpp" - DESTINATION - ${CMAKE_INSTALL_INCLUDEDIR}/jitify -) +# Install Third-party Headers (jitify only for CUDA builds) +if(NOT USE_HIP) + install( + FILES + "${NVIDIAJitifyPath}/jitify.hpp" + DESTINATION + ${CMAKE_INSTALL_INCLUDEDIR}/jitify + ) +endif() # --------------------------------------------------------- # @@ -114,7 +124,7 @@ install( message(STATUS "${core_message} Generating Runtime Data helper (build)...") set(DEME_RUNTIME_DATA_DIRECTORY "${CMAKE_BINARY_DIR}") -set(DEME_RUNTIME_INCLUDE_DIRECTORY "${CMAKE_BINARY_DIR}") +set(DEME_RUNTIME_INCLUDE_DIRECTORY "${CMAKE_SOURCE_DIR}/src") configure_file( utils/RuntimeData.cpp.in utils/RuntimeDataBuild.cpp @ONLY diff --git a/src/core/utils/CudaAllocator.hpp b/src/core/utils/CudaAllocator.hpp index e05b8942..32a576aa 100644 --- a/src/core/utils/CudaAllocator.hpp +++ b/src/core/utils/CudaAllocator.hpp @@ -8,7 +8,7 @@ #include -#include +#include "cuda_to_hip.h" #include #include #include diff --git a/src/core/utils/GpuError.h b/src/core/utils/GpuError.h index 97d213b5..c24bc315 100644 --- a/src/core/utils/GpuError.h +++ b/src/core/utils/GpuError.h @@ -9,7 +9,7 @@ #include #include #include -#include +#include "cuda_to_hip.h" #define DEME_GPU_CALL(res) \ { gpu_assert((res), __FILE__, __LINE__); } diff --git a/src/core/utils/GpuManager.h b/src/core/utils/GpuManager.h index bf9cf611..d5a5d3c3 100644 --- a/src/core/utils/GpuManager.h +++ b/src/core/utils/GpuManager.h @@ -1,7 +1,7 @@ #ifndef DEME_GPU_MANAGER_H #define DEME_GPU_MANAGER_H -#include +#include "cuda_to_hip.h" #include #include diff --git a/src/core/utils/JitHelper.cpp b/src/core/utils/JitHelper.cpp index 9779b8a1..ff0a4706 100644 --- a/src/core/utils/JitHelper.cpp +++ b/src/core/utils/JitHelper.cpp @@ -12,21 +12,30 @@ #include #include #include - -#include - -// Compile-time default CUDA architecture fallback. -// Can be overridden at build time via -DDEME_DEFAULT_CUDA_ARCH_STR="compute_XY". -// At runtime, the environment variable DEME_DEFAULT_CUDA_ARCH takes precedence. +#include + +#include "cuda_to_hip.h" + +// Compile-time default architecture fallback. +// Can be overridden at build time via -DDEME_DEFAULT_CUDA_ARCH_STR="compute_XY" (CUDA) +// or -DDEME_DEFAULT_HIP_ARCH_STR="gfx90a" (HIP). +// At runtime, the environment variable DEME_DEFAULT_CUDA_ARCH (CUDA) or +// DEME_DEFAULT_HIP_ARCH (HIP) takes precedence. +#if defined(USE_HIP) +#ifndef DEME_DEFAULT_HIP_ARCH_STR + #define DEME_DEFAULT_HIP_ARCH_STR "gfx90a" +#endif +#else #ifndef DEME_DEFAULT_CUDA_ARCH_STR #define DEME_DEFAULT_CUDA_ARCH_STR "compute_75" #endif +#endif #include #include "RuntimeData.h" #include "JitHelper.h" -jitify::JitCache* JitHelper::kcache = nullptr; +deme::jit::ProgramCache* JitHelper::kcache = nullptr; const std::filesystem::path JitHelper::KERNEL_DIR = DEMERuntimeDataHelper::data_path / "kernel"; const std::filesystem::path JitHelper::KERNEL_INCLUDE_DIR = DEMERuntimeDataHelper::include_path; @@ -47,13 +56,12 @@ void JitHelper::Header::substitute(const std::string& symbol, const std::string& } } -jitify::Program JitHelper::buildProgram( +deme::jit::Program JitHelper::buildProgram( const std::string& name, const std::filesystem::path& source, std::unordered_map substitutions, - // std::vector headers, // THIS PARAMETER PROBABLY WON'T EVER BE USED std::vector flags) { - // Double ensure include paths for runtime headers + CUDA/CCCL (cuda::std) + // Double ensure include paths for runtime headers + CUDA/CCCL (cuda::std) or ROCm auto add_inc = [&](const std::filesystem::path& p) { if (p.empty()) return; @@ -68,15 +76,59 @@ jitify::Program JitHelper::buildProgram( // Project/runtime includes add_inc(KERNEL_INCLUDE_DIR); + // Also add the source tree for header includes (cuda_to_hip.h, etc.) + // KERNEL_INCLUDE_DIR is the build tree, but some headers live in src/ + // The source tree is one level up from build, then into src/ + std::error_code ec; + std::filesystem::path src_include = std::filesystem::canonical(KERNEL_INCLUDE_DIR / "..", ec); + if (!ec) { + add_inc(src_include / "src"); + } + // Common fallbacks +#if defined(USE_HIP) + // Helper: find clang builtin headers by scanning lib/llvm/lib/clang//include + auto add_clang_builtins = [&](const std::filesystem::path& rocm_root) { + std::filesystem::path clang_base = rocm_root / "lib" / "llvm" / "lib" / "clang"; + std::error_code scan_ec; + for (const auto& entry : std::filesystem::directory_iterator(clang_base, scan_ec)) { + if (entry.is_directory()) { + add_inc(entry.path() / "include"); + break; // use the first (and typically only) version dir + } + } + }; + + // ROCm include paths for hipRTC + if (const char* rocm_path = std::getenv("ROCM_PATH")) { + add_inc(std::filesystem::path(rocm_path) / "include"); + add_inc(std::filesystem::path(rocm_path) / "include" / "hipcub"); + add_inc(std::filesystem::path(rocm_path) / "include" / "rocprim"); + // Clang builtin headers (stddef.h, etc.) for hiprtc -- scan for actual version dir + add_clang_builtins(std::filesystem::path(rocm_path)); + } + add_inc("/opt/rocm/include"); + add_inc("/opt/rocm/include/hipcub"); + add_inc("/opt/rocm/include/rocprim"); + // Clang builtin headers for hiprtc (needed for stddef.h, stdint.h, etc.) + add_clang_builtins(std::filesystem::path("/opt/rocm")); +#else if (const char* cuda_home = std::getenv("CUDA_HOME")) { add_inc(std::filesystem::path(cuda_home) / "include"); add_inc(std::filesystem::path(cuda_home) / "include" / "cccl"); } add_inc("/usr/local/cuda/include"); add_inc("/usr/local/cuda/include/cccl"); +#endif +#if defined(USE_HIP) + // For hiprtc: start with hip_runtime.h to provide HIP types/functions in device code. + // The program name is passed separately to hiprtcCreateProgram. + std::string code = "#include \n"; +#else + // For CUDA/jitify: the program name on the first line is a jitify requirement. std::string code = name + "\n"; +#endif code.append(JitHelper::loadSourceFile(source)); // Apply the substitutions @@ -84,64 +136,53 @@ jitify::Program JitHelper::buildProgram( code = std::regex_replace(code, std::regex(subst.first), subst.second); } - std::vector header_code; - // THIS BLOCK IS ONLY NEEDED IF THE headers PARAMETER IS USED - /* - for (auto it = headers.begin(); it != headers.end(); it++) { - header_code.push_back(it->getSource()); - } - */ - if (kcache == nullptr) - kcache = new jitify::JitCache(); + kcache = new deme::jit::ProgramCache(); - // Stage 1: Explicit architecture detection. - // Query the active CUDA device compute capability and build an -arch flag. + // Detect GPU architecture and add appropriate flags + std::string arch_flag; { int dev = 0; cudaDeviceProp prop; memset(&prop, 0, sizeof(prop)); - if (cudaGetDevice(&dev) == cudaSuccess && cudaGetDeviceProperties(&prop, dev) == cudaSuccess && - prop.major > 0 && prop.minor >= 0) { - std::string detected_arch = "compute_" + std::to_string(prop.major) + std::to_string(prop.minor); - std::vector arch_flags = flags; - arch_flags.push_back("-arch=" + detected_arch); - return kcache->program(code, header_code, arch_flags); + if (cudaGetDevice(&dev) == cudaSuccess && cudaGetDeviceProperties(&prop, dev) == cudaSuccess) { +#if defined(USE_HIP) + // HIP: use gcnArchName (e.g., "gfx90a") + std::string detected_arch = prop.gcnArchName; + // Strip any suffix after the base arch name (e.g., "gfx90a:sramecc+:xnack-" -> "gfx90a") + size_t colon_pos = detected_arch.find(':'); + if (colon_pos != std::string::npos) { + detected_arch = detected_arch.substr(0, colon_pos); + } + arch_flag = "--gpu-architecture=" + detected_arch; +#else + if (prop.major > 0 && prop.minor >= 0) { + std::string detected_arch = "compute_" + std::to_string(prop.major) + std::to_string(prop.minor); + arch_flag = "-arch=" + detected_arch; + } +#endif } } - // Stage 2: Jitify auto-detect. - // Device detection failed or returned an invalid compute capability; compile - // without an explicit -arch flag and let Jitify detect the architecture. - try { - return kcache->program(code, header_code, flags); - } catch (const std::exception& e) { - const std::string err_msg = e.what(); - // Only fall through to Stage 3 when the failure is architecture-related. - const bool is_arch_error = - (err_msg.find("arch") != std::string::npos || err_msg.find("compute_") != std::string::npos || - err_msg.find("sm_") != std::string::npos); - if (!is_arch_error) { - throw; - } - - // Stage 3: Hardcoded default fallback. - // Use the DEME_DEFAULT_CUDA_ARCH environment variable if set; otherwise fall - // back to the compile-time constant DEME_DEFAULT_CUDA_ARCH_STR. + // If device detection failed, use fallback + if (arch_flag.empty()) { +#if defined(USE_HIP) + const char* env_arch = std::getenv("DEME_DEFAULT_HIP_ARCH"); + const std::string fallback_arch = + (env_arch != nullptr && env_arch[0] != '\0') ? std::string(env_arch) : DEME_DEFAULT_HIP_ARCH_STR; + arch_flag = "--gpu-architecture=" + fallback_arch; +#else const char* env_arch = std::getenv("DEME_DEFAULT_CUDA_ARCH"); const std::string fallback_arch = (env_arch != nullptr && env_arch[0] != '\0') ? std::string(env_arch) : DEME_DEFAULT_CUDA_ARCH_STR; - std::vector fallback_flags = flags; - fallback_flags.push_back("-arch=" + fallback_arch); - try { - return kcache->program(code, header_code, fallback_flags); - } catch (const std::exception& e3) { - std::string ctx = "Jitify compilation failed with fallback arch '"; - ctx += fallback_arch; - ctx += (env_arch != nullptr && env_arch[0] != '\0') ? "' (from DEME_DEFAULT_CUDA_ARCH env var): " - : "' (compile-time default): "; - ctx += e3.what(); - throw std::runtime_error(ctx); - } + arch_flag = "-arch=" + fallback_arch; +#endif } + + std::vector final_flags = flags; + final_flags.push_back(arch_flag); + + // Use the unified JitKernel abstraction - no headers needed for our usage + std::vector> headers; + return kcache->program(name, code, headers, final_flags); } diff --git a/src/core/utils/JitHelper.h b/src/core/utils/JitHelper.h index 638f23cd..c3dee0d8 100644 --- a/src/core/utils/JitHelper.h +++ b/src/core/utils/JitHelper.h @@ -7,11 +7,12 @@ #define DEME_JIT_HELPER_H #include +#include #include #include #include -#include +#include "JitKernel.h" #if defined(_WIN32) || defined(_WIN64) #undef max @@ -31,24 +32,17 @@ class JitHelper { std::string _source; }; - static jitify::Program buildProgram( + static deme::jit::Program buildProgram( const std::string& name, const std::filesystem::path& source, std::unordered_map substitutions = std::unordered_map(), std::vector flags = std::vector()); - //// I'm pretty sure C++17 auto-converts this - // static jitify::Program buildProgram( - // const std::string& name, const std::string& code, - // std::vector
headers = 0, - // std::vector flags = 0 - // ); - static const std::filesystem::path KERNEL_DIR; static const std::filesystem::path KERNEL_INCLUDE_DIR; private: - static jitify::JitCache* kcache; + static deme::jit::ProgramCache* kcache; inline static std::string loadSourceFile(const std::filesystem::path& sourcefile) { std::string code; diff --git a/src/core/utils/JitKernel.h b/src/core/utils/JitKernel.h new file mode 100644 index 00000000..39e99ae2 --- /dev/null +++ b/src/core/utils/JitKernel.h @@ -0,0 +1,189 @@ +// Copyright (c) 2021, SBEL GPU Development Team +// Copyright (c) 2021, University of Wisconsin - Madison +// Copyright (c) 2026, Advanced Micro Devices, Inc. +// +// SPDX-License-Identifier: BSD-3-Clause + +// Unified runtime kernel compilation API for DEM-Engine. +// Abstracts over jitify v1 (CUDA/NVRTC) and hiprtc (ROCm), enabling both +// platforms to use the same fluent kernel launch syntax. + +#ifndef DEME_JIT_KERNEL_H +#define DEME_JIT_KERNEL_H + +#include +#include +#include +#include +#include + +#ifdef USE_HIP +#include +using gpuStream_t = hipStream_t; +#else +#include +using gpuStream_t = cudaStream_t; +#endif + +namespace deme { +namespace jit { + +class Kernel; +class Program; + +// Fluent kernel launch builder - matches existing jitify v1 usage pattern. +// Usage: program->kernel("name")().instantiate().configure(...).launch(args...) +class KernelLauncher { + public: + // For kernels without template parameters + KernelLauncher& instantiate(); + + // For kernels with template parameters. + // type_names: comma-separated type names as they appear in the kernel template. + // On hiprtc, these get baked into the mangled kernel name. + KernelLauncher& instantiate(const std::string& type_names); + + // Configure launch parameters + KernelLauncher& configure(dim3 grid, dim3 block, size_t shared_mem = 0, gpuStream_t stream = 0); + + // Launch with arguments - variadic template forwards to backend + // The implementation is in the .inl file for HIP, or calls jitify for CUDA + template + void launch(Args... args); + + // Non-templated launch for backend implementations + void launchRaw(void** kernel_args, size_t num_args); + + // Kernel can create KernelLaunchers + explicit KernelLauncher(Kernel* k); + + private: + friend class Kernel; + + Kernel* kernel_; + dim3 grid_{1, 1, 1}; + dim3 block_{1, 1, 1}; + size_t shared_mem_ = 0; + gpuStream_t stream_ = 0; + std::string instantiation_; + bool instantiated_ = false; +}; + +// Handle to a kernel function within a compiled program. +// Created by Program::kernel(name). +class Kernel { + public: + ~Kernel(); + Kernel(Kernel&&) noexcept; + Kernel& operator=(Kernel&&) noexcept; + + // Deleted copy operations - kernel handles are move-only + Kernel(const Kernel&) = delete; + Kernel& operator=(const Kernel&) = delete; + + // Start a launch chain. Returns a KernelLauncher for fluent chaining. + KernelLauncher operator()(); + + // Direct instantiate() method to match jitify v1 API. + // Usage: program->kernel("name").instantiate().configure(...).launch(...) + KernelLauncher& instantiate(); + KernelLauncher& instantiate(const std::string& type_names); + + // Get kernel name (for debugging/error messages) + const std::string& name() const; + + private: + friend class Program; + friend class KernelLauncher; + Kernel(); + + struct Impl; + std::unique_ptr impl_; + std::unique_ptr launcher_; // For instantiate() return reference +}; + +// A compiled GPU program containing one or more kernels. +// Created by ProgramCache::program(). +class Program { + public: + ~Program(); + Program(Program&&) noexcept; + Program& operator=(Program&&) noexcept; + + // Deleted copy operations - programs are move-only + Program(const Program&) = delete; + Program& operator=(const Program&) = delete; + + // Get a kernel by name. The kernel can then be launched via: + // program->kernel("name")().instantiate().configure(...).launch(...) + Kernel kernel(const std::string& name); + + // Check if program is valid (compiled successfully) + explicit operator bool() const; + + private: + friend class ProgramCache; + Program(); + + struct Impl; + std::unique_ptr impl_; +}; + +// Caches compiled programs to avoid recompilation. +// Thread-safe: multiple threads can call program() concurrently. +class ProgramCache { + public: + ProgramCache(); + ~ProgramCache(); + + // Compile a program. + // - name: identifier for the program (typically the kernel file name) + // - source: kernel source code (prepended with name for jitify compatibility) + // - headers: additional header code as (name, source) pairs + // - flags: compiler flags (-I, -D, -arch, etc.) + // + // Returns a new Program by value. Caching is not performed at this level; + // callers typically cache via shared_ptr at the application level. + Program program(const std::string& name, + const std::string& source, + const std::vector>& headers = {}, + const std::vector& flags = {}); + + // Clear all cached programs. Invalidates all Program references. + void clear(); + + private: + struct Impl; + std::unique_ptr impl_; +}; + +// Helper to build compiler flags in a platform-agnostic way. +class CompilerFlags { + public: + // Add an include path (-I) + CompilerFlags& include(const std::string& path); + + // Add a macro definition (-D) + CompilerFlags& define(const std::string& macro, const std::string& value = ""); + + // Set target architecture (compute_75 for CUDA, gfx90a for HIP) + // The flag format is handled automatically per platform. + CompilerFlags& arch(const std::string& arch); + + // Add a raw flag (passed through unchanged) + CompilerFlags& flag(const std::string& raw_flag); + + // Build the final flags vector + std::vector build() const; + + private: + std::vector flags_; +}; + +} // namespace jit +} // namespace deme + +// Include the template implementation (must be visible at all call sites) +#include "JitKernel.inl" + +#endif // DEME_JIT_KERNEL_H diff --git a/src/core/utils/JitKernel.inl b/src/core/utils/JitKernel.inl new file mode 100644 index 00000000..6984dd94 --- /dev/null +++ b/src/core/utils/JitKernel.inl @@ -0,0 +1,64 @@ +// Copyright (c) 2021, SBEL GPU Development Team +// Copyright (c) 2021, University of Wisconsin - Madison +// Copyright (c) 2026, Advanced Micro Devices, Inc. +// +// SPDX-License-Identifier: BSD-3-Clause + +// Template implementation for JitKernel.h +// This file is included at the end of JitKernel.h + +#ifndef DEME_JIT_KERNEL_INL +#define DEME_JIT_KERNEL_INL + +#include +#include +#include + +namespace deme { +namespace jit { + +namespace detail { + +// Helper to pack a single argument by copying to a buffer +template +inline void packArg(std::vector& args, std::vector>& storage, T arg) { + auto buf = std::make_unique(sizeof(T)); + std::memcpy(buf.get(), &arg, sizeof(T)); + args.push_back(buf.get()); + storage.push_back(std::move(buf)); +} + +// Recursive pack for variadic args +template +inline void packArgs(std::vector& args, std::vector>& storage, T first, Rest... rest) { + packArg(args, storage, first); + if constexpr (sizeof...(rest) > 0) { + packArgs(args, storage, rest...); + } +} + +// Base case: no args +inline void packArgs(std::vector&, std::vector>&) {} + +} // namespace detail + +template +void KernelLauncher::launch(Args... args) { + // Pack each argument into a heap buffer and collect pointers to them, then + // forward to the backend's non-template raw launcher (hipModuleLaunchKernel + // on ROCm, jitify's cuLaunchKernel wrapper on CUDA). Both runtimes take the + // same "array of pointers to argument values" convention. + std::vector kernel_args; + std::vector> arg_storage; + + if constexpr (sizeof...(args) > 0) { + detail::packArgs(kernel_args, arg_storage, args...); + } + + launchRaw(kernel_args.empty() ? nullptr : kernel_args.data(), kernel_args.size()); +} + +} // namespace jit +} // namespace deme + +#endif // DEME_JIT_KERNEL_INL diff --git a/src/core/utils/JitKernel_cuda.cpp b/src/core/utils/JitKernel_cuda.cpp new file mode 100644 index 00000000..c55126e7 --- /dev/null +++ b/src/core/utils/JitKernel_cuda.cpp @@ -0,0 +1,205 @@ +// Copyright (c) 2021, SBEL GPU Development Team +// Copyright (c) 2021, University of Wisconsin - Madison +// Copyright (c) 2026, Advanced Micro Devices, Inc. +// +// SPDX-License-Identifier: BSD-3-Clause + +// CUDA backend for JitKernel using jitify v1 (NVRTC) + +#ifndef USE_HIP + +#include "JitKernel.h" + +// Jitify is in the build tree at ${CMAKE_BINARY_DIR}/src/jitify/ +// Include path is added by CMake target_include_directories +#include +#include +#include +#include + +namespace deme { +namespace jit { + +// ============================================================================ +// Kernel implementation +// ============================================================================ + +struct Kernel::Impl { + std::string name; + jitify::Program* program = nullptr; // Non-owning reference to parent program + jitify::KernelInstantiation instantiation; + bool has_instantiation = false; +}; + +Kernel::Kernel() : impl_(std::make_unique()) {} +Kernel::~Kernel() = default; +Kernel::Kernel(Kernel&&) noexcept = default; +Kernel& Kernel::operator=(Kernel&&) noexcept = default; + +KernelLauncher Kernel::operator()() { + return KernelLauncher(this); +} + +KernelLauncher& Kernel::instantiate() { + launcher_ = std::make_unique(this); + return launcher_->instantiate(); +} + +KernelLauncher& Kernel::instantiate(const std::string& type_names) { + launcher_ = std::make_unique(this); + return launcher_->instantiate(type_names); +} + +const std::string& Kernel::name() const { + return impl_->name; +} + +// ============================================================================ +// KernelLauncher implementation +// ============================================================================ + +KernelLauncher::KernelLauncher(Kernel* k) : kernel_(k) {} + +KernelLauncher& KernelLauncher::instantiate() { + kernel_->impl_->instantiation = kernel_->impl_->program->kernel(kernel_->impl_->name).instantiate(); + kernel_->impl_->has_instantiation = true; + instantiated_ = true; + return *this; +} + +KernelLauncher& KernelLauncher::instantiate(const std::string& type_names) { + // Parse comma-separated type names and instantiate with them. + // Jitify uses variadic templates for instantiation, but we can pass + // the type string directly to the kernel() call via name mangling. + // For CUDA/jitify, we use the instantiate(type_name) overload. + kernel_->impl_->instantiation = kernel_->impl_->program->kernel(kernel_->impl_->name).instantiate({type_names}); + kernel_->impl_->has_instantiation = true; + instantiated_ = true; + instantiation_ = type_names; + return *this; +} + +KernelLauncher& KernelLauncher::configure(dim3 grid, dim3 block, size_t shared_mem, gpuStream_t stream) { + grid_ = grid; + block_ = block; + shared_mem_ = shared_mem; + stream_ = stream; + return *this; +} + +// ============================================================================ +// Program implementation +// ============================================================================ + +struct Program::Impl { + jitify::Program jit_program; +}; + +Program::Program() : impl_(std::make_unique()) {} +Program::~Program() = default; +Program::Program(Program&&) noexcept = default; +Program& Program::operator=(Program&&) noexcept = default; + +Program::operator bool() const { + return impl_ != nullptr; +} + +Kernel Program::kernel(const std::string& name) { + Kernel k; + k.impl_->name = name; + k.impl_->program = &impl_->jit_program; + return k; +} + +// ============================================================================ +// ProgramCache implementation +// ============================================================================ + +struct ProgramCache::Impl { + jitify::JitCache cache; +}; + +ProgramCache::ProgramCache() : impl_(std::make_unique()) {} +ProgramCache::~ProgramCache() = default; + +Program ProgramCache::program(const std::string& name, + const std::string& source, + const std::vector>& headers, + const std::vector& flags) { + // jitify::JitCache handles caching internally, so we just wrap the result + (void)name; // Name is embedded in source for jitify + + // Extract header sources (jitify v1 takes just the source strings) + std::vector header_sources; + for (const auto& h : headers) { + header_sources.push_back(h.second); + } + + Program prog; + prog.impl_->jit_program = impl_->cache.program(source, header_sources, flags); + return prog; +} + +void ProgramCache::clear() { + // No-op: jitify::JitCache doesn't expose a clear method +} + +// ============================================================================ +// CompilerFlags implementation +// ============================================================================ + +CompilerFlags& CompilerFlags::include(const std::string& path) { + flags_.push_back("-I" + path); + return *this; +} + +CompilerFlags& CompilerFlags::define(const std::string& macro, const std::string& value) { + if (value.empty()) { + flags_.push_back("-D" + macro); + } else { + flags_.push_back("-D" + macro + "=" + value); + } + return *this; +} + +CompilerFlags& CompilerFlags::arch(const std::string& arch) { + flags_.push_back("-arch=" + arch); + return *this; +} + +CompilerFlags& CompilerFlags::flag(const std::string& raw_flag) { + flags_.push_back(raw_flag); + return *this; +} + +std::vector CompilerFlags::build() const { + return flags_; +} + +// ============================================================================ +// KernelLauncher::launchRaw() - launch via jitify's cuLaunchKernel wrapper +// ============================================================================ + +// The templated launch() (in JitKernel.inl) packs each argument into a buffer +// and hands the resulting array of pointers here, identically to the HIP +// backend. jitify's launch takes the same "vector of pointers to argument +// values" convention, so we forward straight to it. +void KernelLauncher::launchRaw(void** kernel_args, size_t num_args) { + if (!kernel_->impl_->has_instantiation) { + throw std::runtime_error("Kernel '" + kernel_->impl_->name + "' not instantiated before launch"); + } + + // Skip launch if any grid dimension is zero (no work); keeps parity with the + // HIP backend, where a zero grid is an error rather than a silent no-op. + if (grid_.x == 0 || grid_.y == 0 || grid_.z == 0) { + return; + } + + std::vector arg_ptrs(kernel_args, kernel_args + num_args); + kernel_->impl_->instantiation.configure(grid_, block_, shared_mem_, stream_).safe_launch(arg_ptrs); +} + +} // namespace jit +} // namespace deme + +#endif // !USE_HIP diff --git a/src/core/utils/JitKernel_hip.cpp b/src/core/utils/JitKernel_hip.cpp new file mode 100644 index 00000000..3dba0cb3 --- /dev/null +++ b/src/core/utils/JitKernel_hip.cpp @@ -0,0 +1,406 @@ +// Copyright (c) 2021, SBEL GPU Development Team +// Copyright (c) 2021, University of Wisconsin - Madison +// Copyright (c) 2026, Advanced Micro Devices, Inc. +// +// SPDX-License-Identifier: BSD-3-Clause + +// HIP backend for JitKernel using hiprtc. +// +// Key design: hiprtc requires all name expressions (kernel names to resolve) +// be registered BEFORE compilation via hiprtcAddNameExpression(). Since the +// DEM-Engine API calls buildProgram() first, then kernel().instantiate() later, +// we use lazy compilation: defer hiprtcCompileProgram until the first kernel +// launch, by which point all kernel+instantiation requests have been collected. + +#ifdef USE_HIP + +#include "JitKernel.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace deme { +namespace jit { + +namespace { + +void checkHiprtc(hiprtcResult result, const char* msg) { + if (result != HIPRTC_SUCCESS) { + throw std::runtime_error(std::string(msg) + ": " + hiprtcGetErrorString(result)); + } +} + +void checkHip(hipError_t result, const char* msg) { + if (result != hipSuccess) { + throw std::runtime_error(std::string(msg) + ": " + hipGetErrorString(result)); + } +} + +// Build the name expression string for hiprtcAddNameExpression. +// For a templated kernel like "modifyComponents" instantiated with "deme::DEMDataKT", +// the expression is "modifyComponents". +std::string buildNameExpression(const std::string& name, const std::string& instantiation) { + if (instantiation.empty()) { + return name; + } + return name + "<" + instantiation + ">"; +} + +} // namespace + +// Forward declaration of ProgramImpl so Kernel::Impl can reference it +struct ProgramImpl { + // Source and compilation options (stored for lazy/deferred compilation) + std::string name; + std::string source; + std::vector> headers; + std::vector flags; + + // Kernel name expressions that have been requested + // Key: name expression (e.g., "modifyComponents") + // Value: lowered/mangled name after compilation + std::unordered_map loweredNames; + + // Name expressions pending addition (requested but not yet compiled) + std::unordered_set pendingNameExpressions; + + // Compiled module (nullptr until first compilation) + hipModule_t module = nullptr; + + // Compiled code buffer (kept for potential recompilation) + std::vector codeBuffer; + + // Whether we need to recompile (new name expressions added since last compile) + bool needsRecompile = true; + + std::mutex mutex; + + ~ProgramImpl() { + if (module) { + (void)hipModuleUnload(module); + } + } + + // Compile or recompile with all registered name expressions + void ensureCompiled() { + std::lock_guard lock(mutex); + + if (!needsRecompile && module != nullptr) { + return; // Already compiled with current name expressions + } + + // Unload previous module if any + if (module) { + (void)hipModuleUnload(module); + module = nullptr; + } + + // Create hiprtc program + hiprtcProgram prog; + std::vector header_names; + std::vector header_sources; + for (const auto& h : headers) { + header_names.push_back(h.first.c_str()); + header_sources.push_back(h.second.c_str()); + } + + checkHiprtc(hiprtcCreateProgram(&prog, source.c_str(), name.c_str(), + static_cast(header_sources.size()), + header_sources.data(), header_names.data()), + "hiprtcCreateProgram"); + + // Register all name expressions (pending ones plus any previously resolved) + std::vector allNameExpressions; + for (const auto& pending : pendingNameExpressions) { + allNameExpressions.push_back(pending); + } + for (const auto& resolved : loweredNames) { + // Re-register previously resolved names (needed for recompilation) + if (pendingNameExpressions.find(resolved.first) == pendingNameExpressions.end()) { + allNameExpressions.push_back(resolved.first); + } + } + + for (const auto& expr : allNameExpressions) { + hiprtcResult res = hiprtcAddNameExpression(prog, expr.c_str()); + if (res != HIPRTC_SUCCESS) { + hiprtcDestroyProgram(&prog); + throw std::runtime_error("hiprtcAddNameExpression failed for '" + expr + + "': " + hiprtcGetErrorString(res)); + } + } + + // Convert flags to const char* array + std::vector options; + for (const auto& f : flags) { + options.push_back(f.c_str()); + } + + // Compile + hiprtcResult compileResult = hiprtcCompileProgram(prog, static_cast(options.size()), options.data()); + + if (compileResult != HIPRTC_SUCCESS) { + size_t logSize; + hiprtcGetProgramLogSize(prog, &logSize); + std::string log(logSize, '\0'); + hiprtcGetProgramLog(prog, log.data()); + hiprtcDestroyProgram(&prog); + throw std::runtime_error("hiprtc compilation failed for '" + name + "':\n" + log); + } + + // Get lowered names for all expressions + for (const auto& expr : allNameExpressions) { + const char* lowered = nullptr; + hiprtcResult res = hiprtcGetLoweredName(prog, expr.c_str(), &lowered); + if (res != HIPRTC_SUCCESS) { + hiprtcDestroyProgram(&prog); + throw std::runtime_error("hiprtcGetLoweredName failed for '" + expr + + "': " + hiprtcGetErrorString(res)); + } + loweredNames[expr] = std::string(lowered); + } + + // Clear pending set - all are now resolved + pendingNameExpressions.clear(); + + // Get compiled code + size_t codeSize; + checkHiprtc(hiprtcGetCodeSize(prog, &codeSize), "hiprtcGetCodeSize"); + codeBuffer.resize(codeSize); + checkHiprtc(hiprtcGetCode(prog, codeBuffer.data()), "hiprtcGetCode"); + + hiprtcDestroyProgram(&prog); + + // Load module + checkHip(hipModuleLoadData(&module, codeBuffer.data()), "hipModuleLoadData"); + + needsRecompile = false; + } + + // Register a name expression (kernel + optional instantiation) + // Returns the lowered name, triggering compilation if needed + std::string getLoweredName(const std::string& nameExpr) { + { + std::lock_guard lock(mutex); + + // Check if already resolved + auto it = loweredNames.find(nameExpr); + if (it != loweredNames.end() && !needsRecompile) { + return it->second; + } + + // Add to pending if not already there + if (loweredNames.find(nameExpr) == loweredNames.end()) { + pendingNameExpressions.insert(nameExpr); + needsRecompile = true; + } + } + + // Compile (releases and re-acquires lock internally) + ensureCompiled(); + + // Now look up the lowered name + std::lock_guard lock(mutex); + auto it = loweredNames.find(nameExpr); + if (it == loweredNames.end()) { + throw std::runtime_error("Internal error: lowered name not found for '" + nameExpr + "'"); + } + return it->second; + } +}; + +// ============================================================================ +// Kernel implementation +// ============================================================================ + +struct Kernel::Impl { + std::string name; + std::string instantiation; + hipFunction_t func = nullptr; + ProgramImpl* program = nullptr; // Non-owning reference to parent program +}; + +Kernel::Kernel() : impl_(std::make_unique()) {} +Kernel::~Kernel() = default; +Kernel::Kernel(Kernel&&) noexcept = default; +Kernel& Kernel::operator=(Kernel&&) noexcept = default; + +KernelLauncher Kernel::operator()() { + return KernelLauncher(this); +} + +KernelLauncher& Kernel::instantiate() { + launcher_ = std::make_unique(this); + return launcher_->instantiate(); +} + +KernelLauncher& Kernel::instantiate(const std::string& type_names) { + launcher_ = std::make_unique(this); + return launcher_->instantiate(type_names); +} + +const std::string& Kernel::name() const { + return impl_->name; +} + +// ============================================================================ +// KernelLauncher implementation +// ============================================================================ + +KernelLauncher::KernelLauncher(Kernel* k) : kernel_(k) {} + +KernelLauncher& KernelLauncher::instantiate() { + return instantiate(""); +} + +KernelLauncher& KernelLauncher::instantiate(const std::string& type_names) { + kernel_->impl_->instantiation = type_names; + instantiation_ = type_names; + + if (kernel_->impl_->program) { + // Build the name expression and register it + std::string nameExpr = buildNameExpression(kernel_->impl_->name, type_names); + + // Get the lowered name (this triggers compilation if needed) + std::string loweredName = kernel_->impl_->program->getLoweredName(nameExpr); + + // Ensure module is compiled + kernel_->impl_->program->ensureCompiled(); + + // Now get the function handle + hipError_t err = hipModuleGetFunction(&kernel_->impl_->func, + kernel_->impl_->program->module, + loweredName.c_str()); + if (err != hipSuccess) { + throw std::runtime_error("hipModuleGetFunction failed for '" + nameExpr + + "' (lowered: " + loweredName + "): " + hipGetErrorString(err)); + } + } + instantiated_ = true; + return *this; +} + +KernelLauncher& KernelLauncher::configure(dim3 grid, dim3 block, size_t shared_mem, gpuStream_t stream) { + grid_ = grid; + block_ = block; + shared_mem_ = shared_mem; + stream_ = stream; + return *this; +} + +// ============================================================================ +// Program implementation - wraps ProgramImpl +// ============================================================================ + +struct Program::Impl : public ProgramImpl {}; + +Program::Program() : impl_(std::make_unique()) {} +Program::~Program() = default; +Program::Program(Program&&) noexcept = default; +Program& Program::operator=(Program&&) noexcept = default; + +Program::operator bool() const { + return impl_ != nullptr; +} + +Kernel Program::kernel(const std::string& name) { + Kernel k; + k.impl_->name = name; + k.impl_->program = impl_.get(); + return k; +} + +// ============================================================================ +// ProgramCache implementation +// ============================================================================ + +struct ProgramCache::Impl { + // No caching at this level; callers cache via shared_ptr +}; + +ProgramCache::ProgramCache() : impl_(std::make_unique()) {} +ProgramCache::~ProgramCache() = default; + +Program ProgramCache::program(const std::string& name, + const std::string& source, + const std::vector>& headers, + const std::vector& flags) { + // Create a new Program with deferred compilation (lazy compilation on first kernel use) + Program result; + result.impl_->name = name; + result.impl_->source = source; + result.impl_->headers = headers; + result.impl_->flags = flags; + result.impl_->needsRecompile = true; + return result; +} + +void ProgramCache::clear() { + // No-op: no caching at this level +} + +// ============================================================================ +// CompilerFlags implementation +// ============================================================================ + +CompilerFlags& CompilerFlags::include(const std::string& path) { + flags_.push_back("-I" + path); + return *this; +} + +CompilerFlags& CompilerFlags::define(const std::string& macro, const std::string& value) { + if (value.empty()) { + flags_.push_back("-D" + macro); + } else { + flags_.push_back("-D" + macro + "=" + value); + } + return *this; +} + +CompilerFlags& CompilerFlags::arch(const std::string& arch) { + flags_.push_back("--gpu-architecture=" + arch); + return *this; +} + +CompilerFlags& CompilerFlags::flag(const std::string& raw_flag) { + flags_.push_back(raw_flag); + return *this; +} + +std::vector CompilerFlags::build() const { + return flags_; +} + +// ============================================================================ +// KernelLauncher::launchRaw() - non-templated launch for HIP +// ============================================================================ + +void KernelLauncher::launchRaw(void** kernel_args, size_t num_args) { + if (!kernel_->impl_->func) { + throw std::runtime_error("Kernel function '" + kernel_->impl_->name + "' not resolved"); + } + + // Skip launch if grid has zero dimension (no work to do). + // CUDA may silently accept grid=0, but HIP returns hipErrorInvalidValue. + if (grid_.x == 0 || grid_.y == 0 || grid_.z == 0) { + return; + } + + hipError_t err = + hipModuleLaunchKernel(kernel_->impl_->func, grid_.x, grid_.y, grid_.z, block_.x, block_.y, block_.z, + shared_mem_, stream_, kernel_args, nullptr); + + checkHip(err, ("hipModuleLaunchKernel failed for '" + kernel_->impl_->name + "'").c_str()); + (void)num_args; +} + +} // namespace jit +} // namespace deme + +#endif // USE_HIP diff --git a/src/core/utils/ManagedMemory.hpp b/src/core/utils/ManagedMemory.hpp index b219dcc4..71bed2c7 100644 --- a/src/core/utils/ManagedMemory.hpp +++ b/src/core/utils/ManagedMemory.hpp @@ -13,7 +13,7 @@ // Convenience functions to help with Managed Memory (allocated using ManagedAllocator, typically) namespace deme { -#if CUDART_VERSION >= 13000 +#if !defined(USE_HIP) && CUDART_VERSION >= 13000 inline cudaMemLocation make_device_location(int device) { cudaMemLocation loc{}; loc.type = cudaMemLocationTypeDevice; @@ -25,7 +25,10 @@ inline cudaMemLocation make_device_location(int device) { // Underlying implementation template inline void __migrate_impl(T* data, std::size_t size, int device, cudaStream_t stream = 0) { -#if CUDART_VERSION >= 13000 +#if defined(USE_HIP) + // HIP uses the 4-arg form: hipMemPrefetchAsync(const void*, size_t, int dstDevice, hipStream_t) + cudaMemPrefetchAsync(static_cast(data), size * sizeof(T), device, stream); +#elif CUDART_VERSION >= 13000 // CUDA 13.0+: cudaMemPrefetchAsync(const void*, size_t, cudaMemLocation, unsigned int flags, cudaStream_t) auto loc = make_device_location(device); unsigned int flags = 0; @@ -74,18 +77,30 @@ void migrate(const std::vector& data, int device, cudaStream_t stream = 0) { // Aliases for cudaMemoryAdvise constants enum class ManagedAdvice { +#if defined(USE_HIP) + READ_MOSTLY = hipMemAdviseSetReadMostly, + PREFERRED_LOC = hipMemAdviseSetPreferredLocation, + ACCESSED_BY = hipMemAdviseSetAccessedBy, + UNSET_READ_MOSTLY = hipMemAdviseUnsetReadMostly, + UNSET_PREFERRED_LOC = hipMemAdviseUnsetPreferredLocation, + UNSET_ACCESSED_BY = hipMemAdviseUnsetAccessedBy +#else READ_MOSTLY = cudaMemAdviseSetReadMostly, PREFERRED_LOC = cudaMemAdviseSetPreferredLocation, ACCESSED_BY = cudaMemAdviseSetAccessedBy, UNSET_READ_MOSTLY = cudaMemAdviseUnsetReadMostly, UNSET_PREFERRED_LOC = cudaMemAdviseUnsetPreferredLocation, UNSET_ACCESSED_BY = cudaMemAdviseUnsetAccessedBy +#endif }; // Underlying implementation template void __advise_impl(const T* data, std::size_t size, ManagedAdvice advice, int device) { -#if CUDART_VERSION >= 13000 +#if defined(USE_HIP) + // HIP uses the 4-arg form: hipMemAdvise(const void*, size_t, hipMemoryAdvise, int) + hipMemAdvise(static_cast(data), size * sizeof(T), static_cast(advice), device); +#elif CUDART_VERSION >= 13000 cudaMemLocation loc{}; loc.type = cudaMemLocationTypeDevice; loc.id = device; diff --git a/src/core/utils/cuda_to_hip.h b/src/core/utils/cuda_to_hip.h new file mode 100644 index 00000000..8c3c314c --- /dev/null +++ b/src/core/utils/cuda_to_hip.h @@ -0,0 +1,123 @@ +// Copyright (c) 2021, SBEL GPU Development Team +// Copyright (c) 2021, University of Wisconsin - Madison +// Copyright (c) 2026, Advanced Micro Devices, Inc. +// +// SPDX-License-Identifier: BSD-3-Clause + +// CUDA-to-HIP compatibility header for DEM-Engine. +// On ROCm this aliases CUDA runtime API symbols to their HIP equivalents. +// On CUDA this is a passthrough to the standard CUDA runtime. + +#ifndef DEME_CUDA_TO_HIP_H +#define DEME_CUDA_TO_HIP_H + +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + +#include + +// Runtime API types +#define cudaStream_t hipStream_t +#define cudaEvent_t hipEvent_t +#define cudaError_t hipError_t +#define cudaDeviceProp hipDeviceProp_t + +// Error codes +#define cudaSuccess hipSuccess +#define cudaErrorInvalidValue hipErrorInvalidValue +#define cudaErrorMemoryAllocation hipErrorMemoryAllocation +#define cudaErrorNotSupported hipErrorNotSupported + +// Memory management +#define cudaMalloc hipMalloc +#define cudaFree hipFree +#define cudaMallocManaged hipMallocManaged +#define cudaMallocHost hipHostMalloc +#define cudaHostAlloc hipHostMalloc +#define cudaFreeHost hipHostFree +#define cudaMemcpy hipMemcpy +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemset hipMemset +#define cudaMemsetAsync hipMemsetAsync +#define cudaMemPrefetchAsync hipMemPrefetchAsync + +// Memory copy kinds +#define cudaMemcpyHostToHost hipMemcpyHostToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyDefault hipMemcpyDefault + +// Memory attach flags +#define cudaMemAttachGlobal hipMemAttachGlobal + +// Host alloc flags +#define cudaHostAllocDefault hipHostMallocDefault + +// Stream management +#define cudaStreamCreate hipStreamCreate +#define cudaStreamDestroy hipStreamDestroy +#define cudaStreamSynchronize hipStreamSynchronize + +// Device management +#define cudaGetDevice hipGetDevice +#define cudaSetDevice hipSetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaDeviceReset hipDeviceReset + +// Error handling +#define cudaGetLastError hipGetLastError +#define cudaGetErrorString hipGetErrorString +#define cudaPeekAtLastError hipPeekAtLastError + +// Pointer attributes +#define cudaPointerAttributes hipPointerAttribute_t +#define cudaPointerGetAttributes hipPointerGetAttributes + +// Memory types - use a namespace alias to preserve scoped access syntax. +// In hiprtc JIT kernels, hipMemoryType may not be available (device-only runtime). +// Provide a minimal fallback. +#ifdef __HIPCC_RTC__ + // hiprtc device-only context: define our own enum for compatibility + enum hipMemoryType { hipMemoryTypeUnregistered = 0, hipMemoryTypeHost = 1, + hipMemoryTypeDevice = 2, hipMemoryTypeManaged = 3 }; +#endif + +namespace cudaMemoryType { + constexpr hipMemoryType cudaMemoryTypeUnregistered = hipMemoryTypeUnregistered; + constexpr hipMemoryType cudaMemoryTypeHost = hipMemoryTypeHost; + constexpr hipMemoryType cudaMemoryTypeDevice = hipMemoryTypeDevice; + constexpr hipMemoryType cudaMemoryTypeManaged = hipMemoryTypeManaged; +} + +// Memory advise +#define cudaMemAdvise hipMemAdvise +#define cudaMemoryAdvise hipMemoryAdvise + +// CUB -> hipCUB +#define cub hipcub + +// cuRAND device headers -> hipRAND (not needed in this shim, handled in API.h) + +// CUDA floating-point intrinsics with directed rounding. +// HIP/AMD does not provide exact equivalents; use standard operations. +// The _ru suffix means "round up" (toward +infinity), _rd means "round down", +// _rn means "round to nearest", _rz means "round toward zero". +// For most physics simulations, the difference is negligible. +#if defined(__HIP_DEVICE_COMPILE__) || defined(__HIPCC__) + __device__ inline float __frcp_ru(float x) { return 1.0f / x; } + __device__ inline double __drcp_ru(double x) { return 1.0 / x; } + __device__ inline float __fmul_ru(float x, float y) { return x * y; } + __device__ inline double __dmul_ru(double x, double y) { return x * y; } + __device__ inline float __fadd_ru(float x, float y) { return x + y; } + __device__ inline double __dadd_ru(double x, double y) { return x + y; } +#endif + +#else // CUDA + +#include + +#endif // USE_HIP + +#endif // DEME_CUDA_TO_HIP_H diff --git a/src/kernel/CUDAMathHelpers.cuh b/src/kernel/CUDAMathHelpers.cuh index bb523c3d..167bc53d 100644 --- a/src/kernel/CUDAMathHelpers.cuh +++ b/src/kernel/CUDAMathHelpers.cuh @@ -45,7 +45,7 @@ #undef strtok_r #endif -#include "cuda_runtime.h" +#include #ifndef EXIT_WAIVED #define EXIT_WAIVED 2 @@ -54,7 +54,7 @@ using uint = unsigned int; using ushort = unsigned short; -#ifndef __CUDACC__ +#if !defined(__CUDACC__) && !defined(__HIPCC_RTC__) && !defined(__HIP_DEVICE_COMPILE__) //////////////////////////////////////////////////////////////////////////////// // override implementations of CUDA functions //////////////////////////////////////////////////////////////////////////////// @@ -241,8 +241,12 @@ inline __host__ __device__ int4 operator-(int4& a) { //////////////////////////////////////////////////////////////////////////////// // addition +// Note: HIP provides operator+, operator+= for HIP_vector_type in +// amd_hip_vector_types.h, so we guard vector-vector operators to avoid +// ambiguity. Scalar versions may still be needed. //////////////////////////////////////////////////////////////////////////////// +#ifndef __HIP_PLATFORM_AMD__ inline __host__ __device__ float2 operator+(float2 a, float2 b) { return make_float2(a.x + b.x, a.y + b.y); } @@ -250,6 +254,7 @@ inline __host__ __device__ void operator+=(float2& a, float2 b) { a.x += b.x; a.y += b.y; } +#endif inline __host__ __device__ float2 operator+(float2 a, float b) { return make_float2(a.x + b, a.y + b); } @@ -297,6 +302,7 @@ inline __host__ __device__ void operator+=(uint2& a, uint b) { a.y += b; } +#ifndef __HIP_PLATFORM_AMD__ inline __host__ __device__ float3 operator+(float3 a, float3 b) { return make_float3(a.x + b.x, a.y + b.y, a.z + b.z); } @@ -305,6 +311,7 @@ inline __host__ __device__ void operator+=(float3& a, float3 b) { a.y += b.y; a.z += b.z; } +#endif inline __host__ __device__ float3 operator+(float3 a, float b) { return make_float3(a.x + b, a.y + b, a.z + b); } @@ -358,6 +365,7 @@ inline __host__ __device__ float3 operator+(float b, float3 a) { return make_float3(a.x + b, a.y + b, a.z + b); } +#ifndef __HIP_PLATFORM_AMD__ inline __host__ __device__ float4 operator+(float4 a, float4 b) { return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } @@ -367,6 +375,7 @@ inline __host__ __device__ void operator+=(float4& a, float4 b) { a.z += b.z; a.w += b.w; } +#endif inline __host__ __device__ float4 operator+(float4 a, float b) { return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); } @@ -428,6 +437,7 @@ inline __host__ __device__ void operator+=(uint4& a, uint b) { // subtract //////////////////////////////////////////////////////////////////////////////// +#ifndef __HIP_PLATFORM_AMD__ inline __host__ __device__ float2 operator-(float2 a, float2 b) { return make_float2(a.x - b.x, a.y - b.y); } @@ -435,6 +445,7 @@ inline __host__ __device__ void operator-=(float2& a, float2 b) { a.x -= b.x; a.y -= b.y; } +#endif inline __host__ __device__ float2 operator-(float2 a, float b) { return make_float2(a.x - b, a.y - b); } @@ -482,6 +493,7 @@ inline __host__ __device__ void operator-=(uint2& a, uint b) { a.y -= b; } +#ifndef __HIP_PLATFORM_AMD__ inline __host__ __device__ float3 operator-(float3 a, float3 b) { return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); } @@ -490,6 +502,7 @@ inline __host__ __device__ void operator-=(float3& a, float3 b) { a.y -= b.y; a.z -= b.z; } +#endif inline __host__ __device__ float3 operator-(float3 a, float b) { return make_float3(a.x - b, a.y - b, a.z - b); } @@ -542,6 +555,7 @@ inline __host__ __device__ void operator-=(uint3& a, uint b) { a.z -= b; } +#ifndef __HIP_PLATFORM_AMD__ inline __host__ __device__ float4 operator-(float4 a, float4 b) { return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); } @@ -551,6 +565,7 @@ inline __host__ __device__ void operator-=(float4& a, float4 b) { a.z -= b.z; a.w -= b.w; } +#endif inline __host__ __device__ float4 operator-(float4 a, float b) { return make_float4(a.x - b, a.y - b, a.z - b, a.w - b); } @@ -607,8 +622,11 @@ inline __host__ __device__ void operator-=(uint4& a, uint b) { //////////////////////////////////////////////////////////////////////////////// // multiply +// Note: HIP provides operator*(floatN, floatN) in amd_hip_vector_types.h +// so we guard the vector-to-vector operators to avoid ambiguity //////////////////////////////////////////////////////////////////////////////// +#ifndef __HIP_PLATFORM_AMD__ inline __host__ __device__ float2 operator*(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); } @@ -616,6 +634,7 @@ inline __host__ __device__ void operator*=(float2& a, float2 b) { a.x *= b.x; a.y *= b.y; } +#endif inline __host__ __device__ float2 operator*(float2 a, float b) { return make_float2(a.x * b, a.y * b); } @@ -663,6 +682,7 @@ inline __host__ __device__ void operator*=(uint2& a, uint b) { a.y *= b; } +#ifndef __HIP_PLATFORM_AMD__ inline __host__ __device__ float3 operator*(float3 a, float3 b) { return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); } @@ -671,6 +691,7 @@ inline __host__ __device__ void operator*=(float3& a, float3 b) { a.y *= b.y; a.z *= b.z; } +#endif inline __host__ __device__ float3 operator*(float3 a, float b) { return make_float3(a.x * b, a.y * b, a.z * b); } @@ -723,6 +744,7 @@ inline __host__ __device__ void operator*=(uint3& a, uint b) { a.z *= b; } +#ifndef __HIP_PLATFORM_AMD__ inline __host__ __device__ float4 operator*(float4 a, float4 b) { return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); } @@ -732,6 +754,7 @@ inline __host__ __device__ void operator*=(float4& a, float4 b) { a.z *= b.z; a.w *= b.w; } +#endif inline __host__ __device__ float4 operator*(float4 a, float b) { return make_float4(a.x * b, a.y * b, a.z * b, a.w * b); } @@ -791,8 +814,10 @@ inline __host__ __device__ void operator*=(uint4& a, uint b) { //////////////////////////////////////////////////////////////////////////////// // divide +// Note: HIP provides operator/(floatN, floatN) in amd_hip_vector_types.h //////////////////////////////////////////////////////////////////////////////// +#ifndef __HIP_PLATFORM_AMD__ inline __host__ __device__ float2 operator/(float2 a, float2 b) { return make_float2(a.x / b.x, a.y / b.y); } @@ -800,6 +825,7 @@ inline __host__ __device__ void operator/=(float2& a, float2 b) { a.x /= b.x; a.y /= b.y; } +#endif inline __host__ __device__ float2 operator/(float2 a, float b) { return make_float2(a.x / b, a.y / b); } @@ -811,6 +837,7 @@ inline __host__ __device__ float2 operator/(float b, float2 a) { return make_float2(b / a.x, b / a.y); } +#ifndef __HIP_PLATFORM_AMD__ inline __host__ __device__ float3 operator/(float3 a, float3 b) { return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); } @@ -819,6 +846,7 @@ inline __host__ __device__ void operator/=(float3& a, float3 b) { a.y /= b.y; a.z /= b.z; } +#endif inline __host__ __device__ float3 operator/(float3 a, float b) { return make_float3(a.x / b, a.y / b, a.z / b); } @@ -831,6 +859,7 @@ inline __host__ __device__ float3 operator/(float b, float3 a) { return make_float3(b / a.x, b / a.y, b / a.z); } +#ifndef __HIP_PLATFORM_AMD__ inline __host__ __device__ float4 operator/(float4 a, float4 b) { return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); } @@ -840,6 +869,7 @@ inline __host__ __device__ void operator/=(float4& a, float4 b) { a.z /= b.z; a.w /= b.w; } +#endif inline __host__ __device__ float4 operator/(float4 a, float b) { return make_float4(a.x / b, a.y / b, a.z / b, a.w / b); } @@ -1453,11 +1483,20 @@ inline __host__ __device__ T2 to_real3(const T1& a) { } // Cause an error inside a kernel -#define DEME_ABORT_KERNEL(...) \ - { \ - printf(__VA_ARGS__); \ - __threadfence(); \ - asm volatile("trap;"); \ - } +#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP) + #define DEME_ABORT_KERNEL(...) \ + { \ + printf(__VA_ARGS__); \ + __threadfence(); \ + __builtin_trap(); \ + } +#else + #define DEME_ABORT_KERNEL(...) \ + { \ + printf(__VA_ARGS__); \ + __threadfence(); \ + asm volatile("trap;"); \ + } +#endif #endif