Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 95 additions & 37 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,29 @@ set(DEME_VERSION_MAJOR 2)
set(DEME_VERSION_MINOR 1)
set(DEME_VERSION_PATCH 0)

project(
Chrono-DEM-Engine
VERSION ${DEME_VERSION_MAJOR}.${DEME_VERSION_MINOR}.${DEME_VERSION_PATCH}
LANGUAGES CXX CUDA
)
# HIP/ROCm support option (must be set before project() to select the right language)
option(USE_HIP "Build with HIP for AMD GPUs" OFF)

# Pick the GPU language for this build. USE_HIP has to be known at this point
# because project() is the call that actually enables the language.
if(USE_HIP)
  set(DEME_GPU_LANGUAGE HIP)
  # Apply the default HIP architecture BEFORE project(). Enabling the HIP
  # language makes CMake initialize CMAKE_HIP_ARCHITECTURES on its own (from the
  # HIPARCHS environment variable, then rocm_agent_enumerator, then the compiler
  # default), so a fallback applied after project() would never take effect.
  # An explicit -DCMAKE_HIP_ARCHITECTURES=... or HIPARCHS still takes precedence.
  if(NOT DEFINED ENV{HIPARCHS}
     AND (NOT DEFINED CMAKE_HIP_ARCHITECTURES OR "${CMAKE_HIP_ARCHITECTURES}" STREQUAL ""))
    set(CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "HIP architectures")
  endif()
else()
  set(DEME_GPU_LANGUAGE CUDA)
endif()

# Single project() call shared by both backends; only the GPU language differs.
project(
  Chrono-DEM-Engine
  VERSION ${DEME_VERSION_MAJOR}.${DEME_VERSION_MINOR}.${DEME_VERSION_PATCH}
  LANGUAGES CXX ${DEME_GPU_LANGUAGE}
)

if(USE_HIP)
  # Lets the sources select HIP-specific code paths with #if defined(USE_HIP).
  add_compile_definitions(USE_HIP)
  # CXX files that include hip_runtime.h need this defined (HIP compiler defines it automatically)
  add_compile_definitions(__HIP_PLATFORM_AMD__)
endif()

if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
message(STATUS "Setting CMAKE_BUILD_TYPE to 'Release' (default)")
Expand All @@ -41,20 +59,29 @@ include(cmake/FixNinjaColors.cmake)
fix_ninja_colors()


find_package(CUDAToolkit REQUIRED)

# Find CUB library (this might need to be done in source-level config)
find_package(
CUB REQUIRED
HINTS ${CUDAToolkit_ROOT}/lib64/cmake/cub
)
# Locate the GPU toolkit packages that match the selected backend.
if(NOT USE_HIP)
  # CUDA backend: the toolkit itself, then the CUB package shipped inside it
  # (this might need to be done in source-level config).
  find_package(CUDAToolkit REQUIRED)
  find_package(CUB REQUIRED HINTS ${CUDAToolkit_ROOT}/lib64/cmake/cub)
else()
  # HIP backend: hipCUB first, then the HIP package.
  find_package(hipcub REQUIRED)
  find_package(hip REQUIRED)
endif()

# Find NVIDIA's Jitify library
find_path(
NVIDIAJitifyPath
NAMES jitify.hpp
PATHS "${CMAKE_CURRENT_LIST_DIR}/thirdparty/jitify"
)
# Jitify is only looked up for the NVIDIA build; the HIP build uses hiprtc
# directly via the JitKernel abstraction and skips this lookup entirely.
if(NOT USE_HIP)
  find_path(JitifyPath
    NAMES jitify.hpp
    PATHS "${CMAKE_CURRENT_LIST_DIR}/thirdparty/jitify"
  )
  # Mirror the result under the previous variable name for backward compatibility.
  set(NVIDIAJitifyPath ${JitifyPath})
endif()

# Let the user decide if they want to use ChPF
option(USE_CHPF "Toggle the use of ChPF for outputting" OFF)
Expand Down Expand Up @@ -119,8 +146,13 @@ cxx_std_autodetect()
set(ProjectIncludeSource "${CMAKE_CURRENT_SOURCE_DIR}/src")
set(ProjectIncludeGenerated "${CMAKE_BINARY_DIR}/src")

# Global fix for CUDA language bug
include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
# Global fix for CUDA/HIP language bug: add the GPU toolkit headers at
# directory scope so that every target picks them up.
if(NOT USE_HIP)
  include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
else()
  # HIP headers are added for all targets (both CXX and HIP sources).
  include_directories(${HIP_INCLUDE_DIR})
endif()


#------------------------------------------------------------
Expand Down Expand Up @@ -164,22 +196,41 @@ if(USE_CHPF)
target_compile_definitions(simulator_multi_gpu PUBLIC DEME_USE_CHPF)
set(USE_CHPF_STR "ON")

target_link_libraries(simulator_multi_gpu
PUBLIC CUDA::cudart
PUBLIC CUDA::nvrtc
PUBLIC CUDA::cuda_driver
PUBLIC ${ChPF_IMPORTED_NAME}
PUBLIC DEMERuntimeDataHelper
)
# Link the GPU runtime libraries for the selected backend, together with ChPF
# and the runtime data helper. Everything is PUBLIC so it propagates to consumers.
if(NOT USE_HIP)
  target_link_libraries(simulator_multi_gpu
    PUBLIC
      CUDA::cudart
      CUDA::nvrtc
      CUDA::cuda_driver
      ${ChPF_IMPORTED_NAME}
      DEMERuntimeDataHelper
  )
else()
  # hiprtc is located by hand; REQUIRED makes configuration fail if it is missing.
  find_library(HIPRTC_LIB hiprtc HINTS ${ROCM_PATH}/lib /opt/rocm/lib REQUIRED)
  target_link_libraries(simulator_multi_gpu
    PUBLIC
      hip::host
      ${HIPRTC_LIB}
      ${ChPF_IMPORTED_NAME}
      DEMERuntimeDataHelper
  )
endif()
else()
set(USE_CHPF_STR "OFF")

target_link_libraries(simulator_multi_gpu
PUBLIC CUDA::cudart
PUBLIC CUDA::nvrtc
PUBLIC CUDA::cuda_driver
PUBLIC DEMERuntimeDataHelper
)
# Link the GPU runtime libraries for the selected backend, together with the
# runtime data helper. Everything is PUBLIC so it propagates to consumers.
if(NOT USE_HIP)
  target_link_libraries(simulator_multi_gpu
    PUBLIC
      CUDA::cudart
      CUDA::nvrtc
      CUDA::cuda_driver
      DEMERuntimeDataHelper
  )
else()
  # hiprtc is located by hand; REQUIRED makes configuration fail if it is missing.
  find_library(HIPRTC_LIB hiprtc HINTS ${ROCM_PATH}/lib /opt/rocm/lib REQUIRED)
  target_link_libraries(simulator_multi_gpu
    PUBLIC
      hip::host
      ${HIPRTC_LIB}
      DEMERuntimeDataHelper
  )
endif()
endif()

# If use managed arrays, define a macro
Expand All @@ -194,10 +245,17 @@ if(WIN32)
endif()

# Set the linker language of the top-level library target
set_target_properties(simulator_multi_gpu
PROPERTIES
LINKER_LANGUAGE CUDA
)
# Link the library with the driver of the GPU language that was enabled for this build.
if(NOT USE_HIP)
  set_property(TARGET simulator_multi_gpu PROPERTY LINKER_LANGUAGE CUDA)
else()
  set_property(TARGET simulator_multi_gpu PROPERTY LINKER_LANGUAGE HIP)
endif()

# ---------------------------------------------------------------------------- #
# Export and Install The Generated Targets
Expand Down
17 changes: 16 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ __A dual-GPU DEM solver with complex grain geometry support__

DEM-Engine, nicknamed _DEME_, does Discrete Element Method simulations:

- Using up to two GPUs at the same time (works great on consumer _and_ data center GPUs).
- Using up to two GPUs at the same time (works great on consumer _and_ data center GPUs, on both NVIDIA CUDA and AMD ROCm).
- With the particles having complex shapes represented by clumped spheres.
- With support for customizable contact force models (want to add a non-standard cohesive force, or an electrostatic repulsive force? You got this).
- With an emphasis on computational efficiency. As a rule of thumb, using 3-sphere clump elements, simulating 1 million elements for 1 million time steps takes around 1 hour on two RTX 3080s.
Expand Down Expand Up @@ -139,6 +139,21 @@ Some additional troubleshooting tips for building the project:
- If CUB is not found, then you may manually set it in the `ccmake` GUI as `/usr/local/cuda/lib64/cmake/cub`. It may be a slightly different path on your machine or cluster.
- If `libcudacxx` is not found, then you may manually set it in the `ccmake` GUI as `/usr/local/cuda-12.8/targets/x86_64-linux/lib/cmake/libcudacxx`. Depending on your CUDA version it may be a slightly different path on your machine or cluster. You may also try to find these packages using `find`.

### AMD GPUs (ROCm)

_DEME_ also builds and runs on AMD GPUs through ROCm/HIP. Instead of CUDA, install a recent [ROCm](https://rocm.docs.amd.com/) release (the HIP runtime, `hiprtc`, and `hipCUB` are required). You do not need the NVIDIA/jitify submodule for an AMD build; runtime kernel compilation is handled by `hiprtc`.

Configure the project the same way as in the **Linux and WSL** section, but pass `-DUSE_HIP=ON`, and set `CMAKE_HIP_ARCHITECTURES` to your GPU's architecture (for example `gfx90a` for MI200-series, `gfx1100` for RDNA3 desktop, or `gfx1201` for RDNA4). An example:

```
mkdir build
cd build
cmake -DCMAKE_BUILD_TYPE=Release -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx90a -DCMAKE_PREFIX_PATH=/opt/rocm ..
ninja
```

If `CMAKE_HIP_ARCHITECTURES` is left unset it defaults to `gfx90a`. You can list the architecture of an installed device with `rocminfo`. If the ROCm install is not on CMake's default search path, point `-DCMAKE_PREFIX_PATH` at it (e.g. `/opt/rocm`) so `find_package` can locate hip and hipCUB. The demos are then run exactly as in the **Numerical examples** section.

### Windows

The process is similar to [the installation of Chrono](https://api.projectchrono.org/tutorial_install_chrono.html), which you can use as a reference. The steps depend on your choice of tools, and what is listed here is our recommendation.
Expand Down
8 changes: 5 additions & 3 deletions src/DEM/API.h
Original file line number Diff line number Diff line change
Expand Up @@ -1370,12 +1370,12 @@ class DEMSolver {
void PrintKinematicScratchSpaceUsage() const { kT->printScratchSpaceUsage(); }

/// Let dT do this call and return the reduce value of the inspected quantity.
float dTInspectReduce(const std::shared_ptr<jitify::Program>& inspection_kernel,
float dTInspectReduce(const std::shared_ptr<deme::jit::Program>& inspection_kernel,
const std::string& kernel_name,
INSPECT_ENTITY_TYPE thing_to_insp,
CUB_REDUCE_FLAVOR reduce_flavor,
bool all_domain);
float* dTInspectNoReduce(const std::shared_ptr<jitify::Program>& inspection_kernel,
float* dTInspectNoReduce(const std::shared_ptr<deme::jit::Program>& inspection_kernel,
const std::string& kernel_name,
INSPECT_ENTITY_TYPE thing_to_insp,
CUB_REDUCE_FLAVOR reduce_flavor,
Expand Down Expand Up @@ -1509,7 +1509,9 @@ class DEMSolver {
int m_updateFreq = 20;

// The extra libs that the kernels need to include.
std::string kernel_includes = "#include <curand_kernel.h>\n";
// Default: none (curand/hiprand not used by built-in kernels).
// Users can add custom includes via SetKernelInclude() if needed.
std::string kernel_includes = "";

// If and how we should add boundaries to the simulation world upon initialization. Choose between none, all and
// top_open.
Expand Down
15 changes: 14 additions & 1 deletion src/DEM/APIPrivate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2121,7 +2121,20 @@ inline void DEMSolver::equipSimParams(std::unordered_map<std::string, std::strin

// Some constants that we should consider using or not using
// strMap["_nAnalGM_"] = std::to_string(nAnalGM);
strMap["_nActiveLoadingThreads_"] = std::to_string(NUM_ACTIVE_TEMPLATE_LOADING_THREADS);
// Query device warp size at runtime for correct multi-arch support (wave32 vs wave64)
int runtimeWarpSize = DEME_CUDA_WARP_SIZE; // Compile-time fallback
#if defined(USE_HIP)
{
int dev = 0;
cudaDeviceProp prop;
if (cudaGetDevice(&dev) == cudaSuccess && cudaGetDeviceProperties(&prop, dev) == cudaSuccess) {
runtimeWarpSize = prop.warpSize; // 64 on gfx90a, 32 on gfx1100
}
}
#endif
clumpComponentOffset_t nActiveLoadingThreads = static_cast<clumpComponentOffset_t>(
DEME_MIN(DEME_MIN(runtimeWarpSize, DEME_KT_CD_NTHREADS_PER_BLOCK), DEME_NUM_BODIES_PER_BLOCK));
strMap["_nActiveLoadingThreads_"] = std::to_string(nActiveLoadingThreads);
// nTotalBodyTopologies includes clump topologies and ext obj topologies
strMap["_nDistinctMassProperties_"] = std::to_string(nDistinctMassProperties);
strMap["_nJitifiableClumpComponents_"] = std::to_string(nJitifiableClumpComponents);
Expand Down
4 changes: 2 additions & 2 deletions src/DEM/APIPublic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2526,7 +2526,7 @@ void DEMSolver::ClearThreadCollaborationStats() {
dT->nTotalSteps = 0;
}

float DEMSolver::dTInspectReduce(const std::shared_ptr<jitify::Program>& inspection_kernel,
float DEMSolver::dTInspectReduce(const std::shared_ptr<deme::jit::Program>& inspection_kernel,
const std::string& kernel_name,
INSPECT_ENTITY_TYPE thing_to_insp,
CUB_REDUCE_FLAVOR reduce_flavor,
Expand All @@ -2537,7 +2537,7 @@ float DEMSolver::dTInspectReduce(const std::shared_ptr<jitify::Program>& inspect
return (float)(*pRes);
}

float* DEMSolver::dTInspectNoReduce(const std::shared_ptr<jitify::Program>& inspection_kernel,
float* DEMSolver::dTInspectNoReduce(const std::shared_ptr<deme::jit::Program>& inspection_kernel,
const std::string& kernel_name,
INSPECT_ENTITY_TYPE thing_to_insp,
CUB_REDUCE_FLAVOR reduce_flavor,
Expand Down
4 changes: 2 additions & 2 deletions src/DEM/AuxClasses.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -229,10 +229,10 @@ void DEMInspector::Initialize(const std::unordered_map<std::string, std::string>
my_subs["_inRegionPolicy_"] = in_region_specifier;
my_subs["_quantityQueryProcess_"] = inspection_code;
if (thing_to_insp == INSPECT_ENTITY_TYPE::SPHERE) {
inspection_kernel = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
inspection_kernel = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
"DEMSphereQueryKernels", JitHelper::KERNEL_DIR / "DEMSphereQueryKernels.cu", my_subs, options)));
} else if (thing_to_insp == INSPECT_ENTITY_TYPE::CLUMP || thing_to_insp == INSPECT_ENTITY_TYPE::EVERYTHING) {
inspection_kernel = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
inspection_kernel = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
"DEMOwnerQueryKernels", JitHelper::KERNEL_DIR / "DEMOwnerQueryKernels.cu", my_subs, options)));
} else {
std::stringstream ss;
Expand Down
7 changes: 1 addition & 6 deletions src/DEM/AuxClasses.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,6 @@
#include "../core/utils/JitHelper.h"
#include "Defines.h"

// Forward declare jitify::Program to avoid downstream dependency
namespace jitify {
class Program;
}

namespace deme {

class DEMSolver;
Expand All @@ -25,7 +20,7 @@ class DEMDynamicThread;
/// their simulation entites, in a given region.
class DEMInspector {
private:
std::shared_ptr<jitify::Program> inspection_kernel;
std::shared_ptr<deme::jit::Program> inspection_kernel;

std::string inspection_code;
std::string in_region_code;
Expand Down
8 changes: 2 additions & 6 deletions src/DEM/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,13 @@ target_include_directories(
PUBLIC ${ProjectIncludeGenerated}
)

# DEM is pure CXX, no GPU library dependencies.
# CUB/hipcub is only needed by algorithms target.
if(USE_CHPF)
target_link_libraries(
DEM
PUBLIC CUB::CUB
PUBLIC ${ChPF_IMPORTED_NAME}
)
else()
target_link_libraries(
DEM
PUBLIC CUB::CUB
)
endif()

# if(WIN32)
Expand Down
14 changes: 12 additions & 2 deletions src/DEM/Defines.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#include <cmath>

#include "VariableTypes.h"
#include "cuda_runtime.h"
#include <core/utils/cuda_to_hip.h>

// Type-generic min/max helpers. Every use of an argument is parenthesized so that
// operands containing lower-precedence operators (|, &, ?:, ...) are compared and
// returned as a whole, e.g. DEME_MIN(1 | 2, 2) is 2.
// NOTE: the selected argument is evaluated twice, so do not pass expressions with
// side effects (such as i++).
#define DEME_MIN(a, b) (((a) < (b)) ? (a) : (b))
#define DEME_MAX(a, b) (((a) > (b)) ? (a) : (b))
Expand All @@ -29,7 +29,17 @@ namespace deme {
#define DEME_TINY_FLOAT 1e-12 ///< Appears to be very sensitive to even smaller values...
#define DEME_HUGE_FLOAT 1e15
#define DEME_BITS_PER_BYTE 8
#define DEME_CUDA_WARP_SIZE 32
// Warp (wavefront) width as seen by ahead-of-time compiled code:
//   - host code and any non-HIP-device compilation: 32
//   - HIP device code for gfx8/gfx9 targets (wave64 GCN/CDNA): 64
//   - HIP device code for any other target (e.g. AMD RDNA): 32
// ROCm 7.2.x does not provide __AMDGCN_WAVEFRONT_SIZE__, so device code keys off the
// __GFX*__ macros instead. JIT kernels do not rely on this value: the actual device
// warp size is queried at runtime and substituted.
#if !defined(__HIP_DEVICE_COMPILE__)
    #define DEME_WARP_SIZE 32
#elif defined(__GFX8__) || defined(__GFX9__)
    #define DEME_WARP_SIZE 64
#else
    #define DEME_WARP_SIZE 32
#endif
// Old name, kept so existing code keeps compiling
#define DEME_CUDA_WARP_SIZE DEME_WARP_SIZE
#define DEME_MAX_WILDCARD_NUM 16
// In bin--triangle intersection scan, all bins are enlarged by a factor of this following constant, so that no triangle
// lies in between bins and not picked up by any bins.
Expand Down
16 changes: 8 additions & 8 deletions src/DEM/dT.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2519,41 +2519,41 @@ void DEMDynamicThread::jitifyKernels(const std::unordered_map<std::string, std::
const std::vector<std::string>& JitifyOptions) {
// First one is force array preparation kernels
{
prep_force_kernels = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
prep_force_kernels = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
"DEMPrepForceKernels", JitHelper::KERNEL_DIR / "DEMPrepForceKernels.cu", Subs, JitifyOptions)));
}
// Then force calculation kernels
{
cal_force_kernels = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
cal_force_kernels = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
"DEMCalcForceKernels", JitHelper::KERNEL_DIR / "DEMCalcForceKernels.cu", Subs, JitifyOptions)));
}
// Then force accumulation kernels
if (solverFlags.useCubForceCollect) {
collect_force_kernels = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
collect_force_kernels = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
"DEMCollectForceKernels", JitHelper::KERNEL_DIR / "DEMCollectForceKernels.cu", Subs, JitifyOptions)));
} else {
collect_force_kernels = std::make_shared<jitify::Program>(std::move(
collect_force_kernels = std::make_shared<deme::jit::Program>(std::move(
JitHelper::buildProgram("DEMCollectForceKernels_Compact",
JitHelper::KERNEL_DIR / "DEMCollectForceKernels_Compact.cu", Subs, JitifyOptions)));
}
// Then integration kernels
{
integrator_kernels = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
integrator_kernels = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
"DEMIntegrationKernels", JitHelper::KERNEL_DIR / "DEMIntegrationKernels.cu", Subs, JitifyOptions)));
}
// Then kernels that are... wildcards, which make on-the-fly changes to solver data
if (solverFlags.canFamilyChangeOnDevice) {
mod_kernels = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
mod_kernels = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
"DEMModeratorKernels", JitHelper::KERNEL_DIR / "DEMModeratorKernels.cu", Subs, JitifyOptions)));
}
// Then misc kernels
{
misc_kernels = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
misc_kernels = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
"DEMMiscKernels", JitHelper::KERNEL_DIR / "DEMMiscKernels.cu", Subs, JitifyOptions)));
}
}

float* DEMDynamicThread::inspectCall(const std::shared_ptr<jitify::Program>& inspection_kernel,
float* DEMDynamicThread::inspectCall(const std::shared_ptr<deme::jit::Program>& inspection_kernel,
const std::string& kernel_name,
INSPECT_ENTITY_TYPE thing_to_insp,
CUB_REDUCE_FLAVOR reduce_flavor,
Expand Down
Loading