From 3f14058ca3c6ebd2d3ec08625cbc2b8c2aa439cb Mon Sep 17 00:00:00 2001
From: Jeff Daily <jeff.daily@amd.com>
Date: Fri, 19 Jun 2026 23:28:26 +0000
Subject: [PATCH] [ROCm] Add AMD GPU support via ROCm/HIP

This adds AMD GPU support to DEM-Engine through ROCm/HIP, alongside the
existing CUDA path. The NVIDIA build is unchanged: when USE_HIP is off the
project compiles and links exactly as before.

To review, start with src/core/utils/JitKernel.h and its backends. DEM-Engine
compiles its force and contact kernels at runtime. JitKernel is a unified
interface over jitify v1 (NVRTC) on CUDA and hiprtc on ROCm, keeping the same
fluent launch syntax on both. The templated launch() (JitKernel.inl) packs its
arguments into an array of pointers and forwards to a non-template launchRaw();
JitKernel_hip.cpp implements launchRaw() with hipModuleLaunchKernel and
JitKernel_cuda.cpp implements it with jitify's cuLaunchKernel wrapper. Because
hiprtc requires every name expression to be registered before compilation while
the engine registers kernels incrementally, the HIP backend defers compilation
until the first launch. src/core/utils/cuda_to_hip.h aliases the CUDA runtime
API symbols the engine uses to their HIP equivalents on the AMD path, so the
shared device and host code stays single-source.

CMakeLists.txt gains a USE_HIP option. When set, CMake enables the HIP
language, compiles the existing .cu sources as HIP, defines
__HIP_PLATFORM_AMD__, and links hip::host, hiprtc, and hipCUB.
CMAKE_HIP_ARCHITECTURES selects the target GPU and defaults to gfx90a. The
NVIDIA/jitify submodule is only required for CUDA builds. DEME_WARP_SIZE is 64
only for AMD wave64 device code (gfx8/gfx9) and 32 elsewhere; the per-launch
warp size is queried at runtime for JIT kernels.

Test Plan:

Built and exercised on three AMD GPUs running the engine's demo suite,
including the runtime-compiled force and contact-detection kernels: an Instinct
MI250X (gfx90a, CDNA2) and a Radeon RX 7900 XTX (gfx1100, RDNA3) on Linux, and
a Radeon RX 9070 XT (gfx1201, RDNA4) on Windows.

```
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx90a -DCMAKE_PREFIX_PATH=/opt/rocm ..
ninja
```

The CUDA build is unaffected; it was reconfirmed by compiling and linking the
full project (all demos) with nvcc (CUDA 12.8, USE_HIP=OFF).

This work was authored with the assistance of Claude, an AI assistant by
Anthropic.
---
 CMakeLists.txt                              | 132 +++++--
 README.md                                   |  17 +-
 src/DEM/API.h                               |   8 +-
 src/DEM/APIPrivate.cpp                      |  15 +-
 src/DEM/APIPublic.cpp                       |   4 +-
 src/DEM/AuxClasses.cpp                      |   4 +-
 src/DEM/AuxClasses.h                        |   7 +-
 src/DEM/CMakeLists.txt                      |   8 +-
 src/DEM/Defines.h                           |  14 +-
 src/DEM/dT.cpp                              |  16 +-
 src/DEM/dT.h                                |  23 +-
 src/DEM/kT.cpp                              |  12 +-
 src/DEM/kT.h                                |  18 +-
 src/algorithms/CMakeLists.txt               |  58 ++-
 src/algorithms/DEMCubContactDetection.cu    |  15 +-
 src/algorithms/DEMCubForceCollection.cu     |   7 +-
 src/algorithms/DEMCubInstantiations.cu      |   5 +
 src/algorithms/DEMCubWrappers.cu            |   9 +
 src/algorithms/DEMStaticDeviceSubroutines.h |  12 +-
 src/core/CMakeLists.txt                     |  50 ++-
 src/core/utils/CudaAllocator.hpp            |   2 +-
 src/core/utils/GpuError.h                   |   2 +-
 src/core/utils/GpuManager.h                 |   2 +-
 src/core/utils/JitHelper.cpp                | 155 +++++---
 src/core/utils/JitHelper.h                  |  14 +-
 src/core/utils/JitKernel.h                  | 189 +++++++++
 src/core/utils/JitKernel.inl                |  64 +++
 src/core/utils/JitKernel_cuda.cpp           | 205 ++++++++++
 src/core/utils/JitKernel_hip.cpp            | 406 ++++++++++++++++++++
 src/core/utils/ManagedMemory.hpp            |  21 +-
 src/core/utils/cuda_to_hip.h                | 123 ++++++
 src/kernel/CUDAMathHelpers.cuh              |  55 ++-
 32 files changed, 1448 insertions(+), 224 deletions(-)
 create mode 100644 src/core/utils/JitKernel.h
 create mode 100644 src/core/utils/JitKernel.inl
 create mode 100644 src/core/utils/JitKernel_cuda.cpp
 create mode 100644 src/core/utils/JitKernel_hip.cpp
 create mode 100644 src/core/utils/cuda_to_hip.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7f805145..a1d2b2cc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,11 +14,29 @@ set(DEME_VERSION_MAJOR   2)
 set(DEME_VERSION_MINOR   1)
 set(DEME_VERSION_PATCH   0)
 
-project(
-	Chrono-DEM-Engine
-	VERSION ${DEME_VERSION_MAJOR}.${DEME_VERSION_MINOR}.${DEME_VERSION_PATCH}
-	LANGUAGES CXX CUDA
-)
+# HIP/ROCm support option (must be set before project() to select the right language)
+option(USE_HIP "Build with HIP for AMD GPUs" OFF)
+
+if(USE_HIP)
+	project(
+		Chrono-DEM-Engine
+		VERSION ${DEME_VERSION_MAJOR}.${DEME_VERSION_MINOR}.${DEME_VERSION_PATCH}
+		LANGUAGES CXX HIP
+	)
+	# Set default HIP architecture if not specified
+	if(NOT DEFINED CMAKE_HIP_ARCHITECTURES OR CMAKE_HIP_ARCHITECTURES STREQUAL "")
+		set(CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "HIP architectures")
+	endif()
+	add_compile_definitions(USE_HIP)
+	# CXX files that include hip_runtime.h need this defined (HIP compiler defines it automatically)
+	add_compile_definitions(__HIP_PLATFORM_AMD__)
+else()
+	project(
+		Chrono-DEM-Engine
+		VERSION ${DEME_VERSION_MAJOR}.${DEME_VERSION_MINOR}.${DEME_VERSION_PATCH}
+		LANGUAGES CXX CUDA
+	)
+endif()
 
 if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
   message(STATUS "Setting CMAKE_BUILD_TYPE to 'Release' (default)")
@@ -41,20 +59,29 @@ include(cmake/FixNinjaColors.cmake)
 fix_ninja_colors()
 
 
-find_package(CUDAToolkit REQUIRED)
-
-# Find CUB library (this might need to be done in source-level config)
-find_package(
-	CUB REQUIRED
-	HINTS ${CUDAToolkit_ROOT}/lib64/cmake/cub
-)
+if(USE_HIP)
+	# Find hipCUB for HIP builds
+	find_package(hipcub REQUIRED)
+	find_package(hip REQUIRED)
+else()
+	find_package(CUDAToolkit REQUIRED)
+	# Find CUB library (this might need to be done in source-level config)
+	find_package(
+		CUB REQUIRED
+		HINTS ${CUDAToolkit_ROOT}/lib64/cmake/cub
+	)
+endif()
 
-# Find NVIDIA's Jitify library 
-find_path(
-	NVIDIAJitifyPath
-	NAMES jitify.hpp
-	PATHS "${CMAKE_CURRENT_LIST_DIR}/thirdparty/jitify"
-)
+# Find Jitify library (NVIDIA version only; HIP uses hiprtc directly via JitKernel abstraction)
+if(NOT USE_HIP)
+	find_path(
+		JitifyPath
+		NAMES jitify.hpp
+		PATHS "${CMAKE_CURRENT_LIST_DIR}/thirdparty/jitify"
+	)
+	# Keep backward compat alias
+	set(NVIDIAJitifyPath ${JitifyPath})
+endif()
 
 # Let the user decide if they want to use ChPF
 option(USE_CHPF "Toggle the use of ChPF for outputting" OFF)
@@ -119,8 +146,13 @@ cxx_std_autodetect()
 set(ProjectIncludeSource "${CMAKE_CURRENT_SOURCE_DIR}/src")
 set(ProjectIncludeGenerated "${CMAKE_BINARY_DIR}/src")
 
-# Global fix for CUDA language bug
-include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+# Global fix for CUDA/HIP language bug
+if(USE_HIP)
+	# Add HIP include directories for all targets (both CXX and HIP)
+	include_directories(${HIP_INCLUDE_DIR})
+else()
+	include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+endif()
 
 
 #------------------------------------------------------------
@@ -164,22 +196,41 @@ if(USE_CHPF)
     target_compile_definitions(simulator_multi_gpu PUBLIC DEME_USE_CHPF)
 	set(USE_CHPF_STR "ON")
 
-	target_link_libraries(simulator_multi_gpu
-		PUBLIC CUDA::cudart
-		PUBLIC CUDA::nvrtc
-		PUBLIC CUDA::cuda_driver
-		PUBLIC ${ChPF_IMPORTED_NAME}
-		PUBLIC DEMERuntimeDataHelper
-	)
+	if(USE_HIP)
+		find_library(HIPRTC_LIB hiprtc HINTS ${ROCM_PATH}/lib /opt/rocm/lib REQUIRED)
+		target_link_libraries(simulator_multi_gpu
+			PUBLIC hip::host
+			PUBLIC ${HIPRTC_LIB}
+			PUBLIC ${ChPF_IMPORTED_NAME}
+			PUBLIC DEMERuntimeDataHelper
+		)
+	else()
+		target_link_libraries(simulator_multi_gpu
+			PUBLIC CUDA::cudart
+			PUBLIC CUDA::nvrtc
+			PUBLIC CUDA::cuda_driver
+			PUBLIC ${ChPF_IMPORTED_NAME}
+			PUBLIC DEMERuntimeDataHelper
+		)
+	endif()
 else()
 	set(USE_CHPF_STR "OFF")
 
-	target_link_libraries(simulator_multi_gpu
-		PUBLIC CUDA::cudart
-		PUBLIC CUDA::nvrtc
-		PUBLIC CUDA::cuda_driver
-		PUBLIC DEMERuntimeDataHelper
-	)
+	if(USE_HIP)
+		find_library(HIPRTC_LIB hiprtc HINTS ${ROCM_PATH}/lib /opt/rocm/lib REQUIRED)
+		target_link_libraries(simulator_multi_gpu
+			PUBLIC hip::host
+			PUBLIC ${HIPRTC_LIB}
+			PUBLIC DEMERuntimeDataHelper
+		)
+	else()
+		target_link_libraries(simulator_multi_gpu
+			PUBLIC CUDA::cudart
+			PUBLIC CUDA::nvrtc
+			PUBLIC CUDA::cuda_driver
+			PUBLIC DEMERuntimeDataHelper
+		)
+	endif()
 endif()
 
 # If use managed arrays, define a macro
@@ -194,10 +245,17 @@ if(WIN32)
 endif()
 
 # Attach include directories to the top-level library target
-set_target_properties(simulator_multi_gpu
-	PROPERTIES
-	LINKER_LANGUAGE CUDA
-)
+if(USE_HIP)
+	set_target_properties(simulator_multi_gpu
+		PROPERTIES
+		LINKER_LANGUAGE HIP
+	)
+else()
+	set_target_properties(simulator_multi_gpu
+		PROPERTIES
+		LINKER_LANGUAGE CUDA
+	)
+endif()
 
 # ---------------------------------------------------------------------------- #
 # Export and Install The Generated Targets 
diff --git a/README.md b/README.md
index f6d3f9b0..45011764 100644
--- a/README.md
+++ b/README.md
@@ -42,7 +42,7 @@ __A dual-GPU DEM solver with complex grain geometry support__
 
 DEM-Engine, nicknamed _DEME_, does Discrete Element Method simulations:
 
-- Using up to two GPUs at the same time (works great on consumer _and_ data center GPUs).
+- Using up to two GPUs at the same time (works great on consumer _and_ data center GPUs, on both NVIDIA CUDA and AMD ROCm).
 - With the particles having complex shapes represented by clumped spheres.
 - With support for customizable contact force models (want to add a non-standard cohesive force, or an electrostatic repulsive force? You got this).
 - With an emphasis on computational efficiency. As a rule of thumb, using 3-sphere clump elements, simulating 1 million elements for 1 million time steps takes around 1 hour on two RTX 3080s.
@@ -139,6 +139,21 @@ Some additional troubleshooting tips for building the project:
 - If CUB is not found, then you may manually set it in the `ccmake` GUI as `/usr/local/cuda/lib64/cmake/cub`. It may be a slightly different path on your machine or cluster.
 - If `libcudacxx` is not found, then you may manually set it in the `ccmake` GUI as `/usr/local/cuda-12.8/targets/x86_64-linux/lib/cmake/libcudacxx`. Depending on your CUDA version it may be a slightly different path on your machine or cluster. You may also try to find these packages using `find`. 
 
+### AMD GPUs (ROCm)
+
+_DEME_ also builds and runs on AMD GPUs through ROCm/HIP. Instead of CUDA, install a recent [ROCm](https://rocm.docs.amd.com/) release (the HIP runtime, `hiprtc`, and `hipCUB` are required). You do not need the NVIDIA/jitify submodule for an AMD build; runtime kernel compilation is handled by `hiprtc`.
+
+Configure the project the same way as in the **Linux and WSL** section, but pass `-DUSE_HIP=ON`, and set `CMAKE_HIP_ARCHITECTURES` to your GPU's architecture (for example `gfx90a` for MI200-series, `gfx1100` for RDNA3 desktop, or `gfx1201` for RDNA4). An example:
+
+```
+mkdir build
+cd build
+cmake -DCMAKE_BUILD_TYPE=Release -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx90a -DCMAKE_PREFIX_PATH=/opt/rocm ..
+ninja
+```
+
+If `CMAKE_HIP_ARCHITECTURES` is left unset it defaults to `gfx90a`. You can list the architecture of an installed device with `rocminfo`. If the ROCm install is not on CMake's default search path, point `-DCMAKE_PREFIX_PATH` at it (e.g. `/opt/rocm`) so `find_package` can locate hip and hipCUB. The demos are then run exactly as in the **Numerical examples** section.
+
 ### Windows
 
 The process is similar to [the installation of Chrono](https://api.projectchrono.org/tutorial_install_chrono.html), which you can use as reference. The steps depend on your choice of tools, and what listed here are our recommendation.
diff --git a/src/DEM/API.h b/src/DEM/API.h
index c1ff1665..67575c3d 100644
--- a/src/DEM/API.h
+++ b/src/DEM/API.h
@@ -1370,12 +1370,12 @@ class DEMSolver {
     void PrintKinematicScratchSpaceUsage() const { kT->printScratchSpaceUsage(); }
 
     /// Let dT do this call and return the reduce value of the inspected quantity.
-    float dTInspectReduce(const std::shared_ptr<jitify::Program>& inspection_kernel,
+    float dTInspectReduce(const std::shared_ptr<deme::jit::Program>& inspection_kernel,
                           const std::string& kernel_name,
                           INSPECT_ENTITY_TYPE thing_to_insp,
                           CUB_REDUCE_FLAVOR reduce_flavor,
                           bool all_domain);
-    float* dTInspectNoReduce(const std::shared_ptr<jitify::Program>& inspection_kernel,
+    float* dTInspectNoReduce(const std::shared_ptr<deme::jit::Program>& inspection_kernel,
                              const std::string& kernel_name,
                              INSPECT_ENTITY_TYPE thing_to_insp,
                              CUB_REDUCE_FLAVOR reduce_flavor,
@@ -1509,7 +1509,9 @@ class DEMSolver {
     int m_updateFreq = 20;
 
     // The extra libs that the kernels need to include.
-    std::string kernel_includes = "#include <curand_kernel.h>\n";
+    // Default: none (curand/hiprand not used by built-in kernels).
+    // Users can add custom includes via SetKernelInclude() if needed.
+    std::string kernel_includes = "";
 
     // If and how we should add boundaries to the simulation world upon initialization. Choose between none, all and
     // top_open.
diff --git a/src/DEM/APIPrivate.cpp b/src/DEM/APIPrivate.cpp
index 08250829..eebab753 100644
--- a/src/DEM/APIPrivate.cpp
+++ b/src/DEM/APIPrivate.cpp
@@ -2121,7 +2121,20 @@ inline void DEMSolver::equipSimParams(std::unordered_map<std::string, std::strin
 
     // Some constants that we should consider using or not using
     // strMap["_nAnalGM_"] = std::to_string(nAnalGM);
-    strMap["_nActiveLoadingThreads_"] = std::to_string(NUM_ACTIVE_TEMPLATE_LOADING_THREADS);
+    // Query device warp size at runtime for correct multi-arch support (wave32 vs wave64)
+    int runtimeWarpSize = DEME_CUDA_WARP_SIZE;  // Compile-time fallback
+#if defined(USE_HIP)
+    {
+        int dev = 0;
+        cudaDeviceProp prop;
+        if (cudaGetDevice(&dev) == cudaSuccess && cudaGetDeviceProperties(&prop, dev) == cudaSuccess) {
+            runtimeWarpSize = prop.warpSize;  // 64 on gfx90a, 32 on gfx1100
+        }
+    }
+#endif
+    clumpComponentOffset_t nActiveLoadingThreads = static_cast<clumpComponentOffset_t>(
+        DEME_MIN(DEME_MIN(runtimeWarpSize, DEME_KT_CD_NTHREADS_PER_BLOCK), DEME_NUM_BODIES_PER_BLOCK));
+    strMap["_nActiveLoadingThreads_"] = std::to_string(nActiveLoadingThreads);
     // nTotalBodyTopologies includes clump topologies and ext obj topologies
     strMap["_nDistinctMassProperties_"] = std::to_string(nDistinctMassProperties);
     strMap["_nJitifiableClumpComponents_"] = std::to_string(nJitifiableClumpComponents);
diff --git a/src/DEM/APIPublic.cpp b/src/DEM/APIPublic.cpp
index 41b5c006..93b37abe 100644
--- a/src/DEM/APIPublic.cpp
+++ b/src/DEM/APIPublic.cpp
@@ -2526,7 +2526,7 @@ void DEMSolver::ClearThreadCollaborationStats() {
     dT->nTotalSteps = 0;
 }
 
-float DEMSolver::dTInspectReduce(const std::shared_ptr<jitify::Program>& inspection_kernel,
+float DEMSolver::dTInspectReduce(const std::shared_ptr<deme::jit::Program>& inspection_kernel,
                                  const std::string& kernel_name,
                                  INSPECT_ENTITY_TYPE thing_to_insp,
                                  CUB_REDUCE_FLAVOR reduce_flavor,
@@ -2537,7 +2537,7 @@ float DEMSolver::dTInspectReduce(const std::shared_ptr<jitify::Program>& inspect
     return (float)(*pRes);
 }
 
-float* DEMSolver::dTInspectNoReduce(const std::shared_ptr<jitify::Program>& inspection_kernel,
+float* DEMSolver::dTInspectNoReduce(const std::shared_ptr<deme::jit::Program>& inspection_kernel,
                                     const std::string& kernel_name,
                                     INSPECT_ENTITY_TYPE thing_to_insp,
                                     CUB_REDUCE_FLAVOR reduce_flavor,
diff --git a/src/DEM/AuxClasses.cpp b/src/DEM/AuxClasses.cpp
index 5e2bb13b..a000f38a 100644
--- a/src/DEM/AuxClasses.cpp
+++ b/src/DEM/AuxClasses.cpp
@@ -229,10 +229,10 @@ void DEMInspector::Initialize(const std::unordered_map<std::string, std::string>
     my_subs["_inRegionPolicy_"] = in_region_specifier;
     my_subs["_quantityQueryProcess_"] = inspection_code;
     if (thing_to_insp == INSPECT_ENTITY_TYPE::SPHERE) {
-        inspection_kernel = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
+        inspection_kernel = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
             "DEMSphereQueryKernels", JitHelper::KERNEL_DIR / "DEMSphereQueryKernels.cu", my_subs, options)));
     } else if (thing_to_insp == INSPECT_ENTITY_TYPE::CLUMP || thing_to_insp == INSPECT_ENTITY_TYPE::EVERYTHING) {
-        inspection_kernel = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
+        inspection_kernel = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
             "DEMOwnerQueryKernels", JitHelper::KERNEL_DIR / "DEMOwnerQueryKernels.cu", my_subs, options)));
     } else {
         std::stringstream ss;
diff --git a/src/DEM/AuxClasses.h b/src/DEM/AuxClasses.h
index 310d1b4b..09baada1 100644
--- a/src/DEM/AuxClasses.h
+++ b/src/DEM/AuxClasses.h
@@ -11,11 +11,6 @@
 #include "../core/utils/JitHelper.h"
 #include "Defines.h"
 
-// Forward declare jitify::Program to avoid downstream dependency
-namespace jitify {
-class Program;
-}
-
 namespace deme {
 
 class DEMSolver;
@@ -25,7 +20,7 @@ class DEMDynamicThread;
 /// their simulation entites, in a given region.
 class DEMInspector {
   private:
-    std::shared_ptr<jitify::Program> inspection_kernel;
+    std::shared_ptr<deme::jit::Program> inspection_kernel;
 
     std::string inspection_code;
     std::string in_region_code;
diff --git a/src/DEM/CMakeLists.txt b/src/DEM/CMakeLists.txt
index 43787a51..7cb96d09 100644
--- a/src/DEM/CMakeLists.txt
+++ b/src/DEM/CMakeLists.txt
@@ -13,17 +13,13 @@ target_include_directories(
 	PUBLIC ${ProjectIncludeGenerated}
 )
 
+# DEM is pure CXX, no GPU library dependencies.
+# CUB/hipcub is only needed by algorithms target.
 if(USE_CHPF)
 	target_link_libraries(
 		DEM
-		PUBLIC CUB::CUB
 		PUBLIC ${ChPF_IMPORTED_NAME}
 	)
-else()
-	target_link_libraries(
-		DEM
-		PUBLIC CUB::CUB
-	)
 endif()
 
 # if(WIN32)
diff --git a/src/DEM/Defines.h b/src/DEM/Defines.h
index c2a8de8e..68ffdfb8 100644
--- a/src/DEM/Defines.h
+++ b/src/DEM/Defines.h
@@ -12,7 +12,7 @@
 #include <cmath>
 
 #include "VariableTypes.h"
-#include "cuda_runtime.h"
+#include <core/utils/cuda_to_hip.h>
 
 #define DEME_MIN(a, b) ((a < b) ? a : b)
 #define DEME_MAX(a, b) ((a > b) ? a : b)
@@ -29,7 +29,17 @@ namespace deme {
 #define DEME_TINY_FLOAT 1e-12                ///< Appears to be very sensitive to even smaller values...
 #define DEME_HUGE_FLOAT 1e15
 #define DEME_BITS_PER_BYTE 8
-#define DEME_CUDA_WARP_SIZE 32
+// Wavefront/warp size: 64 only on AMD wave64 device code (gfx8/gfx9 GCN/CDNA); 32 everywhere
+// else (AMD RDNA, NVIDIA, and host code). ROCm 7.2.x does not provide __AMDGCN_WAVEFRONT_SIZE__,
+// so device code keys off the __GFX*__ macros. For JIT kernels the actual device warp size is
+// queried at runtime and substituted.
+#if defined(__HIP_DEVICE_COMPILE__) && (defined(__GFX8__) || defined(__GFX9__))
+    #define DEME_WARP_SIZE 64
+#else
+    #define DEME_WARP_SIZE 32
+#endif
+// Legacy macro for compatibility
+#define DEME_CUDA_WARP_SIZE DEME_WARP_SIZE
 #define DEME_MAX_WILDCARD_NUM 16
 // In bin--triangle intersection scan, all bins are enlarged by a factor of this following constant, so that no triangle
 // lies in between bins and not picked up by any bins.
diff --git a/src/DEM/dT.cpp b/src/DEM/dT.cpp
index 0d047bb5..09bfc364 100644
--- a/src/DEM/dT.cpp
+++ b/src/DEM/dT.cpp
@@ -2519,41 +2519,41 @@ void DEMDynamicThread::jitifyKernels(const std::unordered_map<std::string, std::
                                      const std::vector<std::string>& JitifyOptions) {
     // First one is force array preparation kernels
     {
-        prep_force_kernels = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
+        prep_force_kernels = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
             "DEMPrepForceKernels", JitHelper::KERNEL_DIR / "DEMPrepForceKernels.cu", Subs, JitifyOptions)));
     }
     // Then force calculation kernels
     {
-        cal_force_kernels = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
+        cal_force_kernels = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
             "DEMCalcForceKernels", JitHelper::KERNEL_DIR / "DEMCalcForceKernels.cu", Subs, JitifyOptions)));
     }
     // Then force accumulation kernels
     if (solverFlags.useCubForceCollect) {
-        collect_force_kernels = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
+        collect_force_kernels = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
             "DEMCollectForceKernels", JitHelper::KERNEL_DIR / "DEMCollectForceKernels.cu", Subs, JitifyOptions)));
     } else {
-        collect_force_kernels = std::make_shared<jitify::Program>(std::move(
+        collect_force_kernels = std::make_shared<deme::jit::Program>(std::move(
             JitHelper::buildProgram("DEMCollectForceKernels_Compact",
                                     JitHelper::KERNEL_DIR / "DEMCollectForceKernels_Compact.cu", Subs, JitifyOptions)));
     }
     // Then integration kernels
     {
-        integrator_kernels = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
+        integrator_kernels = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
             "DEMIntegrationKernels", JitHelper::KERNEL_DIR / "DEMIntegrationKernels.cu", Subs, JitifyOptions)));
     }
     // Then kernels that are... wildcards, which make on-the-fly changes to solver data
     if (solverFlags.canFamilyChangeOnDevice) {
-        mod_kernels = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
+        mod_kernels = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
             "DEMModeratorKernels", JitHelper::KERNEL_DIR / "DEMModeratorKernels.cu", Subs, JitifyOptions)));
     }
     // Then misc kernels
     {
-        misc_kernels = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
+        misc_kernels = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
             "DEMMiscKernels", JitHelper::KERNEL_DIR / "DEMMiscKernels.cu", Subs, JitifyOptions)));
     }
 }
 
-float* DEMDynamicThread::inspectCall(const std::shared_ptr<jitify::Program>& inspection_kernel,
+float* DEMDynamicThread::inspectCall(const std::shared_ptr<deme::jit::Program>& inspection_kernel,
                                      const std::string& kernel_name,
                                      INSPECT_ENTITY_TYPE thing_to_insp,
                                      CUB_REDUCE_FLAVOR reduce_flavor,
diff --git a/src/DEM/dT.h b/src/DEM/dT.h
index 3222088d..80df1866 100644
--- a/src/DEM/dT.h
+++ b/src/DEM/dT.h
@@ -10,6 +10,7 @@
 #include <vector>
 #include <thread>
 #include <unordered_map>
+#include <unordered_set>
 #include <set>
 #include <functional>
 
@@ -23,10 +24,12 @@
 #include "Structs.h"
 #include "AuxClasses.h"
 
-// Forward declare jitify::Program to avoid downstream dependency
-namespace jitify {
+// Forward declare deme::jit::Program
+namespace deme {
+namespace jit {
 class Program;
 }
+}
 
 namespace deme {
 
@@ -631,7 +634,7 @@ class DEMDynamicThread {
                        const std::vector<std::string>& JitifyOptions);
 
     // Execute this kernel, then return the reduced value
-    float* inspectCall(const std::shared_ptr<jitify::Program>& inspection_kernel,
+    float* inspectCall(const std::shared_ptr<deme::jit::Program>& inspection_kernel,
                        const std::string& kernel_name,
                        INSPECT_ENTITY_TYPE thing_to_insp,
                        CUB_REDUCE_FLAVOR reduce_flavor,
@@ -709,13 +712,13 @@ class DEMDynamicThread {
         const std::function<bool(unsigned int, unsigned int, unsigned int, unsigned int)>& condition);
 
     // Just-in-time compiled kernels
-    std::shared_ptr<jitify::Program> prep_force_kernels;
-    std::shared_ptr<jitify::Program> cal_force_kernels;
-    std::shared_ptr<jitify::Program> collect_force_kernels;
-    std::shared_ptr<jitify::Program> integrator_kernels;
-    // std::shared_ptr<jitify::Program> quarry_stats_kernels;
-    std::shared_ptr<jitify::Program> mod_kernels;
-    std::shared_ptr<jitify::Program> misc_kernels;
+    std::shared_ptr<deme::jit::Program> prep_force_kernels;
+    std::shared_ptr<deme::jit::Program> cal_force_kernels;
+    std::shared_ptr<deme::jit::Program> collect_force_kernels;
+    std::shared_ptr<deme::jit::Program> integrator_kernels;
+    // std::shared_ptr<deme::jit::Program> quarry_stats_kernels;
+    std::shared_ptr<deme::jit::Program> mod_kernels;
+    std::shared_ptr<deme::jit::Program> misc_kernels;
 
     // Adjuster for update freq
     class AccumStepUpdater {
diff --git a/src/DEM/kT.cpp b/src/DEM/kT.cpp
index c56815df..2d140d68 100644
--- a/src/DEM/kT.cpp
+++ b/src/DEM/kT.cpp
@@ -880,34 +880,34 @@ void DEMKinematicThread::jitifyKernels(const std::unordered_map<std::string, std
                                        const std::vector<std::string>& JitifyOptions) {
     // First one is bin_sphere_kernels kernels, which figure out the bin--sphere touch pairs
     {
-        bin_sphere_kernels = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
+        bin_sphere_kernels = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
             "DEMBinSphereKernels", JitHelper::KERNEL_DIR / "DEMBinSphereKernels.cu", Subs, JitifyOptions)));
     }
     // Then CD kernels
     {
-        sphere_contact_kernels = std::make_shared<jitify::Program>(std::move(
+        sphere_contact_kernels = std::make_shared<deme::jit::Program>(std::move(
             JitHelper::buildProgram("DEMContactKernels_SphereSphere",
                                     JitHelper::KERNEL_DIR / "DEMContactKernels_SphereSphere.cu", Subs, JitifyOptions)));
     }
     // Then triangle--bin intersection-related kernels
     {
-        bin_triangle_kernels = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
+        bin_triangle_kernels = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
             "DEMBinTriangleKernels", JitHelper::KERNEL_DIR / "DEMBinTriangleKernels.cu", Subs, JitifyOptions)));
     }
     // Then sphere--triangle contact detection-related kernels
     {
-        sphTri_contact_kernels = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
+        sphTri_contact_kernels = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
             "DEMContactKernels_SphereTriangle", JitHelper::KERNEL_DIR / "DEMContactKernels_SphereTriangle.cu", Subs,
             JitifyOptions)));
     }
     // Then contact history mapping kernels
     {
-        history_kernels = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
+        history_kernels = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
             "DEMHistoryMappingKernels", JitHelper::KERNEL_DIR / "DEMHistoryMappingKernels.cu", Subs, JitifyOptions)));
     }
     // Then misc kernels
     {
-        misc_kernels = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
+        misc_kernels = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
             "DEMMiscKernels", JitHelper::KERNEL_DIR / "DEMMiscKernels.cu", Subs, JitifyOptions)));
     }
 }
diff --git a/src/DEM/kT.h b/src/DEM/kT.h
index 54c8ed93..6032b10e 100644
--- a/src/DEM/kT.h
+++ b/src/DEM/kT.h
@@ -23,10 +23,12 @@
 #include "Defines.h"
 #include "Structs.h"
 
-// Forward declare jitify::Program to avoid downstream dependency
-namespace jitify {
+// Forward declare deme::jit::Program
+namespace deme {
+namespace jit {
 class Program;
 }
+}
 
 namespace deme {
 
@@ -408,12 +410,12 @@ class DEMKinematicThread {
 
     // Just-in-time compiled kernels
     // jitify::Program bin_sphere_kernels = JitHelper::buildProgram("bin_sphere_kernels", " ");
-    std::shared_ptr<jitify::Program> bin_sphere_kernels;
-    std::shared_ptr<jitify::Program> bin_triangle_kernels;
-    std::shared_ptr<jitify::Program> sphTri_contact_kernels;
-    std::shared_ptr<jitify::Program> sphere_contact_kernels;
-    std::shared_ptr<jitify::Program> history_kernels;
-    std::shared_ptr<jitify::Program> misc_kernels;
+    std::shared_ptr<deme::jit::Program> bin_sphere_kernels;
+    std::shared_ptr<deme::jit::Program> bin_triangle_kernels;
+    std::shared_ptr<deme::jit::Program> sphTri_contact_kernels;
+    std::shared_ptr<deme::jit::Program> sphere_contact_kernels;
+    std::shared_ptr<deme::jit::Program> history_kernels;
+    std::shared_ptr<deme::jit::Program> misc_kernels;
 
     // Adjuster for bin size
     class AccumTimer {
diff --git a/src/algorithms/CMakeLists.txt b/src/algorithms/CMakeLists.txt
index 796d48f0..ff9a88ca 100644
--- a/src/algorithms/CMakeLists.txt
+++ b/src/algorithms/CMakeLists.txt
@@ -15,10 +15,17 @@ target_include_directories(
 	PUBLIC ${ProjectIncludeGenerated}
 )
 
-target_link_libraries(
-	algorithms
-	PUBLIC CUB::CUB
-)
+if(USE_HIP)
+	target_link_libraries(
+		algorithms
+		PUBLIC hip::hipcub
+	)
+else()
+	target_link_libraries(
+		algorithms
+		PUBLIC CUB::CUB
+	)
+endif()
 
 ### HOST HEADERS ONLY (.h, .hpp) ###
 set(algorithms_interface
@@ -45,23 +52,36 @@ target_sources(
 	PRIVATE ${algorithms_sources}
 )
 
-# Get a list of supported architectures
-include(../../cmake/CudaSupportedArchitectures.cmake)
-cuda_supported_architectures()
+if(USE_HIP)
+	# Mark .cu sources as HIP language
+	set_source_files_properties(${algorithms_sources} PROPERTIES LANGUAGE HIP)
 
-# Convert the cuda supported architectures into their respective CMake properties
-set(virtual_architectures "")
-foreach(arch ${CUDASUP_ARCHITECTURES})
-	list(APPEND virtual_architectures "${arch}-virtual")
-endforeach()
+	set_target_properties(
+		algorithms PROPERTIES
+		POSITION_INDEPENDENT_CODE True
+		CXX_STANDARD ${CXXSTD_SUPPORTED}
+		PUBLIC_HEADER "${algorithms_interface}"
+		HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}"
+	)
+else()
+	# Get a list of supported architectures
+	include(../../cmake/CudaSupportedArchitectures.cmake)
+	cuda_supported_architectures()
 
-set_target_properties(
-	algorithms PROPERTIES 
-	POSITION_INDEPENDENT_CODE True
-	CXX_STANDARD ${CXXSTD_SUPPORTED}
-	PUBLIC_HEADER "${algorithms_interface}"
-	CUDA_ARCHITECTURES "${virtual_architectures}"
-)
+	# Convert the cuda supported architectures into their respective CMake properties
+	set(virtual_architectures "")
+	foreach(arch ${CUDASUP_ARCHITECTURES})
+		list(APPEND virtual_architectures "${arch}-virtual")
+	endforeach()
+
+	set_target_properties(
+		algorithms PROPERTIES
+		POSITION_INDEPENDENT_CODE True
+		CXX_STANDARD ${CXXSTD_SUPPORTED}
+		PUBLIC_HEADER "${algorithms_interface}"
+		CUDA_ARCHITECTURES "${virtual_architectures}"
+	)
+endif()
 
 install(
 	TARGETS algorithms
diff --git a/src/algorithms/DEMCubContactDetection.cu b/src/algorithms/DEMCubContactDetection.cu
index dccc4702..7158b6df 100644
--- a/src/algorithms/DEMCubContactDetection.cu
+++ b/src/algorithms/DEMCubContactDetection.cu
@@ -3,7 +3,12 @@
 //
 //	SPDX-License-Identifier: BSD-3-Clause
 
+#include <core/utils/cuda_to_hip.h>
+#if defined(USE_HIP)
+#include <hipcub/hipcub.hpp>
+#else
 #include <cub/cub.cuh>
+#endif
 // #include <thrust/sort.h>
 #include <kernel/DEMHelperKernels.cuh>
 
@@ -35,11 +40,11 @@ inline void contactEventArraysResize(size_t nContactPairs,
     granData.toDevice();
 }
 
-void contactDetection(std::shared_ptr<jitify::Program>& bin_sphere_kernels,
-                      std::shared_ptr<jitify::Program>& bin_triangle_kernels,
-                      std::shared_ptr<jitify::Program>& sphere_contact_kernels,
-                      std::shared_ptr<jitify::Program>& sphTri_contact_kernels,
-                      std::shared_ptr<jitify::Program>& history_kernels,
+void contactDetection(std::shared_ptr<deme::jit::Program>& bin_sphere_kernels,
+                      std::shared_ptr<deme::jit::Program>& bin_triangle_kernels,
+                      std::shared_ptr<deme::jit::Program>& sphere_contact_kernels,
+                      std::shared_ptr<deme::jit::Program>& sphTri_contact_kernels,
+                      std::shared_ptr<deme::jit::Program>& history_kernels,
                       DualStruct<DEMDataKT>& granData,
                       DualStruct<DEMSimParams>& simParams,
                       SolverFlags& solverFlags,
diff --git a/src/algorithms/DEMCubForceCollection.cu b/src/algorithms/DEMCubForceCollection.cu
index 44c1052f..bf07dc65 100644
--- a/src/algorithms/DEMCubForceCollection.cu
+++ b/src/algorithms/DEMCubForceCollection.cu
@@ -3,7 +3,12 @@
 //
 //	SPDX-License-Identifier: BSD-3-Clause
 
+#include <core/utils/cuda_to_hip.h>
+#if defined(USE_HIP)
+#include <hipcub/hipcub.hpp>
+#else
 #include <cub/cub.cuh>
+#endif
 // #include <thrust/sort.h>
 #include <kernel/DEMHelperKernels.cuh>
 
@@ -16,7 +21,7 @@
 
 namespace deme {
 
-void collectContactForcesThruCub(std::shared_ptr<jitify::Program>& collect_force_kernels,
+void collectContactForcesThruCub(std::shared_ptr<deme::jit::Program>& collect_force_kernels,
                                  DualStruct<DEMDataDT>& granData,
                                  const size_t nContactPairs,
                                  const size_t nClumps,
diff --git a/src/algorithms/DEMCubInstantiations.cu b/src/algorithms/DEMCubInstantiations.cu
index 25735e9d..0e9f0873 100644
--- a/src/algorithms/DEMCubInstantiations.cu
+++ b/src/algorithms/DEMCubInstantiations.cu
@@ -3,7 +3,12 @@
 //
 //	SPDX-License-Identifier: BSD-3-Clause
 
+#include <core/utils/cuda_to_hip.h>
+#if defined(USE_HIP)
+#include <hipcub/hipcub.hpp>
+#else
 #include <cub/cub.cuh>
+#endif
 #include <algorithms/DEMStaticDeviceSubroutines.h>
 
 #include <core/utils/GpuError.h>
diff --git a/src/algorithms/DEMCubWrappers.cu b/src/algorithms/DEMCubWrappers.cu
index 4ecfcbe5..aae5c9ea 100644
--- a/src/algorithms/DEMCubWrappers.cu
+++ b/src/algorithms/DEMCubWrappers.cu
@@ -3,7 +3,16 @@
 //
 //	SPDX-License-Identifier: BSD-3-Clause
 
+#include <core/utils/cuda_to_hip.h>
+#if defined(USE_HIP)
+#include <hipcub/hipcub.hpp>
+// CUB_RUNTIME_FUNCTION is a CUB macro not defined in hipCUB
+#ifndef CUB_RUNTIME_FUNCTION
+#define CUB_RUNTIME_FUNCTION
+#endif
+#else
 #include <cub/cub.cuh>
+#endif
 #include <DEM/Defines.h>
 #include <DEM/Structs.h>
 #include <core/utils/GpuError.h>
diff --git a/src/algorithms/DEMStaticDeviceSubroutines.h b/src/algorithms/DEMStaticDeviceSubroutines.h
index acd0cb4f..d2de6ef7 100644
--- a/src/algorithms/DEMStaticDeviceSubroutines.h
+++ b/src/algorithms/DEMStaticDeviceSubroutines.h
@@ -76,11 +76,11 @@ void cubSortByKey(T1* d_keys_in,
 // For kT and dT's private usage
 ////////////////////////////////////////////////////////////////////////////////
 
-void contactDetection(std::shared_ptr<jitify::Program>& bin_sphere_kernels,
-                      std::shared_ptr<jitify::Program>& bin_triangle_kernels,
-                      std::shared_ptr<jitify::Program>& sphere_contact_kernels,
-                      std::shared_ptr<jitify::Program>& sphTri_contact_kernels,
-                      std::shared_ptr<jitify::Program>& history_kernels,
+void contactDetection(std::shared_ptr<deme::jit::Program>& bin_sphere_kernels,
+                      std::shared_ptr<deme::jit::Program>& bin_triangle_kernels,
+                      std::shared_ptr<deme::jit::Program>& sphere_contact_kernels,
+                      std::shared_ptr<deme::jit::Program>& sphTri_contact_kernels,
+                      std::shared_ptr<deme::jit::Program>& history_kernels,
                       DualStruct<DEMDataKT>& granData,
                       DualStruct<DEMSimParams>& simParams,
                       SolverFlags& solverFlags,
@@ -99,7 +99,7 @@ void contactDetection(std::shared_ptr<jitify::Program>& bin_sphere_kernels,
                       SolverTimers& timers,
                       kTStateParams& stateParams);
 
-void collectContactForcesThruCub(std::shared_ptr<jitify::Program>& collect_force_kernels,
+void collectContactForcesThruCub(std::shared_ptr<deme::jit::Program>& collect_force_kernels,
                                  DualStruct<DEMDataDT>& granData,
                                  const size_t nContactPairs,
                                  const size_t nClumps,
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index c01c748e..a0b5dae9 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -14,12 +14,15 @@ configure_file(
 )
 
 
-message(STATUS "${core_message} Extracting NVIDIA Jitify header...") 
-configure_file(
-	${NVIDIAJitifyPath}/jitify.hpp
-	${CMAKE_BINARY_DIR}/src/jitify/jitify.hpp
-	COPYONLY
-)
+# Jitify is only needed for CUDA builds (HIP uses hiprtc directly via JitKernel abstraction)
+if(NOT USE_HIP)
+	message(STATUS "${core_message} Extracting Jitify header...")
+	configure_file(
+		${JitifyPath}/jitify.hpp
+		${CMAKE_BINARY_DIR}/src/jitify/jitify.hpp
+		COPYONLY
+	)
+endif()
 
 add_library(core OBJECT)
 
@@ -31,17 +34,13 @@ target_include_directories(
 )
 
 
+# Core is pure CXX, no GPU library dependencies.
+# CUB/hipcub is only needed by algorithms target.
 if(USE_CHPF)
 	target_link_libraries(
 		core
-		PUBLIC CUB::CUB
 		INTERFACE ${ChPF_IMPORTED_NAME}
 	)
-else()
-	target_link_libraries(
-		core
-		PUBLIC CUB::CUB
-	)
 endif()
 
 
@@ -50,6 +49,8 @@ set(core_headers
 	${CMAKE_CURRENT_SOURCE_DIR}/utils/CudaAllocator.hpp
 	${CMAKE_CURRENT_SOURCE_DIR}/utils/ManagedMemory.hpp
 	${CMAKE_CURRENT_SOURCE_DIR}/utils/JitHelper.h
+	${CMAKE_CURRENT_SOURCE_DIR}/utils/JitKernel.h
+	${CMAKE_CURRENT_SOURCE_DIR}/utils/JitKernel.inl
 	${CMAKE_CURRENT_SOURCE_DIR}/utils/ThreadManager.h
 	${CMAKE_CURRENT_SOURCE_DIR}/utils/GpuError.h
 	${CMAKE_CURRENT_SOURCE_DIR}/utils/GpuManager.h
@@ -68,6 +69,13 @@ set(core_sources
 	${CMAKE_CURRENT_SOURCE_DIR}/utils/DEMEPaths.cpp
 )
 
+# JitKernel abstraction: select backend based on USE_HIP
+if(USE_HIP)
+	list(APPEND core_sources ${CMAKE_CURRENT_SOURCE_DIR}/utils/JitKernel_hip.cpp)
+else()
+	list(APPEND core_sources ${CMAKE_CURRENT_SOURCE_DIR}/utils/JitKernel_cuda.cpp)
+endif()
+
 target_sources(
 	core
 	PUBLIC ${core_headers} ${core_utils_headers}
@@ -98,13 +106,15 @@ install(
 		${CMAKE_INSTALL_INCLUDEDIR}/core
 )
 
-# Install Third-party Headers 
-install(
-	FILES 
-		"${NVIDIAJitifyPath}/jitify.hpp"
-	DESTINATION 
-		${CMAKE_INSTALL_INCLUDEDIR}/jitify
-)
+# Install Third-party Headers (jitify only for CUDA builds)
+if(NOT USE_HIP)
+	install(
+		FILES
+			"${NVIDIAJitifyPath}/jitify.hpp"
+		DESTINATION
+			${CMAKE_INSTALL_INCLUDEDIR}/jitify
+	)
+endif()
 
 
 # --------------------------------------------------------- #
@@ -114,7 +124,7 @@ install(
 
 message(STATUS "${core_message} Generating Runtime Data helper (build)...")
 set(DEME_RUNTIME_DATA_DIRECTORY "${CMAKE_BINARY_DIR}")
-set(DEME_RUNTIME_INCLUDE_DIRECTORY "${CMAKE_BINARY_DIR}")
+set(DEME_RUNTIME_INCLUDE_DIRECTORY "${CMAKE_SOURCE_DIR}/src")
 configure_file(
 	utils/RuntimeData.cpp.in utils/RuntimeDataBuild.cpp
 	@ONLY
diff --git a/src/core/utils/CudaAllocator.hpp b/src/core/utils/CudaAllocator.hpp
index e05b8942..32a576aa 100644
--- a/src/core/utils/CudaAllocator.hpp
+++ b/src/core/utils/CudaAllocator.hpp
@@ -8,7 +8,7 @@
 
 #include <core/ApiVersion.h>
 
-#include <cuda_runtime_api.h>
+#include "cuda_to_hip.h"
 #include <climits>
 #include <iostream>
 #include <memory>
diff --git a/src/core/utils/GpuError.h b/src/core/utils/GpuError.h
index 97d213b5..c24bc315 100644
--- a/src/core/utils/GpuError.h
+++ b/src/core/utils/GpuError.h
@@ -9,7 +9,7 @@
 #include <iostream>
 #include <sstream>
 #include <exception>
-#include <cuda_runtime_api.h>
+#include "cuda_to_hip.h"
 
 #define DEME_GPU_CALL(res) \
     { gpu_assert((res), __FILE__, __LINE__); }
diff --git a/src/core/utils/GpuManager.h b/src/core/utils/GpuManager.h
index bf9cf611..d5a5d3c3 100644
--- a/src/core/utils/GpuManager.h
+++ b/src/core/utils/GpuManager.h
@@ -1,7 +1,7 @@
 #ifndef DEME_GPU_MANAGER_H
 #define DEME_GPU_MANAGER_H
 
-#include <cuda_runtime_api.h>
+#include "cuda_to_hip.h"
 #include <vector>
 #include <mutex>
 
diff --git a/src/core/utils/JitHelper.cpp b/src/core/utils/JitHelper.cpp
index 9779b8a1..ff0a4706 100644
--- a/src/core/utils/JitHelper.cpp
+++ b/src/core/utils/JitHelper.cpp
@@ -12,21 +12,30 @@
 #include <algorithm>
 #include <cstdlib>
 #include <stdexcept>
-
-#include <cuda_runtime_api.h>
-
-// Compile-time default CUDA architecture fallback.
-// Can be overridden at build time via -DDEME_DEFAULT_CUDA_ARCH_STR="compute_XY".
-// At runtime, the environment variable DEME_DEFAULT_CUDA_ARCH takes precedence.
+#include <cstring>
+
+#include "cuda_to_hip.h"
+
+// Compile-time default architecture fallback.
+// Can be overridden at build time via -DDEME_DEFAULT_CUDA_ARCH_STR="compute_XY" (CUDA)
+// or -DDEME_DEFAULT_HIP_ARCH_STR="gfx90a" (HIP).
+// At runtime, the environment variable DEME_DEFAULT_CUDA_ARCH (CUDA) or
+// DEME_DEFAULT_HIP_ARCH (HIP) takes precedence.
+#if defined(USE_HIP)
+#ifndef DEME_DEFAULT_HIP_ARCH_STR
+    #define DEME_DEFAULT_HIP_ARCH_STR "gfx90a"
+#endif
+#else
 #ifndef DEME_DEFAULT_CUDA_ARCH_STR
     #define DEME_DEFAULT_CUDA_ARCH_STR "compute_75"
 #endif
+#endif
 
 #include <core/ApiVersion.h>
 #include "RuntimeData.h"
 #include "JitHelper.h"
 
-jitify::JitCache* JitHelper::kcache = nullptr;
+deme::jit::ProgramCache* JitHelper::kcache = nullptr;
 
 const std::filesystem::path JitHelper::KERNEL_DIR = DEMERuntimeDataHelper::data_path / "kernel";
 const std::filesystem::path JitHelper::KERNEL_INCLUDE_DIR = DEMERuntimeDataHelper::include_path;
@@ -47,13 +56,12 @@ void JitHelper::Header::substitute(const std::string& symbol, const std::string&
     }
 }
 
-jitify::Program JitHelper::buildProgram(
+deme::jit::Program JitHelper::buildProgram(
     const std::string& name,
     const std::filesystem::path& source,
     std::unordered_map<std::string, std::string> substitutions,
-    // std::vector<JitHelper::Header> headers, // THIS PARAMETER PROBABLY WON'T EVER BE USED
     std::vector<std::string> flags) {
-    // Double ensure include paths for runtime headers + CUDA/CCCL (cuda::std)
+    // Double ensure include paths for runtime headers + CUDA/CCCL (cuda::std) or ROCm
     auto add_inc = [&](const std::filesystem::path& p) {
         if (p.empty())
             return;
@@ -68,15 +76,59 @@ jitify::Program JitHelper::buildProgram(
     // Project/runtime includes
     add_inc(KERNEL_INCLUDE_DIR);
 
+    // Also add the source tree for header includes (cuda_to_hip.h, etc.)
+    // KERNEL_INCLUDE_DIR is the build tree, but some headers live in src/
+    // The source tree is one level up from build, then into src/
+    std::error_code ec;
+    std::filesystem::path src_include = std::filesystem::canonical(KERNEL_INCLUDE_DIR / "..", ec);
+    if (!ec) {
+        add_inc(src_include / "src");
+    }
+
     // Common fallbacks
+#if defined(USE_HIP)
+    // Helper: find clang builtin headers by scanning lib/llvm/lib/clang/<version>/include
+    auto add_clang_builtins = [&](const std::filesystem::path& rocm_root) {
+        std::filesystem::path clang_base = rocm_root / "lib" / "llvm" / "lib" / "clang";
+        std::error_code scan_ec;
+        for (const auto& entry : std::filesystem::directory_iterator(clang_base, scan_ec)) {
+            if (entry.is_directory()) {
+                add_inc(entry.path() / "include");
+                break;  // use the first (and typically only) version dir
+            }
+        }
+    };
+
+    // ROCm include paths for hipRTC
+    if (const char* rocm_path = std::getenv("ROCM_PATH")) {
+        add_inc(std::filesystem::path(rocm_path) / "include");
+        add_inc(std::filesystem::path(rocm_path) / "include" / "hipcub");
+        add_inc(std::filesystem::path(rocm_path) / "include" / "rocprim");
+        // Clang builtin headers (stddef.h, etc.) for hiprtc -- scan for actual version dir
+        add_clang_builtins(std::filesystem::path(rocm_path));
+    }
+    add_inc("/opt/rocm/include");
+    add_inc("/opt/rocm/include/hipcub");
+    add_inc("/opt/rocm/include/rocprim");
+    // Clang builtin headers for hiprtc (needed for stddef.h, stdint.h, etc.)
+    add_clang_builtins(std::filesystem::path("/opt/rocm"));
+#else
     if (const char* cuda_home = std::getenv("CUDA_HOME")) {
         add_inc(std::filesystem::path(cuda_home) / "include");
         add_inc(std::filesystem::path(cuda_home) / "include" / "cccl");
     }
     add_inc("/usr/local/cuda/include");
     add_inc("/usr/local/cuda/include/cccl");
+#endif
 
+#if defined(USE_HIP)
+    // For hiprtc: start with hip_runtime.h to provide HIP types/functions in device code.
+    // The program name is passed separately to hiprtcCreateProgram.
+    std::string code = "#include <hip/hip_runtime.h>\n";
+#else
+    // For CUDA/jitify: the program name on the first line is a jitify requirement.
     std::string code = name + "\n";
+#endif
 
     code.append(JitHelper::loadSourceFile(source));
     // Apply the substitutions
@@ -84,64 +136,53 @@ jitify::Program JitHelper::buildProgram(
         code = std::regex_replace(code, std::regex(subst.first), subst.second);
     }
 
-    std::vector<std::string> header_code;
-    // THIS BLOCK IS ONLY NEEDED IF THE headers PARAMETER IS USED
-    /*
-    for (auto it = headers.begin(); it != headers.end(); it++) {
-        header_code.push_back(it->getSource());
-    }
-    */
-
     if (kcache == nullptr)
-        kcache = new jitify::JitCache();
+        kcache = new deme::jit::ProgramCache();
 
-    // Stage 1: Explicit architecture detection.
-    // Query the active CUDA device compute capability and build an -arch flag.
+    // Detect GPU architecture and add appropriate flags
+    std::string arch_flag;
     {
         int dev = 0;
         cudaDeviceProp prop;
         memset(&prop, 0, sizeof(prop));
-        if (cudaGetDevice(&dev) == cudaSuccess && cudaGetDeviceProperties(&prop, dev) == cudaSuccess &&
-            prop.major > 0 && prop.minor >= 0) {
-            std::string detected_arch = "compute_" + std::to_string(prop.major) + std::to_string(prop.minor);
-            std::vector<std::string> arch_flags = flags;
-            arch_flags.push_back("-arch=" + detected_arch);
-            return kcache->program(code, header_code, arch_flags);
+        if (cudaGetDevice(&dev) == cudaSuccess && cudaGetDeviceProperties(&prop, dev) == cudaSuccess) {
+#if defined(USE_HIP)
+            // HIP: use gcnArchName (e.g., "gfx90a")
+            std::string detected_arch = prop.gcnArchName;
+            // Strip any suffix after the base arch name (e.g., "gfx90a:sramecc+:xnack-" -> "gfx90a")
+            size_t colon_pos = detected_arch.find(':');
+            if (colon_pos != std::string::npos) {
+                detected_arch = detected_arch.substr(0, colon_pos);
+            }
+            arch_flag = "--gpu-architecture=" + detected_arch;
+#else
+            if (prop.major > 0 && prop.minor >= 0) {
+                std::string detected_arch = "compute_" + std::to_string(prop.major) + std::to_string(prop.minor);
+                arch_flag = "-arch=" + detected_arch;
+            }
+#endif
         }
     }
 
-    // Stage 2: Jitify auto-detect.
-    // Device detection failed or returned an invalid compute capability; compile
-    // without an explicit -arch flag and let Jitify detect the architecture.
-    try {
-        return kcache->program(code, header_code, flags);
-    } catch (const std::exception& e) {
-        const std::string err_msg = e.what();
-        // Only fall through to Stage 3 when the failure is architecture-related.
-        const bool is_arch_error =
-            (err_msg.find("arch") != std::string::npos || err_msg.find("compute_") != std::string::npos ||
-             err_msg.find("sm_") != std::string::npos);
-        if (!is_arch_error) {
-            throw;
-        }
-
-        // Stage 3: Hardcoded default fallback.
-        // Use the DEME_DEFAULT_CUDA_ARCH environment variable if set; otherwise fall
-        // back to the compile-time constant DEME_DEFAULT_CUDA_ARCH_STR.
+    // If device detection failed, use fallback
+    if (arch_flag.empty()) {
+#if defined(USE_HIP)
+        const char* env_arch = std::getenv("DEME_DEFAULT_HIP_ARCH");
+        const std::string fallback_arch =
+            (env_arch != nullptr && env_arch[0] != '\0') ? std::string(env_arch) : DEME_DEFAULT_HIP_ARCH_STR;
+        arch_flag = "--gpu-architecture=" + fallback_arch;
+#else
         const char* env_arch = std::getenv("DEME_DEFAULT_CUDA_ARCH");
         const std::string fallback_arch =
             (env_arch != nullptr && env_arch[0] != '\0') ? std::string(env_arch) : DEME_DEFAULT_CUDA_ARCH_STR;
-        std::vector<std::string> fallback_flags = flags;
-        fallback_flags.push_back("-arch=" + fallback_arch);
-        try {
-            return kcache->program(code, header_code, fallback_flags);
-        } catch (const std::exception& e3) {
-            std::string ctx = "Jitify compilation failed with fallback arch '";
-            ctx += fallback_arch;
-            ctx += (env_arch != nullptr && env_arch[0] != '\0') ? "' (from DEME_DEFAULT_CUDA_ARCH env var): "
-                                                                : "' (compile-time default): ";
-            ctx += e3.what();
-            throw std::runtime_error(ctx);
-        }
+        arch_flag = "-arch=" + fallback_arch;
+#endif
     }
+
+    std::vector<std::string> final_flags = flags;
+    final_flags.push_back(arch_flag);
+
+    // Use the unified JitKernel abstraction - no headers needed for our usage
+    std::vector<std::pair<std::string, std::string>> headers;
+    return kcache->program(name, code, headers, final_flags);
 }
diff --git a/src/core/utils/JitHelper.h b/src/core/utils/JitHelper.h
index 638f23cd..c3dee0d8 100644
--- a/src/core/utils/JitHelper.h
+++ b/src/core/utils/JitHelper.h
@@ -7,11 +7,12 @@
 #define DEME_JIT_HELPER_H
 
 #include <filesystem>
+#include <fstream>
 #include <string>
 #include <vector>
 #include <unordered_map>
 
-#include <jitify/jitify.hpp>
+#include "JitKernel.h"
 
 #if defined(_WIN32) || defined(_WIN64)
     #undef max
@@ -31,24 +32,17 @@ class JitHelper {
         std::string _source;
     };
 
-    static jitify::Program buildProgram(
+    static deme::jit::Program buildProgram(
         const std::string& name,
         const std::filesystem::path& source,
         std::unordered_map<std::string, std::string> substitutions = std::unordered_map<std::string, std::string>(),
         std::vector<std::string> flags = std::vector<std::string>());
 
-    //// I'm pretty sure C++17 auto-converts this
-    // static jitify::Program buildProgram(
-    // 	const std::string& name, const std::string& code,
-    // 	std::vector<Header> headers = 0,
-    // 	std::vector<std::string> flags = 0
-    // );
-
     static const std::filesystem::path KERNEL_DIR;
     static const std::filesystem::path KERNEL_INCLUDE_DIR;
 
   private:
-    static jitify::JitCache* kcache;
+    static deme::jit::ProgramCache* kcache;
 
     inline static std::string loadSourceFile(const std::filesystem::path& sourcefile) {
         std::string code;
diff --git a/src/core/utils/JitKernel.h b/src/core/utils/JitKernel.h
new file mode 100644
index 00000000..39e99ae2
--- /dev/null
+++ b/src/core/utils/JitKernel.h
@@ -0,0 +1,189 @@
+//  Copyright (c) 2021, SBEL GPU Development Team
+//  Copyright (c) 2021, University of Wisconsin - Madison
+//  Copyright (c) 2026, Advanced Micro Devices, Inc.
+//
+//  SPDX-License-Identifier: BSD-3-Clause
+
+// Unified runtime kernel compilation API for DEM-Engine.
+// Abstracts over jitify v1 (CUDA/NVRTC) and hiprtc (ROCm), enabling both
+// platforms to use the same fluent kernel launch syntax.
+
+#ifndef DEME_JIT_KERNEL_H
+#define DEME_JIT_KERNEL_H
+
+#include <string>
+#include <vector>
+#include <memory>
+#include <cstddef>
+#include <utility>
+
+#ifdef USE_HIP
+#include <hip/hip_runtime.h>
+using gpuStream_t = hipStream_t;
+#else
+#include <cuda_runtime.h>
+using gpuStream_t = cudaStream_t;
+#endif
+
+namespace deme {
+namespace jit {
+
+class Kernel;
+class Program;
+
+// Fluent kernel launch builder - matches existing jitify v1 usage pattern.
+// Usage: program->kernel("name")().instantiate().configure(...).launch(args...)
+class KernelLauncher {
+  public:
+    // For kernels without template parameters
+    KernelLauncher& instantiate();
+
+    // For kernels with template parameters.
+    // type_names: comma-separated type names as they appear in the kernel template.
+    // On hiprtc, these get baked into the mangled kernel name.
+    KernelLauncher& instantiate(const std::string& type_names);
+
+    // Configure launch parameters
+    KernelLauncher& configure(dim3 grid, dim3 block, size_t shared_mem = 0, gpuStream_t stream = 0);
+
+    // Launch with arguments - variadic template forwards to backend
+    // The implementation is in the .inl file for HIP, or calls jitify for CUDA
+    template <typename... Args>
+    void launch(Args... args);
+
+    // Non-templated launch for backend implementations
+    void launchRaw(void** kernel_args, size_t num_args);
+
+    // Kernel can create KernelLaunchers
+    explicit KernelLauncher(Kernel* k);
+
+  private:
+    friend class Kernel;
+
+    Kernel* kernel_;
+    dim3 grid_{1, 1, 1};
+    dim3 block_{1, 1, 1};
+    size_t shared_mem_ = 0;
+    gpuStream_t stream_ = 0;
+    std::string instantiation_;
+    bool instantiated_ = false;
+};
+
+// Handle to a kernel function within a compiled program.
+// Created by Program::kernel(name).
+class Kernel {
+  public:
+    ~Kernel();
+    Kernel(Kernel&&) noexcept;
+    Kernel& operator=(Kernel&&) noexcept;
+
+    // Deleted copy operations - kernel handles are move-only
+    Kernel(const Kernel&) = delete;
+    Kernel& operator=(const Kernel&) = delete;
+
+    // Start a launch chain. Returns a KernelLauncher for fluent chaining.
+    KernelLauncher operator()();
+
+    // Direct instantiate() method to match jitify v1 API.
+    // Usage: program->kernel("name").instantiate().configure(...).launch(...)
+    KernelLauncher& instantiate();
+    KernelLauncher& instantiate(const std::string& type_names);
+
+    // Get kernel name (for debugging/error messages)
+    const std::string& name() const;
+
+  private:
+    friend class Program;
+    friend class KernelLauncher;
+    Kernel();
+
+    struct Impl;
+    std::unique_ptr<Impl> impl_;
+    std::unique_ptr<KernelLauncher> launcher_;  // For instantiate() return reference
+};
+
+// A compiled GPU program containing one or more kernels.
+// Created by ProgramCache::program().
+class Program {
+  public:
+    ~Program();
+    Program(Program&&) noexcept;
+    Program& operator=(Program&&) noexcept;
+
+    // Deleted copy operations - programs are move-only
+    Program(const Program&) = delete;
+    Program& operator=(const Program&) = delete;
+
+    // Get a kernel by name. The kernel can then be launched via:
+    // program->kernel("name")().instantiate().configure(...).launch(...)
+    Kernel kernel(const std::string& name);
+
+    // Check if program is valid (compiled successfully)
+    explicit operator bool() const;
+
+  private:
+    friend class ProgramCache;
+    Program();
+
+    struct Impl;
+    std::unique_ptr<Impl> impl_;
+};
+
+// Caches compiled programs to avoid recompilation.
+// Thread-safe: multiple threads can call program() concurrently.
+class ProgramCache {
+  public:
+    ProgramCache();
+    ~ProgramCache();
+
+    // Compile a program.
+    // - name: identifier for the program (typically the kernel file name)
+    // - source: kernel source code (prepended with name for jitify compatibility)
+    // - headers: additional header code as (name, source) pairs
+    // - flags: compiler flags (-I, -D, -arch, etc.)
+    //
+    // Returns a new Program by value. Caching is not performed at this level;
+    // callers typically cache via shared_ptr<Program> at the application level.
+    Program program(const std::string& name,
+                    const std::string& source,
+                    const std::vector<std::pair<std::string, std::string>>& headers = {},
+                    const std::vector<std::string>& flags = {});
+
+    // Clear all cached programs. Invalidates all Program references.
+    void clear();
+
+  private:
+    struct Impl;
+    std::unique_ptr<Impl> impl_;
+};
+
+// Helper to build compiler flags in a platform-agnostic way.
+class CompilerFlags {
+  public:
+    // Add an include path (-I)
+    CompilerFlags& include(const std::string& path);
+
+    // Add a macro definition (-D)
+    CompilerFlags& define(const std::string& macro, const std::string& value = "");
+
+    // Set target architecture (compute_75 for CUDA, gfx90a for HIP)
+    // The flag format is handled automatically per platform.
+    CompilerFlags& arch(const std::string& arch);
+
+    // Add a raw flag (passed through unchanged)
+    CompilerFlags& flag(const std::string& raw_flag);
+
+    // Build the final flags vector
+    std::vector<std::string> build() const;
+
+  private:
+    std::vector<std::string> flags_;
+};
+
+}  // namespace jit
+}  // namespace deme
+
+// Include the template implementation (must be visible at all call sites)
+#include "JitKernel.inl"
+
+#endif  // DEME_JIT_KERNEL_H
diff --git a/src/core/utils/JitKernel.inl b/src/core/utils/JitKernel.inl
new file mode 100644
index 00000000..6984dd94
--- /dev/null
+++ b/src/core/utils/JitKernel.inl
@@ -0,0 +1,64 @@
+//  Copyright (c) 2021, SBEL GPU Development Team
+//  Copyright (c) 2021, University of Wisconsin - Madison
+//  Copyright (c) 2026, Advanced Micro Devices, Inc.
+//
+//  SPDX-License-Identifier: BSD-3-Clause
+
+// Template implementation for JitKernel.h
+// This file is included at the end of JitKernel.h
+
+#ifndef DEME_JIT_KERNEL_INL
+#define DEME_JIT_KERNEL_INL
+
+#include <cstring>
+#include <vector>
+#include <memory>
+
+namespace deme {
+namespace jit {
+
+namespace detail {
+
+// Helper to pack a single argument by copying to a buffer
+template <typename T>
+inline void packArg(std::vector<void*>& args, std::vector<std::unique_ptr<char[]>>& storage, T arg) {
+    auto buf = std::make_unique<char[]>(sizeof(T));
+    std::memcpy(buf.get(), &arg, sizeof(T));
+    args.push_back(buf.get());
+    storage.push_back(std::move(buf));
+}
+
+// Recursive pack for variadic args
+template <typename T, typename... Rest>
+inline void packArgs(std::vector<void*>& args, std::vector<std::unique_ptr<char[]>>& storage, T first, Rest... rest) {
+    packArg(args, storage, first);
+    if constexpr (sizeof...(rest) > 0) {
+        packArgs(args, storage, rest...);
+    }
+}
+
+// Base case: no args
+inline void packArgs(std::vector<void*>&, std::vector<std::unique_ptr<char[]>>&) {}
+
+}  // namespace detail
+
+template <typename... Args>
+void KernelLauncher::launch(Args... args) {
+    // Pack each argument into a heap buffer and collect pointers to them, then
+    // forward to the backend's non-template raw launcher (hipModuleLaunchKernel
+    // on ROCm, jitify's cuLaunchKernel wrapper on CUDA). Both runtimes take the
+    // same "array of pointers to argument values" convention.
+    std::vector<void*> kernel_args;
+    std::vector<std::unique_ptr<char[]>> arg_storage;
+
+    if constexpr (sizeof...(args) > 0) {
+        detail::packArgs(kernel_args, arg_storage, args...);
+    }
+
+    launchRaw(kernel_args.empty() ? nullptr : kernel_args.data(), kernel_args.size());
+}
+
+}  // namespace jit
+}  // namespace deme
+
+#endif  // DEME_JIT_KERNEL_INL
diff --git a/src/core/utils/JitKernel_cuda.cpp b/src/core/utils/JitKernel_cuda.cpp
new file mode 100644
index 00000000..c55126e7
--- /dev/null
+++ b/src/core/utils/JitKernel_cuda.cpp
@@ -0,0 +1,205 @@
+//  Copyright (c) 2021, SBEL GPU Development Team
+//  Copyright (c) 2021, University of Wisconsin - Madison
+//  Copyright (c) 2026, Advanced Micro Devices, Inc.
+//
+//  SPDX-License-Identifier: BSD-3-Clause
+
+// CUDA backend for JitKernel using jitify v1 (NVRTC)
+
+#ifndef USE_HIP
+
+#include "JitKernel.h"
+
+// Jitify is in the build tree at ${CMAKE_BINARY_DIR}/src/jitify/
+// Include path is added by CMake target_include_directories
+#include <jitify/jitify.hpp>
+#include <unordered_map>
+#include <stdexcept>
+#include <mutex>
+
+namespace deme {
+namespace jit {
+
+// ============================================================================
+// Kernel implementation
+// ============================================================================
+
+struct Kernel::Impl {
+    std::string name;
+    jitify::Program* program = nullptr;  // Non-owning reference to parent program
+    jitify::KernelInstantiation instantiation;
+    bool has_instantiation = false;
+};
+
+Kernel::Kernel() : impl_(std::make_unique<Impl>()) {}
+Kernel::~Kernel() = default;
+Kernel::Kernel(Kernel&&) noexcept = default;
+Kernel& Kernel::operator=(Kernel&&) noexcept = default;
+
+KernelLauncher Kernel::operator()() {
+    return KernelLauncher(this);
+}
+
+KernelLauncher& Kernel::instantiate() {
+    launcher_ = std::make_unique<KernelLauncher>(this);
+    return launcher_->instantiate();
+}
+
+KernelLauncher& Kernel::instantiate(const std::string& type_names) {
+    launcher_ = std::make_unique<KernelLauncher>(this);
+    return launcher_->instantiate(type_names);
+}
+
+const std::string& Kernel::name() const {
+    return impl_->name;
+}
+
+// ============================================================================
+// KernelLauncher implementation
+// ============================================================================
+
+KernelLauncher::KernelLauncher(Kernel* k) : kernel_(k) {}
+
+KernelLauncher& KernelLauncher::instantiate() {
+    kernel_->impl_->instantiation = kernel_->impl_->program->kernel(kernel_->impl_->name).instantiate();
+    kernel_->impl_->has_instantiation = true;
+    instantiated_ = true;
+    return *this;
+}
+
+KernelLauncher& KernelLauncher::instantiate(const std::string& type_names) {
+    // Parse comma-separated type names and instantiate with them.
+    // Jitify uses variadic templates for instantiation, but we can pass
+    // the type string directly to the kernel() call via name mangling.
+    // For CUDA/jitify, we use the instantiate(type_name) overload.
+    kernel_->impl_->instantiation = kernel_->impl_->program->kernel(kernel_->impl_->name).instantiate({type_names});
+    kernel_->impl_->has_instantiation = true;
+    instantiated_ = true;
+    instantiation_ = type_names;
+    return *this;
+}
+
+KernelLauncher& KernelLauncher::configure(dim3 grid, dim3 block, size_t shared_mem, gpuStream_t stream) {
+    grid_ = grid;
+    block_ = block;
+    shared_mem_ = shared_mem;
+    stream_ = stream;
+    return *this;
+}
+
+// ============================================================================
+// Program implementation
+// ============================================================================
+
+struct Program::Impl {
+    jitify::Program jit_program;
+};
+
+Program::Program() : impl_(std::make_unique<Impl>()) {}
+Program::~Program() = default;
+Program::Program(Program&&) noexcept = default;
+Program& Program::operator=(Program&&) noexcept = default;
+
+Program::operator bool() const {
+    return impl_ != nullptr;
+}
+
+Kernel Program::kernel(const std::string& name) {
+    Kernel k;
+    k.impl_->name = name;
+    k.impl_->program = &impl_->jit_program;
+    return k;
+}
+
+// ============================================================================
+// ProgramCache implementation
+// ============================================================================
+
+struct ProgramCache::Impl {
+    jitify::JitCache cache;
+};
+
+ProgramCache::ProgramCache() : impl_(std::make_unique<Impl>()) {}
+ProgramCache::~ProgramCache() = default;
+
+Program ProgramCache::program(const std::string& name,
+                              const std::string& source,
+                              const std::vector<std::pair<std::string, std::string>>& headers,
+                              const std::vector<std::string>& flags) {
+    // jitify::JitCache handles caching internally, so we just wrap the result
+    (void)name;  // Name is embedded in source for jitify
+
+    // Extract header sources (jitify v1 takes just the source strings)
+    std::vector<std::string> header_sources;
+    for (const auto& h : headers) {
+        header_sources.push_back(h.second);
+    }
+
+    Program prog;
+    prog.impl_->jit_program = impl_->cache.program(source, header_sources, flags);
+    return prog;
+}
+
+void ProgramCache::clear() {
+    // No-op: jitify::JitCache doesn't expose a clear method
+}
+
+// ============================================================================
+// CompilerFlags implementation
+// ============================================================================
+
+CompilerFlags& CompilerFlags::include(const std::string& path) {
+    flags_.push_back("-I" + path);
+    return *this;
+}
+
+CompilerFlags& CompilerFlags::define(const std::string& macro, const std::string& value) {
+    if (value.empty()) {
+        flags_.push_back("-D" + macro);
+    } else {
+        flags_.push_back("-D" + macro + "=" + value);
+    }
+    return *this;
+}
+
+CompilerFlags& CompilerFlags::arch(const std::string& arch) {
+    flags_.push_back("-arch=" + arch);
+    return *this;
+}
+
+CompilerFlags& CompilerFlags::flag(const std::string& raw_flag) {
+    flags_.push_back(raw_flag);
+    return *this;
+}
+
+std::vector<std::string> CompilerFlags::build() const {
+    return flags_;
+}
+
+// ============================================================================
+// KernelLauncher::launchRaw() - launch via jitify's cuLaunchKernel wrapper
+// ============================================================================
+
+// The templated launch() (in JitKernel.inl) packs each argument into a buffer
+// and hands the resulting array of pointers here, identically to the HIP
+// backend. jitify's launch takes the same "vector of pointers to argument
+// values" convention, so we forward straight to it.
+void KernelLauncher::launchRaw(void** kernel_args, size_t num_args) {
+    if (!kernel_->impl_->has_instantiation) {
+        throw std::runtime_error("Kernel '" + kernel_->impl_->name + "' not instantiated before launch");
+    }
+
+    // Skip launch if any grid dimension is zero (no work); keeps parity with the
+    // HIP backend, where a zero grid is an error rather than a silent no-op.
+    if (grid_.x == 0 || grid_.y == 0 || grid_.z == 0) {
+        return;
+    }
+
+    std::vector<void*> arg_ptrs(kernel_args, kernel_args + num_args);
+    kernel_->impl_->instantiation.configure(grid_, block_, shared_mem_, stream_).safe_launch(arg_ptrs);
+}
+
+}  // namespace jit
+}  // namespace deme
+
+#endif  // !USE_HIP
diff --git a/src/core/utils/JitKernel_hip.cpp b/src/core/utils/JitKernel_hip.cpp
new file mode 100644
index 00000000..3dba0cb3
--- /dev/null
+++ b/src/core/utils/JitKernel_hip.cpp
@@ -0,0 +1,406 @@
+//  Copyright (c) 2021, SBEL GPU Development Team
+//  Copyright (c) 2021, University of Wisconsin - Madison
+//  Copyright (c) 2026, Advanced Micro Devices, Inc.
+//
+//  SPDX-License-Identifier: BSD-3-Clause
+
+// HIP backend for JitKernel using hiprtc.
+//
+// Key design: hiprtc requires all name expressions (kernel names to resolve)
+// be registered BEFORE compilation via hiprtcAddNameExpression(). Since the
+// DEM-Engine API calls buildProgram() first, then kernel().instantiate() later,
+// we use lazy compilation: defer hiprtcCompileProgram until the first kernel
+// launch, by which point all kernel+instantiation requests have been collected.
+
+#ifdef USE_HIP
+
+#include "JitKernel.h"
+#include <hip/hiprtc.h>
+#include <hip/hip_runtime.h>
+#include <unordered_map>
+#include <unordered_set>
+#include <stdexcept>
+#include <mutex>
+#include <vector>
+#include <cstring>
+#include <algorithm>
+
+namespace deme {
+namespace jit {
+
+namespace {
+
+void checkHiprtc(hiprtcResult result, const char* msg) {
+    if (result != HIPRTC_SUCCESS) {
+        throw std::runtime_error(std::string(msg) + ": " + hiprtcGetErrorString(result));
+    }
+}
+
+void checkHip(hipError_t result, const char* msg) {
+    if (result != hipSuccess) {
+        throw std::runtime_error(std::string(msg) + ": " + hipGetErrorString(result));
+    }
+}
+
+// Build the name expression string for hiprtcAddNameExpression.
+// For a templated kernel like "modifyComponents" instantiated with "deme::DEMDataKT",
+// the expression is "modifyComponents<deme::DEMDataKT>".
+std::string buildNameExpression(const std::string& name, const std::string& instantiation) {
+    if (instantiation.empty()) {
+        return name;
+    }
+    return name + "<" + instantiation + ">";
+}
+
+}  // namespace
+
+// Forward declaration of ProgramImpl so Kernel::Impl can reference it
+struct ProgramImpl {
+    // Source and compilation options (stored for lazy/deferred compilation)
+    std::string name;
+    std::string source;
+    std::vector<std::pair<std::string, std::string>> headers;
+    std::vector<std::string> flags;
+
+    // Kernel name expressions that have been requested
+    // Key: name expression (e.g., "modifyComponents<deme::DEMDataKT>")
+    // Value: lowered/mangled name after compilation
+    std::unordered_map<std::string, std::string> loweredNames;
+
+    // Name expressions pending addition (requested but not yet compiled)
+    std::unordered_set<std::string> pendingNameExpressions;
+
+    // Compiled module (nullptr until first compilation)
+    hipModule_t module = nullptr;
+
+    // Compiled code buffer (kept for potential recompilation)
+    std::vector<char> codeBuffer;
+
+    // Whether we need to recompile (new name expressions added since last compile)
+    bool needsRecompile = true;
+
+    std::mutex mutex;
+
+    ~ProgramImpl() {
+        if (module) {
+            (void)hipModuleUnload(module);
+        }
+    }
+
+    // Compile or recompile with all registered name expressions
+    void ensureCompiled() {
+        std::lock_guard<std::mutex> lock(mutex);
+
+        if (!needsRecompile && module != nullptr) {
+            return;  // Already compiled with current name expressions
+        }
+
+        // Unload previous module if any
+        if (module) {
+            (void)hipModuleUnload(module);
+            module = nullptr;
+        }
+
+        // Create hiprtc program
+        hiprtcProgram prog;
+        std::vector<const char*> header_names;
+        std::vector<const char*> header_sources;
+        for (const auto& h : headers) {
+            header_names.push_back(h.first.c_str());
+            header_sources.push_back(h.second.c_str());
+        }
+
+        checkHiprtc(hiprtcCreateProgram(&prog, source.c_str(), name.c_str(),
+                                        static_cast<int>(header_sources.size()),
+                                        header_sources.data(), header_names.data()),
+                    "hiprtcCreateProgram");
+
+        // Register all name expressions (pending ones plus any previously resolved)
+        std::vector<std::string> allNameExpressions;
+        for (const auto& pending : pendingNameExpressions) {
+            allNameExpressions.push_back(pending);
+        }
+        for (const auto& resolved : loweredNames) {
+            // Re-register previously resolved names (needed for recompilation)
+            if (pendingNameExpressions.find(resolved.first) == pendingNameExpressions.end()) {
+                allNameExpressions.push_back(resolved.first);
+            }
+        }
+
+        for (const auto& expr : allNameExpressions) {
+            hiprtcResult res = hiprtcAddNameExpression(prog, expr.c_str());
+            if (res != HIPRTC_SUCCESS) {
+                hiprtcDestroyProgram(&prog);
+                throw std::runtime_error("hiprtcAddNameExpression failed for '" + expr +
+                                         "': " + hiprtcGetErrorString(res));
+            }
+        }
+
+        // Convert flags to const char* array
+        std::vector<const char*> options;
+        for (const auto& f : flags) {
+            options.push_back(f.c_str());
+        }
+
+        // Compile
+        hiprtcResult compileResult = hiprtcCompileProgram(prog, static_cast<int>(options.size()), options.data());
+
+        if (compileResult != HIPRTC_SUCCESS) {
+            size_t logSize;
+            hiprtcGetProgramLogSize(prog, &logSize);
+            std::string log(logSize, '\0');
+            hiprtcGetProgramLog(prog, log.data());
+            hiprtcDestroyProgram(&prog);
+            throw std::runtime_error("hiprtc compilation failed for '" + name + "':\n" + log);
+        }
+
+        // Get lowered names for all expressions
+        for (const auto& expr : allNameExpressions) {
+            const char* lowered = nullptr;
+            hiprtcResult res = hiprtcGetLoweredName(prog, expr.c_str(), &lowered);
+            if (res != HIPRTC_SUCCESS) {
+                hiprtcDestroyProgram(&prog);
+                throw std::runtime_error("hiprtcGetLoweredName failed for '" + expr +
+                                         "': " + hiprtcGetErrorString(res));
+            }
+            loweredNames[expr] = std::string(lowered);
+        }
+
+        // Clear pending set - all are now resolved
+        pendingNameExpressions.clear();
+
+        // Get compiled code
+        size_t codeSize;
+        checkHiprtc(hiprtcGetCodeSize(prog, &codeSize), "hiprtcGetCodeSize");
+        codeBuffer.resize(codeSize);
+        checkHiprtc(hiprtcGetCode(prog, codeBuffer.data()), "hiprtcGetCode");
+
+        hiprtcDestroyProgram(&prog);
+
+        // Load module
+        checkHip(hipModuleLoadData(&module, codeBuffer.data()), "hipModuleLoadData");
+
+        needsRecompile = false;
+    }
+
+    // Register a name expression (kernel + optional instantiation)
+    // Returns the lowered name, triggering compilation if needed
+    std::string getLoweredName(const std::string& nameExpr) {
+        {
+            std::lock_guard<std::mutex> lock(mutex);
+
+            // Check if already resolved
+            auto it = loweredNames.find(nameExpr);
+            if (it != loweredNames.end() && !needsRecompile) {
+                return it->second;
+            }
+
+            // Add to pending if not already there
+            if (loweredNames.find(nameExpr) == loweredNames.end()) {
+                pendingNameExpressions.insert(nameExpr);
+                needsRecompile = true;
+            }
+        }
+
+        // Compile (releases and re-acquires lock internally)
+        ensureCompiled();
+
+        // Now look up the lowered name
+        std::lock_guard<std::mutex> lock(mutex);
+        auto it = loweredNames.find(nameExpr);
+        if (it == loweredNames.end()) {
+            throw std::runtime_error("Internal error: lowered name not found for '" + nameExpr + "'");
+        }
+        return it->second;
+    }
+};
+
+// ============================================================================
+// Kernel implementation
+// ============================================================================
+
+struct Kernel::Impl {
+    std::string name;
+    std::string instantiation;
+    hipFunction_t func = nullptr;
+    ProgramImpl* program = nullptr;  // Non-owning reference to parent program
+};
+
+Kernel::Kernel() : impl_(std::make_unique<Impl>()) {}
+Kernel::~Kernel() = default;
+Kernel::Kernel(Kernel&&) noexcept = default;
+Kernel& Kernel::operator=(Kernel&&) noexcept = default;
+
+KernelLauncher Kernel::operator()() {
+    return KernelLauncher(this);
+}
+
+KernelLauncher& Kernel::instantiate() {
+    launcher_ = std::make_unique<KernelLauncher>(this);
+    return launcher_->instantiate();
+}
+
+KernelLauncher& Kernel::instantiate(const std::string& type_names) {
+    launcher_ = std::make_unique<KernelLauncher>(this);
+    return launcher_->instantiate(type_names);
+}
+
+const std::string& Kernel::name() const {
+    return impl_->name;
+}
+
+// ============================================================================
+// KernelLauncher implementation
+// ============================================================================
+
+KernelLauncher::KernelLauncher(Kernel* k) : kernel_(k) {}
+
+KernelLauncher& KernelLauncher::instantiate() {
+    return instantiate("");
+}
+
+KernelLauncher& KernelLauncher::instantiate(const std::string& type_names) {
+    kernel_->impl_->instantiation = type_names;
+    instantiation_ = type_names;
+
+    if (kernel_->impl_->program) {
+        // Build the name expression and register it
+        std::string nameExpr = buildNameExpression(kernel_->impl_->name, type_names);
+
+        // Get the lowered name (this triggers compilation if needed)
+        std::string loweredName = kernel_->impl_->program->getLoweredName(nameExpr);
+
+        // Ensure module is compiled
+        kernel_->impl_->program->ensureCompiled();
+
+        // Now get the function handle
+        hipError_t err = hipModuleGetFunction(&kernel_->impl_->func,
+                                               kernel_->impl_->program->module,
+                                               loweredName.c_str());
+        if (err != hipSuccess) {
+            throw std::runtime_error("hipModuleGetFunction failed for '" + nameExpr +
+                                     "' (lowered: " + loweredName + "): " + hipGetErrorString(err));
+        }
+    }
+    instantiated_ = true;
+    return *this;
+}
+
+KernelLauncher& KernelLauncher::configure(dim3 grid, dim3 block, size_t shared_mem, gpuStream_t stream) {
+    grid_ = grid;
+    block_ = block;
+    shared_mem_ = shared_mem;
+    stream_ = stream;
+    return *this;
+}
+
+// ============================================================================
+// Program implementation - wraps ProgramImpl
+// ============================================================================
+
+struct Program::Impl : public ProgramImpl {};
+
+Program::Program() : impl_(std::make_unique<Impl>()) {}
+Program::~Program() = default;
+Program::Program(Program&&) noexcept = default;
+Program& Program::operator=(Program&&) noexcept = default;
+
+Program::operator bool() const {
+    return impl_ != nullptr;
+}
+
+Kernel Program::kernel(const std::string& name) {
+    Kernel k;
+    k.impl_->name = name;
+    k.impl_->program = impl_.get();
+    return k;
+}
+
+// ============================================================================
+// ProgramCache implementation
+// ============================================================================
+
+struct ProgramCache::Impl {
+    // No caching at this level; callers cache via shared_ptr<Program>
+};
+
+ProgramCache::ProgramCache() : impl_(std::make_unique<Impl>()) {}
+ProgramCache::~ProgramCache() = default;
+
+Program ProgramCache::program(const std::string& name,
+                              const std::string& source,
+                              const std::vector<std::pair<std::string, std::string>>& headers,
+                              const std::vector<std::string>& flags) {
+    // Create a new Program with deferred compilation (lazy compilation on first kernel use)
+    Program result;
+    result.impl_->name = name;
+    result.impl_->source = source;
+    result.impl_->headers = headers;
+    result.impl_->flags = flags;
+    result.impl_->needsRecompile = true;
+    return result;
+}
+
+void ProgramCache::clear() {
+    // No-op: no caching at this level
+}
+
+// ============================================================================
+// CompilerFlags implementation
+// ============================================================================
+
+CompilerFlags& CompilerFlags::include(const std::string& path) {
+    flags_.push_back("-I" + path);
+    return *this;
+}
+
+CompilerFlags& CompilerFlags::define(const std::string& macro, const std::string& value) {
+    if (value.empty()) {
+        flags_.push_back("-D" + macro);
+    } else {
+        flags_.push_back("-D" + macro + "=" + value);
+    }
+    return *this;
+}
+
+CompilerFlags& CompilerFlags::arch(const std::string& arch) {
+    flags_.push_back("--gpu-architecture=" + arch);
+    return *this;
+}
+
+CompilerFlags& CompilerFlags::flag(const std::string& raw_flag) {
+    flags_.push_back(raw_flag);
+    return *this;
+}
+
+std::vector<std::string> CompilerFlags::build() const {
+    return flags_;
+}
+
+// ============================================================================
+// KernelLauncher::launchRaw() - non-templated launch for HIP
+// ============================================================================
+
+void KernelLauncher::launchRaw(void** kernel_args, size_t num_args) {
+    if (!kernel_->impl_->func) {
+        throw std::runtime_error("Kernel function '" + kernel_->impl_->name + "' not resolved");
+    }
+
+    // Skip launch if grid has zero dimension (no work to do).
+    // CUDA may silently accept grid=0, but HIP returns hipErrorInvalidValue.
+    if (grid_.x == 0 || grid_.y == 0 || grid_.z == 0) {
+        return;
+    }
+
+    hipError_t err =
+        hipModuleLaunchKernel(kernel_->impl_->func, grid_.x, grid_.y, grid_.z, block_.x, block_.y, block_.z,
+                              shared_mem_, stream_, kernel_args, nullptr);
+
+    checkHip(err, ("hipModuleLaunchKernel failed for '" + kernel_->impl_->name + "'").c_str());
+    (void)num_args;
+}
+
+}  // namespace jit
+}  // namespace deme
+
+#endif  // USE_HIP
diff --git a/src/core/utils/ManagedMemory.hpp b/src/core/utils/ManagedMemory.hpp
index b219dcc4..71bed2c7 100644
--- a/src/core/utils/ManagedMemory.hpp
+++ b/src/core/utils/ManagedMemory.hpp
@@ -13,7 +13,7 @@
 // Convenience functions to help with Managed Memory (allocated using ManagedAllocator, typically)
 namespace deme {
 
-#if CUDART_VERSION >= 13000
+#if !defined(USE_HIP) && CUDART_VERSION >= 13000
 inline cudaMemLocation make_device_location(int device) {
     cudaMemLocation loc{};
     loc.type = cudaMemLocationTypeDevice;
@@ -25,7 +25,10 @@ inline cudaMemLocation make_device_location(int device) {
 // Underlying implementation
 template <class T>
 inline void __migrate_impl(T* data, std::size_t size, int device, cudaStream_t stream = 0) {
-#if CUDART_VERSION >= 13000
+#if defined(USE_HIP)
+    // HIP uses the 4-arg form: hipMemPrefetchAsync(const void*, size_t, int dstDevice, hipStream_t)
+    cudaMemPrefetchAsync(static_cast<void*>(data), size * sizeof(T), device, stream);
+#elif CUDART_VERSION >= 13000
     // CUDA 13.0+: cudaMemPrefetchAsync(const void*, size_t, cudaMemLocation, unsigned int flags, cudaStream_t)
     auto loc = make_device_location(device);
     unsigned int flags = 0;
@@ -74,18 +77,30 @@ void migrate(const std::vector<T>& data, int device, cudaStream_t stream = 0) {
 
 // Aliases for cudaMemoryAdvise constants
 enum class ManagedAdvice {
+#if defined(USE_HIP)
+    READ_MOSTLY = hipMemAdviseSetReadMostly,
+    PREFERRED_LOC = hipMemAdviseSetPreferredLocation,
+    ACCESSED_BY = hipMemAdviseSetAccessedBy,
+    UNSET_READ_MOSTLY = hipMemAdviseUnsetReadMostly,
+    UNSET_PREFERRED_LOC = hipMemAdviseUnsetPreferredLocation,
+    UNSET_ACCESSED_BY = hipMemAdviseUnsetAccessedBy
+#else
     READ_MOSTLY = cudaMemAdviseSetReadMostly,
     PREFERRED_LOC = cudaMemAdviseSetPreferredLocation,
     ACCESSED_BY = cudaMemAdviseSetAccessedBy,
     UNSET_READ_MOSTLY = cudaMemAdviseUnsetReadMostly,
     UNSET_PREFERRED_LOC = cudaMemAdviseUnsetPreferredLocation,
     UNSET_ACCESSED_BY = cudaMemAdviseUnsetAccessedBy
+#endif
 };
 
 // Underlying implementation
 template <class T>
 void __advise_impl(const T* data, std::size_t size, ManagedAdvice advice, int device) {
-#if CUDART_VERSION >= 13000
+#if defined(USE_HIP)
+    // HIP uses the 4-arg form: hipMemAdvise(const void*, size_t, hipMemoryAdvise, int)
+    hipMemAdvise(static_cast<const void*>(data), size * sizeof(T), static_cast<hipMemoryAdvise>(advice), device);
+#elif CUDART_VERSION >= 13000
     cudaMemLocation loc{};
     loc.type = cudaMemLocationTypeDevice;
     loc.id = device;
diff --git a/src/core/utils/cuda_to_hip.h b/src/core/utils/cuda_to_hip.h
new file mode 100644
index 00000000..8c3c314c
--- /dev/null
+++ b/src/core/utils/cuda_to_hip.h
@@ -0,0 +1,123 @@
+//  Copyright (c) 2021, SBEL GPU Development Team
+//  Copyright (c) 2021, University of Wisconsin - Madison
+//  Copyright (c) 2026, Advanced Micro Devices, Inc.
+//
+//  SPDX-License-Identifier: BSD-3-Clause
+
+// CUDA-to-HIP compatibility header for DEM-Engine.
+// On ROCm this aliases CUDA runtime API symbols to their HIP equivalents.
+// On CUDA this is a passthrough to the standard CUDA runtime.
+
+#ifndef DEME_CUDA_TO_HIP_H
+#define DEME_CUDA_TO_HIP_H
+
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+
+#include <hip/hip_runtime.h>
+
+// Runtime API types
+#define cudaStream_t hipStream_t
+#define cudaEvent_t hipEvent_t
+#define cudaError_t hipError_t
+#define cudaDeviceProp hipDeviceProp_t
+
+// Error codes
+#define cudaSuccess hipSuccess
+#define cudaErrorInvalidValue hipErrorInvalidValue
+#define cudaErrorMemoryAllocation hipErrorMemoryAllocation
+#define cudaErrorNotSupported hipErrorNotSupported
+
+// Memory management
+#define cudaMalloc hipMalloc
+#define cudaFree hipFree
+#define cudaMallocManaged hipMallocManaged
+#define cudaMallocHost hipHostMalloc
+#define cudaHostAlloc hipHostMalloc
+#define cudaFreeHost hipHostFree
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemset hipMemset
+#define cudaMemsetAsync hipMemsetAsync
+#define cudaMemPrefetchAsync hipMemPrefetchAsync
+
+// Memory copy kinds
+#define cudaMemcpyHostToHost hipMemcpyHostToHost
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemcpyDefault hipMemcpyDefault
+
+// Memory attach flags
+#define cudaMemAttachGlobal hipMemAttachGlobal
+
+// Host alloc flags
+#define cudaHostAllocDefault hipHostMallocDefault
+
+// Stream management
+#define cudaStreamCreate hipStreamCreate
+#define cudaStreamDestroy hipStreamDestroy
+#define cudaStreamSynchronize hipStreamSynchronize
+
+// Device management
+#define cudaGetDevice hipGetDevice
+#define cudaSetDevice hipSetDevice
+#define cudaGetDeviceCount hipGetDeviceCount
+#define cudaGetDeviceProperties hipGetDeviceProperties
+#define cudaDeviceSynchronize hipDeviceSynchronize
+#define cudaDeviceReset hipDeviceReset
+
+// Error handling
+#define cudaGetLastError hipGetLastError
+#define cudaGetErrorString hipGetErrorString
+#define cudaPeekAtLastError hipPeekAtLastError
+
+// Pointer attributes
+#define cudaPointerAttributes hipPointerAttribute_t
+#define cudaPointerGetAttributes hipPointerGetAttributes
+
+// Memory types - use a namespace alias to preserve scoped access syntax.
+// In hiprtc JIT kernels, hipMemoryType may not be available (device-only runtime).
+// Provide a minimal fallback.
+#ifdef __HIPCC_RTC__
+    // hiprtc device-only context: define our own enum for compatibility
+    enum hipMemoryType { hipMemoryTypeUnregistered = 0, hipMemoryTypeHost = 1,
+                         hipMemoryTypeDevice = 2, hipMemoryTypeManaged = 3 };
+#endif
+
+namespace cudaMemoryType {
+    constexpr hipMemoryType cudaMemoryTypeUnregistered = hipMemoryTypeUnregistered;
+    constexpr hipMemoryType cudaMemoryTypeHost = hipMemoryTypeHost;
+    constexpr hipMemoryType cudaMemoryTypeDevice = hipMemoryTypeDevice;
+    constexpr hipMemoryType cudaMemoryTypeManaged = hipMemoryTypeManaged;
+}
+
+// Memory advise
+#define cudaMemAdvise hipMemAdvise
+#define cudaMemoryAdvise hipMemoryAdvise
+
+// CUB -> hipCUB
+#define cub hipcub
+
+// cuRAND device headers -> hipRAND (not needed in this shim, handled in API.h)
+
+// CUDA floating-point intrinsics with directed rounding.
+// HIP/AMD does not provide exact equivalents; use standard operations.
+// The _ru suffix means "round up" (toward +infinity), _rd means "round down",
+// _rn means "round to nearest", _rz means "round toward zero".
+// For most physics simulations, the difference is negligible.
+#if defined(__HIP_DEVICE_COMPILE__) || defined(__HIPCC__)
+    __device__ inline float __frcp_ru(float x) { return 1.0f / x; }
+    __device__ inline double __drcp_ru(double x) { return 1.0 / x; }
+    __device__ inline float __fmul_ru(float x, float y) { return x * y; }
+    __device__ inline double __dmul_ru(double x, double y) { return x * y; }
+    __device__ inline float __fadd_ru(float x, float y) { return x + y; }
+    __device__ inline double __dadd_ru(double x, double y) { return x + y; }
+#endif
+
+#else  // CUDA
+
+#include <cuda_runtime.h>
+
+#endif  // USE_HIP
+
+#endif  // DEME_CUDA_TO_HIP_H
diff --git a/src/kernel/CUDAMathHelpers.cuh b/src/kernel/CUDAMathHelpers.cuh
index bb523c3d..167bc53d 100644
--- a/src/kernel/CUDAMathHelpers.cuh
+++ b/src/kernel/CUDAMathHelpers.cuh
@@ -45,7 +45,7 @@
     #undef strtok_r
 #endif
 
-#include "cuda_runtime.h"
+#include <core/utils/cuda_to_hip.h>
 
 #ifndef EXIT_WAIVED
     #define EXIT_WAIVED 2
@@ -54,7 +54,7 @@
 using uint = unsigned int;
 using ushort = unsigned short;
 
-#ifndef __CUDACC__
+#if !defined(__CUDACC__) && !defined(__HIPCC_RTC__) && !defined(__HIP_DEVICE_COMPILE__)
     ////////////////////////////////////////////////////////////////////////////////
     // override implementations of CUDA functions
     ////////////////////////////////////////////////////////////////////////////////
@@ -241,8 +241,12 @@ inline __host__ __device__ int4 operator-(int4& a) {
 
 ////////////////////////////////////////////////////////////////////////////////
 // addition
+// Note: HIP provides operator+, operator+= for HIP_vector_type in
+// amd_hip_vector_types.h, so we guard vector-vector operators to avoid
+// ambiguity. Scalar versions may still be needed.
 ////////////////////////////////////////////////////////////////////////////////
 
+#ifndef __HIP_PLATFORM_AMD__
 inline __host__ __device__ float2 operator+(float2 a, float2 b) {
     return make_float2(a.x + b.x, a.y + b.y);
 }
@@ -250,6 +254,7 @@ inline __host__ __device__ void operator+=(float2& a, float2 b) {
     a.x += b.x;
     a.y += b.y;
 }
+#endif
 inline __host__ __device__ float2 operator+(float2 a, float b) {
     return make_float2(a.x + b, a.y + b);
 }
@@ -297,6 +302,7 @@ inline __host__ __device__ void operator+=(uint2& a, uint b) {
     a.y += b;
 }
 
+#ifndef __HIP_PLATFORM_AMD__
 inline __host__ __device__ float3 operator+(float3 a, float3 b) {
     return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
 }
@@ -305,6 +311,7 @@ inline __host__ __device__ void operator+=(float3& a, float3 b) {
     a.y += b.y;
     a.z += b.z;
 }
+#endif
 inline __host__ __device__ float3 operator+(float3 a, float b) {
     return make_float3(a.x + b, a.y + b, a.z + b);
 }
@@ -358,6 +365,7 @@ inline __host__ __device__ float3 operator+(float b, float3 a) {
     return make_float3(a.x + b, a.y + b, a.z + b);
 }
 
+#ifndef __HIP_PLATFORM_AMD__
 inline __host__ __device__ float4 operator+(float4 a, float4 b) {
     return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
 }
@@ -367,6 +375,7 @@ inline __host__ __device__ void operator+=(float4& a, float4 b) {
     a.z += b.z;
     a.w += b.w;
 }
+#endif
 inline __host__ __device__ float4 operator+(float4 a, float b) {
     return make_float4(a.x + b, a.y + b, a.z + b, a.w + b);
 }
@@ -428,6 +437,7 @@ inline __host__ __device__ void operator+=(uint4& a, uint b) {
 // subtract
 ////////////////////////////////////////////////////////////////////////////////
 
+#ifndef __HIP_PLATFORM_AMD__
 inline __host__ __device__ float2 operator-(float2 a, float2 b) {
     return make_float2(a.x - b.x, a.y - b.y);
 }
@@ -435,6 +445,7 @@ inline __host__ __device__ void operator-=(float2& a, float2 b) {
     a.x -= b.x;
     a.y -= b.y;
 }
+#endif
 inline __host__ __device__ float2 operator-(float2 a, float b) {
     return make_float2(a.x - b, a.y - b);
 }
@@ -482,6 +493,7 @@ inline __host__ __device__ void operator-=(uint2& a, uint b) {
     a.y -= b;
 }
 
+#ifndef __HIP_PLATFORM_AMD__
 inline __host__ __device__ float3 operator-(float3 a, float3 b) {
     return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
 }
@@ -490,6 +502,7 @@ inline __host__ __device__ void operator-=(float3& a, float3 b) {
     a.y -= b.y;
     a.z -= b.z;
 }
+#endif
 inline __host__ __device__ float3 operator-(float3 a, float b) {
     return make_float3(a.x - b, a.y - b, a.z - b);
 }
@@ -542,6 +555,7 @@ inline __host__ __device__ void operator-=(uint3& a, uint b) {
     a.z -= b;
 }
 
+#ifndef __HIP_PLATFORM_AMD__
 inline __host__ __device__ float4 operator-(float4 a, float4 b) {
     return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
 }
@@ -551,6 +565,7 @@ inline __host__ __device__ void operator-=(float4& a, float4 b) {
     a.z -= b.z;
     a.w -= b.w;
 }
+#endif
 inline __host__ __device__ float4 operator-(float4 a, float b) {
     return make_float4(a.x - b, a.y - b, a.z - b, a.w - b);
 }
@@ -607,8 +622,11 @@ inline __host__ __device__ void operator-=(uint4& a, uint b) {
 
 ////////////////////////////////////////////////////////////////////////////////
 // multiply
+// Note: HIP provides operator*(floatN, floatN) in amd_hip_vector_types.h
+// so we guard the vector-to-vector operators to avoid ambiguity
 ////////////////////////////////////////////////////////////////////////////////
 
+#ifndef __HIP_PLATFORM_AMD__
 inline __host__ __device__ float2 operator*(float2 a, float2 b) {
     return make_float2(a.x * b.x, a.y * b.y);
 }
@@ -616,6 +634,7 @@ inline __host__ __device__ void operator*=(float2& a, float2 b) {
     a.x *= b.x;
     a.y *= b.y;
 }
+#endif
 inline __host__ __device__ float2 operator*(float2 a, float b) {
     return make_float2(a.x * b, a.y * b);
 }
@@ -663,6 +682,7 @@ inline __host__ __device__ void operator*=(uint2& a, uint b) {
     a.y *= b;
 }
 
+#ifndef __HIP_PLATFORM_AMD__
 inline __host__ __device__ float3 operator*(float3 a, float3 b) {
     return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
 }
@@ -671,6 +691,7 @@ inline __host__ __device__ void operator*=(float3& a, float3 b) {
     a.y *= b.y;
     a.z *= b.z;
 }
+#endif
 inline __host__ __device__ float3 operator*(float3 a, float b) {
     return make_float3(a.x * b, a.y * b, a.z * b);
 }
@@ -723,6 +744,7 @@ inline __host__ __device__ void operator*=(uint3& a, uint b) {
     a.z *= b;
 }
 
+#ifndef __HIP_PLATFORM_AMD__
 inline __host__ __device__ float4 operator*(float4 a, float4 b) {
     return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
 }
@@ -732,6 +754,7 @@ inline __host__ __device__ void operator*=(float4& a, float4 b) {
     a.z *= b.z;
     a.w *= b.w;
 }
+#endif
 inline __host__ __device__ float4 operator*(float4 a, float b) {
     return make_float4(a.x * b, a.y * b, a.z * b, a.w * b);
 }
@@ -791,8 +814,10 @@ inline __host__ __device__ void operator*=(uint4& a, uint b) {
 
 ////////////////////////////////////////////////////////////////////////////////
 // divide
+// Note: HIP provides operator/(floatN, floatN) in amd_hip_vector_types.h
 ////////////////////////////////////////////////////////////////////////////////
 
+#ifndef __HIP_PLATFORM_AMD__
 inline __host__ __device__ float2 operator/(float2 a, float2 b) {
     return make_float2(a.x / b.x, a.y / b.y);
 }
@@ -800,6 +825,7 @@ inline __host__ __device__ void operator/=(float2& a, float2 b) {
     a.x /= b.x;
     a.y /= b.y;
 }
+#endif
 inline __host__ __device__ float2 operator/(float2 a, float b) {
     return make_float2(a.x / b, a.y / b);
 }
@@ -811,6 +837,7 @@ inline __host__ __device__ float2 operator/(float b, float2 a) {
     return make_float2(b / a.x, b / a.y);
 }
 
+#ifndef __HIP_PLATFORM_AMD__
 inline __host__ __device__ float3 operator/(float3 a, float3 b) {
     return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
 }
@@ -819,6 +846,7 @@ inline __host__ __device__ void operator/=(float3& a, float3 b) {
     a.y /= b.y;
     a.z /= b.z;
 }
+#endif
 inline __host__ __device__ float3 operator/(float3 a, float b) {
     return make_float3(a.x / b, a.y / b, a.z / b);
 }
@@ -831,6 +859,7 @@ inline __host__ __device__ float3 operator/(float b, float3 a) {
     return make_float3(b / a.x, b / a.y, b / a.z);
 }
 
+#ifndef __HIP_PLATFORM_AMD__
 inline __host__ __device__ float4 operator/(float4 a, float4 b) {
     return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
 }
@@ -840,6 +869,7 @@ inline __host__ __device__ void operator/=(float4& a, float4 b) {
     a.z /= b.z;
     a.w /= b.w;
 }
+#endif
 inline __host__ __device__ float4 operator/(float4 a, float b) {
     return make_float4(a.x / b, a.y / b, a.z / b, a.w / b);
 }
@@ -1453,11 +1483,20 @@ inline __host__ __device__ T2 to_real3(const T1& a) {
 }
 
 // Cause an error inside a kernel
-#define DEME_ABORT_KERNEL(...) \
-    {                          \
-        printf(__VA_ARGS__);   \
-        __threadfence();       \
-        asm volatile("trap;"); \
-    }
+#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP)
+    #define DEME_ABORT_KERNEL(...) \
+        {                          \
+            printf(__VA_ARGS__);   \
+            __threadfence();       \
+            __builtin_trap();      \
+        }
+#else
+    #define DEME_ABORT_KERNEL(...) \
+        {                          \
+            printf(__VA_ARGS__);   \
+            __threadfence();       \
+            asm volatile("trap;"); \
+        }
+#endif
 
 #endif