diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7f805145..a1d2b2cc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,11 +14,29 @@ set(DEME_VERSION_MAJOR   2)
 set(DEME_VERSION_MINOR   1)
 set(DEME_VERSION_PATCH   0)
 
-project(
-	Chrono-DEM-Engine
-	VERSION ${DEME_VERSION_MAJOR}.${DEME_VERSION_MINOR}.${DEME_VERSION_PATCH}
-	LANGUAGES CXX CUDA
-)
+# HIP/ROCm support option (must be set before project() to select the right language)
+option(USE_HIP "Build with HIP for AMD GPUs" OFF)
+
+if(USE_HIP)
+	project(
+		Chrono-DEM-Engine
+		VERSION ${DEME_VERSION_MAJOR}.${DEME_VERSION_MINOR}.${DEME_VERSION_PATCH}
+		LANGUAGES CXX HIP
+	)
+	# Set default HIP architecture if not specified
+	if(NOT DEFINED CMAKE_HIP_ARCHITECTURES OR CMAKE_HIP_ARCHITECTURES STREQUAL "")
+		set(CMAKE_HIP_ARCHITECTURES "gfx90a" CACHE STRING "HIP architectures")
+	endif()
+	add_compile_definitions(USE_HIP)
+	# CXX files that include hip_runtime.h need this defined (HIP compiler defines it automatically)
+	add_compile_definitions(__HIP_PLATFORM_AMD__)
+else()
+	project(
+		Chrono-DEM-Engine
+		VERSION ${DEME_VERSION_MAJOR}.${DEME_VERSION_MINOR}.${DEME_VERSION_PATCH}
+		LANGUAGES CXX CUDA
+	)
+endif()
 
 if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
   message(STATUS "Setting CMAKE_BUILD_TYPE to 'Release' (default)")
@@ -41,20 +59,29 @@ include(cmake/FixNinjaColors.cmake)
 fix_ninja_colors()
 
 
-find_package(CUDAToolkit REQUIRED)
-
-# Find CUB library (this might need to be done in source-level config)
-find_package(
-	CUB REQUIRED
-	HINTS ${CUDAToolkit_ROOT}/lib64/cmake/cub
-)
+if(USE_HIP)
+	# Find hipCUB for HIP builds
+	find_package(hipcub REQUIRED)
+	find_package(hip REQUIRED)
+else()
+	find_package(CUDAToolkit REQUIRED)
+	# Find CUB library (this might need to be done in source-level config)
+	find_package(
+		CUB REQUIRED
+		HINTS ${CUDAToolkit_ROOT}/lib64/cmake/cub
+	)
+endif()
 
-# Find NVIDIA's Jitify library 
-find_path(
-	NVIDIAJitifyPath
-	NAMES jitify.hpp
-	PATHS "${CMAKE_CURRENT_LIST_DIR}/thirdparty/jitify"
-)
+# Find Jitify library (NVIDIA version only; HIP uses hiprtc directly via JitKernel abstraction)
+if(NOT USE_HIP)
+	find_path(
+		JitifyPath
+		NAMES jitify.hpp
+		PATHS "${CMAKE_CURRENT_LIST_DIR}/thirdparty/jitify"
+	)
+	# Keep backward compat alias
+	set(NVIDIAJitifyPath ${JitifyPath})
+endif()
 
 # Let the user decide if they want to use ChPF
 option(USE_CHPF "Toggle the use of ChPF for outputting" OFF)
@@ -119,8 +146,13 @@ cxx_std_autodetect()
 set(ProjectIncludeSource "${CMAKE_CURRENT_SOURCE_DIR}/src")
 set(ProjectIncludeGenerated "${CMAKE_BINARY_DIR}/src")
 
-# Global fix for CUDA language bug
-include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+# Global fix for CUDA/HIP language bug
+if(USE_HIP)
+	# Add HIP include directories for all targets (both CXX and HIP)
+	include_directories(${HIP_INCLUDE_DIR})
+else()
+	include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+endif()
 
 
 #------------------------------------------------------------
@@ -164,22 +196,41 @@ if(USE_CHPF)
     target_compile_definitions(simulator_multi_gpu PUBLIC DEME_USE_CHPF)
 	set(USE_CHPF_STR "ON")
 
-	target_link_libraries(simulator_multi_gpu
-		PUBLIC CUDA::cudart
-		PUBLIC CUDA::nvrtc
-		PUBLIC CUDA::cuda_driver
-		PUBLIC ${ChPF_IMPORTED_NAME}
-		PUBLIC DEMERuntimeDataHelper
-	)
+	if(USE_HIP)
+		find_library(HIPRTC_LIB hiprtc HINTS ${ROCM_PATH}/lib /opt/rocm/lib REQUIRED)
+		target_link_libraries(simulator_multi_gpu
+			PUBLIC hip::host
+			PUBLIC ${HIPRTC_LIB}
+			PUBLIC ${ChPF_IMPORTED_NAME}
+			PUBLIC DEMERuntimeDataHelper
+		)
+	else()
+		target_link_libraries(simulator_multi_gpu
+			PUBLIC CUDA::cudart
+			PUBLIC CUDA::nvrtc
+			PUBLIC CUDA::cuda_driver
+			PUBLIC ${ChPF_IMPORTED_NAME}
+			PUBLIC DEMERuntimeDataHelper
+		)
+	endif()
 else()
 	set(USE_CHPF_STR "OFF")
 
-	target_link_libraries(simulator_multi_gpu
-		PUBLIC CUDA::cudart
-		PUBLIC CUDA::nvrtc
-		PUBLIC CUDA::cuda_driver
-		PUBLIC DEMERuntimeDataHelper
-	)
+	if(USE_HIP)
+		find_library(HIPRTC_LIB hiprtc HINTS ${ROCM_PATH}/lib /opt/rocm/lib REQUIRED)
+		target_link_libraries(simulator_multi_gpu
+			PUBLIC hip::host
+			PUBLIC ${HIPRTC_LIB}
+			PUBLIC DEMERuntimeDataHelper
+		)
+	else()
+		target_link_libraries(simulator_multi_gpu
+			PUBLIC CUDA::cudart
+			PUBLIC CUDA::nvrtc
+			PUBLIC CUDA::cuda_driver
+			PUBLIC DEMERuntimeDataHelper
+		)
+	endif()
 endif()
 
 # If use managed arrays, define a macro
@@ -194,10 +245,17 @@ if(WIN32)
 endif()
 
 # Attach include directories to the top-level library target
-set_target_properties(simulator_multi_gpu
-	PROPERTIES
-	LINKER_LANGUAGE CUDA
-)
+if(USE_HIP)
+	set_target_properties(simulator_multi_gpu
+		PROPERTIES
+		LINKER_LANGUAGE HIP
+	)
+else()
+	set_target_properties(simulator_multi_gpu
+		PROPERTIES
+		LINKER_LANGUAGE CUDA
+	)
+endif()
 
 # ---------------------------------------------------------------------------- #
 # Export and Install The Generated Targets 
diff --git a/README.md b/README.md
index f6d3f9b0..45011764 100644
--- a/README.md
+++ b/README.md
@@ -42,7 +42,7 @@ __A dual-GPU DEM solver with complex grain geometry support__
 
 DEM-Engine, nicknamed _DEME_, does Discrete Element Method simulations:
 
-- Using up to two GPUs at the same time (works great on consumer _and_ data center GPUs).
+- Using up to two GPUs at the same time (works great on consumer _and_ data center GPUs, on both NVIDIA CUDA and AMD ROCm).
 - With the particles having complex shapes represented by clumped spheres.
 - With support for customizable contact force models (want to add a non-standard cohesive force, or an electrostatic repulsive force? You got this).
 - With an emphasis on computational efficiency. As a rule of thumb, using 3-sphere clump elements, simulating 1 million elements for 1 million time steps takes around 1 hour on two RTX 3080s.
@@ -139,6 +139,21 @@ Some additional troubleshooting tips for building the project:
 - If CUB is not found, then you may manually set it in the `ccmake` GUI as `/usr/local/cuda/lib64/cmake/cub`. It may be a slightly different path on your machine or cluster.
 - If `libcudacxx` is not found, then you may manually set it in the `ccmake` GUI as `/usr/local/cuda-12.8/targets/x86_64-linux/lib/cmake/libcudacxx`. Depending on your CUDA version it may be a slightly different path on your machine or cluster. You may also try to find these packages using `find`. 
 
+### AMD GPUs (ROCm)
+
+_DEME_ also builds and runs on AMD GPUs through ROCm/HIP. Instead of CUDA, install a recent [ROCm](https://rocm.docs.amd.com/) release (the HIP runtime, `hiprtc`, and `hipCUB` are required). You do not need the NVIDIA/jitify submodule for an AMD build; runtime kernel compilation is handled by `hiprtc`.
+
+Configure the project the same way as in the **Linux and WSL** section, but pass `-DUSE_HIP=ON`, and set `CMAKE_HIP_ARCHITECTURES` to your GPU's architecture (for example `gfx90a` for MI200-series, `gfx1100` for RDNA3 desktop, or `gfx1201` for RDNA4). An example:
+
+```
+mkdir build
+cd build
+cmake -DCMAKE_BUILD_TYPE=Release -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx90a -DCMAKE_PREFIX_PATH=/opt/rocm ..
+ninja
+```
+
+If `CMAKE_HIP_ARCHITECTURES` is left unset it defaults to `gfx90a`. You can list the architecture of an installed device with `rocminfo`. If the ROCm install is not on CMake's default search path, point `-DCMAKE_PREFIX_PATH` at it (e.g. `/opt/rocm`) so `find_package` can locate hip and hipCUB. The demos are then run exactly as in the **Numerical examples** section.
+
 ### Windows
 
 The process is similar to [the installation of Chrono](https://api.projectchrono.org/tutorial_install_chrono.html), which you can use as reference. The steps depend on your choice of tools, and what listed here are our recommendation.
diff --git a/src/DEM/API.h b/src/DEM/API.h
index c1ff1665..67575c3d 100644
--- a/src/DEM/API.h
+++ b/src/DEM/API.h
@@ -1370,12 +1370,12 @@ class DEMSolver {
     void PrintKinematicScratchSpaceUsage() const { kT->printScratchSpaceUsage(); }
 
     /// Let dT do this call and return the reduce value of the inspected quantity.
-    float dTInspectReduce(const std::shared_ptr<jitify::Program>& inspection_kernel,
+    float dTInspectReduce(const std::shared_ptr<deme::jit::Program>& inspection_kernel,
                           const std::string& kernel_name,
                           INSPECT_ENTITY_TYPE thing_to_insp,
                           CUB_REDUCE_FLAVOR reduce_flavor,
                           bool all_domain);
-    float* dTInspectNoReduce(const std::shared_ptr<jitify::Program>& inspection_kernel,
+    float* dTInspectNoReduce(const std::shared_ptr<deme::jit::Program>& inspection_kernel,
                              const std::string& kernel_name,
                              INSPECT_ENTITY_TYPE thing_to_insp,
                              CUB_REDUCE_FLAVOR reduce_flavor,
@@ -1509,7 +1509,9 @@ class DEMSolver {
     int m_updateFreq = 20;
 
     // The extra libs that the kernels need to include.
-    std::string kernel_includes = "#include <curand_kernel.h>\n";
+    // Default: none (curand/hiprand not used by built-in kernels).
+    // Users can add custom includes via SetKernelInclude() if needed.
+    std::string kernel_includes = "";
 
     // If and how we should add boundaries to the simulation world upon initialization. Choose between none, all and
     // top_open.
diff --git a/src/DEM/APIPrivate.cpp b/src/DEM/APIPrivate.cpp
index 08250829..eebab753 100644
--- a/src/DEM/APIPrivate.cpp
+++ b/src/DEM/APIPrivate.cpp
@@ -2121,7 +2121,20 @@ inline void DEMSolver::equipSimParams(std::unordered_map<std::string, std::strin
 
     // Some constants that we should consider using or not using
     // strMap["_nAnalGM_"] = std::to_string(nAnalGM);
-    strMap["_nActiveLoadingThreads_"] = std::to_string(NUM_ACTIVE_TEMPLATE_LOADING_THREADS);
+    // Query device warp size at runtime for correct multi-arch support (wave32 vs wave64)
+    int runtimeWarpSize = DEME_CUDA_WARP_SIZE;  // Compile-time fallback
+#if defined(USE_HIP)
+    {
+        int dev = 0;
+        cudaDeviceProp prop;
+        if (cudaGetDevice(&dev) == cudaSuccess && cudaGetDeviceProperties(&prop, dev) == cudaSuccess) {
+            runtimeWarpSize = prop.warpSize;  // 64 on gfx90a, 32 on gfx1100
+        }
+    }
+#endif
+    clumpComponentOffset_t nActiveLoadingThreads = static_cast<clumpComponentOffset_t>(
+        DEME_MIN(DEME_MIN(runtimeWarpSize, DEME_KT_CD_NTHREADS_PER_BLOCK), DEME_NUM_BODIES_PER_BLOCK));
+    strMap["_nActiveLoadingThreads_"] = std::to_string(nActiveLoadingThreads);
     // nTotalBodyTopologies includes clump topologies and ext obj topologies
     strMap["_nDistinctMassProperties_"] = std::to_string(nDistinctMassProperties);
     strMap["_nJitifiableClumpComponents_"] = std::to_string(nJitifiableClumpComponents);
diff --git a/src/DEM/APIPublic.cpp b/src/DEM/APIPublic.cpp
index 41b5c006..93b37abe 100644
--- a/src/DEM/APIPublic.cpp
+++ b/src/DEM/APIPublic.cpp
@@ -2526,7 +2526,7 @@ void DEMSolver::ClearThreadCollaborationStats() {
     dT->nTotalSteps = 0;
 }
 
-float DEMSolver::dTInspectReduce(const std::shared_ptr<jitify::Program>& inspection_kernel,
+float DEMSolver::dTInspectReduce(const std::shared_ptr<deme::jit::Program>& inspection_kernel,
                                  const std::string& kernel_name,
                                  INSPECT_ENTITY_TYPE thing_to_insp,
                                  CUB_REDUCE_FLAVOR reduce_flavor,
@@ -2537,7 +2537,7 @@ float DEMSolver::dTInspectReduce(const std::shared_ptr<jitify::Program>& inspect
     return (float)(*pRes);
 }
 
-float* DEMSolver::dTInspectNoReduce(const std::shared_ptr<jitify::Program>& inspection_kernel,
+float* DEMSolver::dTInspectNoReduce(const std::shared_ptr<deme::jit::Program>& inspection_kernel,
                                     const std::string& kernel_name,
                                     INSPECT_ENTITY_TYPE thing_to_insp,
                                     CUB_REDUCE_FLAVOR reduce_flavor,
diff --git a/src/DEM/AuxClasses.cpp b/src/DEM/AuxClasses.cpp
index 5e2bb13b..a000f38a 100644
--- a/src/DEM/AuxClasses.cpp
+++ b/src/DEM/AuxClasses.cpp
@@ -229,10 +229,10 @@ void DEMInspector::Initialize(const std::unordered_map<std::string, std::string>
     my_subs["_inRegionPolicy_"] = in_region_specifier;
     my_subs["_quantityQueryProcess_"] = inspection_code;
     if (thing_to_insp == INSPECT_ENTITY_TYPE::SPHERE) {
-        inspection_kernel = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
+        inspection_kernel = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
             "DEMSphereQueryKernels", JitHelper::KERNEL_DIR / "DEMSphereQueryKernels.cu", my_subs, options)));
     } else if (thing_to_insp == INSPECT_ENTITY_TYPE::CLUMP || thing_to_insp == INSPECT_ENTITY_TYPE::EVERYTHING) {
-        inspection_kernel = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
+        inspection_kernel = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
             "DEMOwnerQueryKernels", JitHelper::KERNEL_DIR / "DEMOwnerQueryKernels.cu", my_subs, options)));
     } else {
         std::stringstream ss;
diff --git a/src/DEM/AuxClasses.h b/src/DEM/AuxClasses.h
index 310d1b4b..09baada1 100644
--- a/src/DEM/AuxClasses.h
+++ b/src/DEM/AuxClasses.h
@@ -11,11 +11,6 @@
 #include "../core/utils/JitHelper.h"
 #include "Defines.h"
 
-// Forward declare jitify::Program to avoid downstream dependency
-namespace jitify {
-class Program;
-}
-
 namespace deme {
 
 class DEMSolver;
@@ -25,7 +20,7 @@ class DEMDynamicThread;
 /// their simulation entites, in a given region.
 class DEMInspector {
   private:
-    std::shared_ptr<jitify::Program> inspection_kernel;
+    std::shared_ptr<deme::jit::Program> inspection_kernel;
 
     std::string inspection_code;
     std::string in_region_code;
diff --git a/src/DEM/CMakeLists.txt b/src/DEM/CMakeLists.txt
index 43787a51..7cb96d09 100644
--- a/src/DEM/CMakeLists.txt
+++ b/src/DEM/CMakeLists.txt
@@ -13,17 +13,13 @@ target_include_directories(
 	PUBLIC ${ProjectIncludeGenerated}
 )
 
+# DEM is pure CXX, no GPU library dependencies.
+# CUB/hipcub is only needed by algorithms target.
 if(USE_CHPF)
 	target_link_libraries(
 		DEM
-		PUBLIC CUB::CUB
 		PUBLIC ${ChPF_IMPORTED_NAME}
 	)
-else()
-	target_link_libraries(
-		DEM
-		PUBLIC CUB::CUB
-	)
 endif()
 
 # if(WIN32)
diff --git a/src/DEM/Defines.h b/src/DEM/Defines.h
index c2a8de8e..68ffdfb8 100644
--- a/src/DEM/Defines.h
+++ b/src/DEM/Defines.h
@@ -12,7 +12,7 @@
 #include <cmath>
 
 #include "VariableTypes.h"
-#include "cuda_runtime.h"
+#include <core/utils/cuda_to_hip.h>
 
 #define DEME_MIN(a, b) ((a < b) ? a : b)
 #define DEME_MAX(a, b) ((a > b) ? a : b)
@@ -29,7 +29,17 @@ namespace deme {
 #define DEME_TINY_FLOAT 1e-12                ///< Appears to be very sensitive to even smaller values...
 #define DEME_HUGE_FLOAT 1e15
 #define DEME_BITS_PER_BYTE 8
-#define DEME_CUDA_WARP_SIZE 32
+// Wavefront/warp size: 64 only on AMD wave64 device code (gfx8/gfx9 GCN/CDNA); 32 everywhere
+// else (AMD RDNA, NVIDIA, and host code). ROCm 7.2.x does not provide __AMDGCN_WAVEFRONT_SIZE__,
+// so device code keys off the __GFX*__ macros. For JIT kernels the actual device warp size is
+// queried at runtime and substituted.
+#if defined(__HIP_DEVICE_COMPILE__) && (defined(__GFX8__) || defined(__GFX9__))
+    #define DEME_WARP_SIZE 64
+#else
+    #define DEME_WARP_SIZE 32
+#endif
+// Legacy macro for compatibility
+#define DEME_CUDA_WARP_SIZE DEME_WARP_SIZE
 #define DEME_MAX_WILDCARD_NUM 16
 // In bin--triangle intersection scan, all bins are enlarged by a factor of this following constant, so that no triangle
 // lies in between bins and not picked up by any bins.
diff --git a/src/DEM/dT.cpp b/src/DEM/dT.cpp
index 0d047bb5..09bfc364 100644
--- a/src/DEM/dT.cpp
+++ b/src/DEM/dT.cpp
@@ -2519,41 +2519,41 @@ void DEMDynamicThread::jitifyKernels(const std::unordered_map<std::string, std::
                                      const std::vector<std::string>& JitifyOptions) {
     // First one is force array preparation kernels
     {
-        prep_force_kernels = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
+        prep_force_kernels = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
             "DEMPrepForceKernels", JitHelper::KERNEL_DIR / "DEMPrepForceKernels.cu", Subs, JitifyOptions)));
     }
     // Then force calculation kernels
     {
-        cal_force_kernels = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
+        cal_force_kernels = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
             "DEMCalcForceKernels", JitHelper::KERNEL_DIR / "DEMCalcForceKernels.cu", Subs, JitifyOptions)));
     }
     // Then force accumulation kernels
     if (solverFlags.useCubForceCollect) {
-        collect_force_kernels = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
+        collect_force_kernels = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
             "DEMCollectForceKernels", JitHelper::KERNEL_DIR / "DEMCollectForceKernels.cu", Subs, JitifyOptions)));
     } else {
-        collect_force_kernels = std::make_shared<jitify::Program>(std::move(
+        collect_force_kernels = std::make_shared<deme::jit::Program>(std::move(
             JitHelper::buildProgram("DEMCollectForceKernels_Compact",
                                     JitHelper::KERNEL_DIR / "DEMCollectForceKernels_Compact.cu", Subs, JitifyOptions)));
     }
     // Then integration kernels
     {
-        integrator_kernels = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
+        integrator_kernels = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
             "DEMIntegrationKernels", JitHelper::KERNEL_DIR / "DEMIntegrationKernels.cu", Subs, JitifyOptions)));
     }
     // Then kernels that are... wildcards, which make on-the-fly changes to solver data
     if (solverFlags.canFamilyChangeOnDevice) {
-        mod_kernels = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
+        mod_kernels = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
             "DEMModeratorKernels", JitHelper::KERNEL_DIR / "DEMModeratorKernels.cu", Subs, JitifyOptions)));
     }
     // Then misc kernels
     {
-        misc_kernels = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
+        misc_kernels = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
             "DEMMiscKernels", JitHelper::KERNEL_DIR / "DEMMiscKernels.cu", Subs, JitifyOptions)));
     }
 }
 
-float* DEMDynamicThread::inspectCall(const std::shared_ptr<jitify::Program>& inspection_kernel,
+float* DEMDynamicThread::inspectCall(const std::shared_ptr<deme::jit::Program>& inspection_kernel,
                                      const std::string& kernel_name,
                                      INSPECT_ENTITY_TYPE thing_to_insp,
                                      CUB_REDUCE_FLAVOR reduce_flavor,
diff --git a/src/DEM/dT.h b/src/DEM/dT.h
index 3222088d..80df1866 100644
--- a/src/DEM/dT.h
+++ b/src/DEM/dT.h
@@ -10,6 +10,7 @@
 #include <vector>
 #include <thread>
 #include <unordered_map>
+#include <unordered_set>
 #include <set>
 #include <functional>
 
@@ -23,10 +24,12 @@
 #include "Structs.h"
 #include "AuxClasses.h"
 
-// Forward declare jitify::Program to avoid downstream dependency
-namespace jitify {
+// Forward declare deme::jit::Program
+namespace deme {
+namespace jit {
 class Program;
 }
+}
 
 namespace deme {
 
@@ -631,7 +634,7 @@ class DEMDynamicThread {
                        const std::vector<std::string>& JitifyOptions);
 
     // Execute this kernel, then return the reduced value
-    float* inspectCall(const std::shared_ptr<jitify::Program>& inspection_kernel,
+    float* inspectCall(const std::shared_ptr<deme::jit::Program>& inspection_kernel,
                        const std::string& kernel_name,
                        INSPECT_ENTITY_TYPE thing_to_insp,
                        CUB_REDUCE_FLAVOR reduce_flavor,
@@ -709,13 +712,13 @@ class DEMDynamicThread {
         const std::function<bool(unsigned int, unsigned int, unsigned int, unsigned int)>& condition);
 
     // Just-in-time compiled kernels
-    std::shared_ptr<jitify::Program> prep_force_kernels;
-    std::shared_ptr<jitify::Program> cal_force_kernels;
-    std::shared_ptr<jitify::Program> collect_force_kernels;
-    std::shared_ptr<jitify::Program> integrator_kernels;
-    // std::shared_ptr<jitify::Program> quarry_stats_kernels;
-    std::shared_ptr<jitify::Program> mod_kernels;
-    std::shared_ptr<jitify::Program> misc_kernels;
+    std::shared_ptr<deme::jit::Program> prep_force_kernels;
+    std::shared_ptr<deme::jit::Program> cal_force_kernels;
+    std::shared_ptr<deme::jit::Program> collect_force_kernels;
+    std::shared_ptr<deme::jit::Program> integrator_kernels;
+    // std::shared_ptr<deme::jit::Program> quarry_stats_kernels;
+    std::shared_ptr<deme::jit::Program> mod_kernels;
+    std::shared_ptr<deme::jit::Program> misc_kernels;
 
     // Adjuster for update freq
     class AccumStepUpdater {
diff --git a/src/DEM/kT.cpp b/src/DEM/kT.cpp
index c56815df..2d140d68 100644
--- a/src/DEM/kT.cpp
+++ b/src/DEM/kT.cpp
@@ -880,34 +880,34 @@ void DEMKinematicThread::jitifyKernels(const std::unordered_map<std::string, std
                                        const std::vector<std::string>& JitifyOptions) {
     // First one is bin_sphere_kernels kernels, which figure out the bin--sphere touch pairs
     {
-        bin_sphere_kernels = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
+        bin_sphere_kernels = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
             "DEMBinSphereKernels", JitHelper::KERNEL_DIR / "DEMBinSphereKernels.cu", Subs, JitifyOptions)));
     }
     // Then CD kernels
     {
-        sphere_contact_kernels = std::make_shared<jitify::Program>(std::move(
+        sphere_contact_kernels = std::make_shared<deme::jit::Program>(std::move(
             JitHelper::buildProgram("DEMContactKernels_SphereSphere",
                                     JitHelper::KERNEL_DIR / "DEMContactKernels_SphereSphere.cu", Subs, JitifyOptions)));
     }
     // Then triangle--bin intersection-related kernels
     {
-        bin_triangle_kernels = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
+        bin_triangle_kernels = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
             "DEMBinTriangleKernels", JitHelper::KERNEL_DIR / "DEMBinTriangleKernels.cu", Subs, JitifyOptions)));
     }
     // Then sphere--triangle contact detection-related kernels
     {
-        sphTri_contact_kernels = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
+        sphTri_contact_kernels = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
             "DEMContactKernels_SphereTriangle", JitHelper::KERNEL_DIR / "DEMContactKernels_SphereTriangle.cu", Subs,
             JitifyOptions)));
     }
     // Then contact history mapping kernels
     {
-        history_kernels = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
+        history_kernels = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
             "DEMHistoryMappingKernels", JitHelper::KERNEL_DIR / "DEMHistoryMappingKernels.cu", Subs, JitifyOptions)));
     }
     // Then misc kernels
     {
-        misc_kernels = std::make_shared<jitify::Program>(std::move(JitHelper::buildProgram(
+        misc_kernels = std::make_shared<deme::jit::Program>(std::move(JitHelper::buildProgram(
             "DEMMiscKernels", JitHelper::KERNEL_DIR / "DEMMiscKernels.cu", Subs, JitifyOptions)));
     }
 }
diff --git a/src/DEM/kT.h b/src/DEM/kT.h
index 54c8ed93..6032b10e 100644
--- a/src/DEM/kT.h
+++ b/src/DEM/kT.h
@@ -23,10 +23,12 @@
 #include "Defines.h"
 #include "Structs.h"
 
-// Forward declare jitify::Program to avoid downstream dependency
-namespace jitify {
+// Forward declare deme::jit::Program
+namespace deme {
+namespace jit {
 class Program;
 }
+}
 
 namespace deme {
 
@@ -408,12 +410,12 @@ class DEMKinematicThread {
 
     // Just-in-time compiled kernels
     // jitify::Program bin_sphere_kernels = JitHelper::buildProgram("bin_sphere_kernels", " ");
-    std::shared_ptr<jitify::Program> bin_sphere_kernels;
-    std::shared_ptr<jitify::Program> bin_triangle_kernels;
-    std::shared_ptr<jitify::Program> sphTri_contact_kernels;
-    std::shared_ptr<jitify::Program> sphere_contact_kernels;
-    std::shared_ptr<jitify::Program> history_kernels;
-    std::shared_ptr<jitify::Program> misc_kernels;
+    std::shared_ptr<deme::jit::Program> bin_sphere_kernels;
+    std::shared_ptr<deme::jit::Program> bin_triangle_kernels;
+    std::shared_ptr<deme::jit::Program> sphTri_contact_kernels;
+    std::shared_ptr<deme::jit::Program> sphere_contact_kernels;
+    std::shared_ptr<deme::jit::Program> history_kernels;
+    std::shared_ptr<deme::jit::Program> misc_kernels;
 
     // Adjuster for bin size
     class AccumTimer {
diff --git a/src/algorithms/CMakeLists.txt b/src/algorithms/CMakeLists.txt
index 796d48f0..ff9a88ca 100644
--- a/src/algorithms/CMakeLists.txt
+++ b/src/algorithms/CMakeLists.txt
@@ -15,10 +15,17 @@ target_include_directories(
 	PUBLIC ${ProjectIncludeGenerated}
 )
 
-target_link_libraries(
-	algorithms
-	PUBLIC CUB::CUB
-)
+if(USE_HIP)
+	target_link_libraries(
+		algorithms
+		PUBLIC hip::hipcub
+	)
+else()
+	target_link_libraries(
+		algorithms
+		PUBLIC CUB::CUB
+	)
+endif()
 
 ### HOST HEADERS ONLY (.h, .hpp) ###
 set(algorithms_interface
@@ -45,23 +52,36 @@ target_sources(
 	PRIVATE ${algorithms_sources}
 )
 
-# Get a list of supported architectures
-include(../../cmake/CudaSupportedArchitectures.cmake)
-cuda_supported_architectures()
+if(USE_HIP)
+	# Mark .cu sources as HIP language
+	set_source_files_properties(${algorithms_sources} PROPERTIES LANGUAGE HIP)
 
-# Convert the cuda supported architectures into their respective CMake properties
-set(virtual_architectures "")
-foreach(arch ${CUDASUP_ARCHITECTURES})
-	list(APPEND virtual_architectures "${arch}-virtual")
-endforeach()
+	set_target_properties(
+		algorithms PROPERTIES
+		POSITION_INDEPENDENT_CODE True
+		CXX_STANDARD ${CXXSTD_SUPPORTED}
+		PUBLIC_HEADER "${algorithms_interface}"
+		HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}"
+	)
+else()
+	# Get a list of supported architectures
+	include(../../cmake/CudaSupportedArchitectures.cmake)
+	cuda_supported_architectures()
 
-set_target_properties(
-	algorithms PROPERTIES 
-	POSITION_INDEPENDENT_CODE True
-	CXX_STANDARD ${CXXSTD_SUPPORTED}
-	PUBLIC_HEADER "${algorithms_interface}"
-	CUDA_ARCHITECTURES "${virtual_architectures}"
-)
+	# Convert the cuda supported architectures into their respective CMake properties
+	set(virtual_architectures "")
+	foreach(arch ${CUDASUP_ARCHITECTURES})
+		list(APPEND virtual_architectures "${arch}-virtual")
+	endforeach()
+
+	set_target_properties(
+		algorithms PROPERTIES
+		POSITION_INDEPENDENT_CODE True
+		CXX_STANDARD ${CXXSTD_SUPPORTED}
+		PUBLIC_HEADER "${algorithms_interface}"
+		CUDA_ARCHITECTURES "${virtual_architectures}"
+	)
+endif()
 
 install(
 	TARGETS algorithms
diff --git a/src/algorithms/DEMCubContactDetection.cu b/src/algorithms/DEMCubContactDetection.cu
index dccc4702..7158b6df 100644
--- a/src/algorithms/DEMCubContactDetection.cu
+++ b/src/algorithms/DEMCubContactDetection.cu
@@ -3,7 +3,12 @@
 //
 //	SPDX-License-Identifier: BSD-3-Clause
 
+#include <core/utils/cuda_to_hip.h>
+#if defined(USE_HIP)
+#include <hipcub/hipcub.hpp>
+#else
 #include <cub/cub.cuh>
+#endif
 // #include <thrust/sort.h>
 #include <kernel/DEMHelperKernels.cuh>
 
@@ -35,11 +40,11 @@ inline void contactEventArraysResize(size_t nContactPairs,
     granData.toDevice();
 }
 
-void contactDetection(std::shared_ptr<jitify::Program>& bin_sphere_kernels,
-                      std::shared_ptr<jitify::Program>& bin_triangle_kernels,
-                      std::shared_ptr<jitify::Program>& sphere_contact_kernels,
-                      std::shared_ptr<jitify::Program>& sphTri_contact_kernels,
-                      std::shared_ptr<jitify::Program>& history_kernels,
+void contactDetection(std::shared_ptr<deme::jit::Program>& bin_sphere_kernels,
+                      std::shared_ptr<deme::jit::Program>& bin_triangle_kernels,
+                      std::shared_ptr<deme::jit::Program>& sphere_contact_kernels,
+                      std::shared_ptr<deme::jit::Program>& sphTri_contact_kernels,
+                      std::shared_ptr<deme::jit::Program>& history_kernels,
                       DualStruct<DEMDataKT>& granData,
                       DualStruct<DEMSimParams>& simParams,
                       SolverFlags& solverFlags,
diff --git a/src/algorithms/DEMCubForceCollection.cu b/src/algorithms/DEMCubForceCollection.cu
index 44c1052f..bf07dc65 100644
--- a/src/algorithms/DEMCubForceCollection.cu
+++ b/src/algorithms/DEMCubForceCollection.cu
@@ -3,7 +3,12 @@
 //
 //	SPDX-License-Identifier: BSD-3-Clause
 
+#include <core/utils/cuda_to_hip.h>
+#if defined(USE_HIP)
+#include <hipcub/hipcub.hpp>
+#else
 #include <cub/cub.cuh>
+#endif
 // #include <thrust/sort.h>
 #include <kernel/DEMHelperKernels.cuh>
 
@@ -16,7 +21,7 @@
 
 namespace deme {
 
-void collectContactForcesThruCub(std::shared_ptr<jitify::Program>& collect_force_kernels,
+void collectContactForcesThruCub(std::shared_ptr<deme::jit::Program>& collect_force_kernels,
                                  DualStruct<DEMDataDT>& granData,
                                  const size_t nContactPairs,
                                  const size_t nClumps,
diff --git a/src/algorithms/DEMCubInstantiations.cu b/src/algorithms/DEMCubInstantiations.cu
index 25735e9d..0e9f0873 100644
--- a/src/algorithms/DEMCubInstantiations.cu
+++ b/src/algorithms/DEMCubInstantiations.cu
@@ -3,7 +3,12 @@
 //
 //	SPDX-License-Identifier: BSD-3-Clause
 
+#include <core/utils/cuda_to_hip.h>
+#if defined(USE_HIP)
+#include <hipcub/hipcub.hpp>
+#else
 #include <cub/cub.cuh>
+#endif
 #include <algorithms/DEMStaticDeviceSubroutines.h>
 
 #include <core/utils/GpuError.h>
diff --git a/src/algorithms/DEMCubWrappers.cu b/src/algorithms/DEMCubWrappers.cu
index 4ecfcbe5..aae5c9ea 100644
--- a/src/algorithms/DEMCubWrappers.cu
+++ b/src/algorithms/DEMCubWrappers.cu
@@ -3,7 +3,16 @@
 //
 //	SPDX-License-Identifier: BSD-3-Clause
 
+#include <core/utils/cuda_to_hip.h>
+#if defined(USE_HIP)
+#include <hipcub/hipcub.hpp>
+// CUB_RUNTIME_FUNCTION is a CUB macro not defined in hipCUB
+#ifndef CUB_RUNTIME_FUNCTION
+#define CUB_RUNTIME_FUNCTION
+#endif
+#else
 #include <cub/cub.cuh>
+#endif
 #include <DEM/Defines.h>
 #include <DEM/Structs.h>
 #include <core/utils/GpuError.h>
diff --git a/src/algorithms/DEMStaticDeviceSubroutines.h b/src/algorithms/DEMStaticDeviceSubroutines.h
index acd0cb4f..d2de6ef7 100644
--- a/src/algorithms/DEMStaticDeviceSubroutines.h
+++ b/src/algorithms/DEMStaticDeviceSubroutines.h
@@ -76,11 +76,11 @@ void cubSortByKey(T1* d_keys_in,
 // For kT and dT's private usage
 ////////////////////////////////////////////////////////////////////////////////
 
-void contactDetection(std::shared_ptr<jitify::Program>& bin_sphere_kernels,
-                      std::shared_ptr<jitify::Program>& bin_triangle_kernels,
-                      std::shared_ptr<jitify::Program>& sphere_contact_kernels,
-                      std::shared_ptr<jitify::Program>& sphTri_contact_kernels,
-                      std::shared_ptr<jitify::Program>& history_kernels,
+void contactDetection(std::shared_ptr<deme::jit::Program>& bin_sphere_kernels,
+                      std::shared_ptr<deme::jit::Program>& bin_triangle_kernels,
+                      std::shared_ptr<deme::jit::Program>& sphere_contact_kernels,
+                      std::shared_ptr<deme::jit::Program>& sphTri_contact_kernels,
+                      std::shared_ptr<deme::jit::Program>& history_kernels,
                       DualStruct<DEMDataKT>& granData,
                       DualStruct<DEMSimParams>& simParams,
                       SolverFlags& solverFlags,
@@ -99,7 +99,7 @@ void contactDetection(std::shared_ptr<jitify::Program>& bin_sphere_kernels,
                       SolverTimers& timers,
                       kTStateParams& stateParams);
 
-void collectContactForcesThruCub(std::shared_ptr<jitify::Program>& collect_force_kernels,
+void collectContactForcesThruCub(std::shared_ptr<deme::jit::Program>& collect_force_kernels,
                                  DualStruct<DEMDataDT>& granData,
                                  const size_t nContactPairs,
                                  const size_t nClumps,
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index c01c748e..a0b5dae9 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -14,12 +14,15 @@ configure_file(
 )
 
 
-message(STATUS "${core_message} Extracting NVIDIA Jitify header...") 
-configure_file(
-	${NVIDIAJitifyPath}/jitify.hpp
-	${CMAKE_BINARY_DIR}/src/jitify/jitify.hpp
-	COPYONLY
-)
+# Jitify is only needed for CUDA builds (HIP uses hiprtc directly via JitKernel abstraction)
+if(NOT USE_HIP)
+	message(STATUS "${core_message} Extracting Jitify header...")
+	configure_file(
+		${JitifyPath}/jitify.hpp
+		${CMAKE_BINARY_DIR}/src/jitify/jitify.hpp
+		COPYONLY
+	)
+endif()
 
 add_library(core OBJECT)
 
@@ -31,17 +34,13 @@ target_include_directories(
 )
 
 
+# Core is pure CXX, no GPU library dependencies.
+# CUB/hipcub is only needed by algorithms target.
 if(USE_CHPF)
 	target_link_libraries(
 		core
-		PUBLIC CUB::CUB
 		INTERFACE ${ChPF_IMPORTED_NAME}
 	)
-else()
-	target_link_libraries(
-		core
-		PUBLIC CUB::CUB
-	)
 endif()
 
 
@@ -50,6 +49,8 @@ set(core_headers
 	${CMAKE_CURRENT_SOURCE_DIR}/utils/CudaAllocator.hpp
 	${CMAKE_CURRENT_SOURCE_DIR}/utils/ManagedMemory.hpp
 	${CMAKE_CURRENT_SOURCE_DIR}/utils/JitHelper.h
+	${CMAKE_CURRENT_SOURCE_DIR}/utils/JitKernel.h
+	${CMAKE_CURRENT_SOURCE_DIR}/utils/JitKernel.inl
 	${CMAKE_CURRENT_SOURCE_DIR}/utils/ThreadManager.h
 	${CMAKE_CURRENT_SOURCE_DIR}/utils/GpuError.h
 	${CMAKE_CURRENT_SOURCE_DIR}/utils/GpuManager.h
@@ -68,6 +69,13 @@ set(core_sources
 	${CMAKE_CURRENT_SOURCE_DIR}/utils/DEMEPaths.cpp
 )
 
+# JitKernel abstraction: select backend based on USE_HIP
+if(USE_HIP)
+	list(APPEND core_sources ${CMAKE_CURRENT_SOURCE_DIR}/utils/JitKernel_hip.cpp)
+else()
+	list(APPEND core_sources ${CMAKE_CURRENT_SOURCE_DIR}/utils/JitKernel_cuda.cpp)
+endif()
+
 target_sources(
 	core
 	PUBLIC ${core_headers} ${core_utils_headers}
@@ -98,13 +106,15 @@ install(
 		${CMAKE_INSTALL_INCLUDEDIR}/core
 )
 
-# Install Third-party Headers 
-install(
-	FILES 
-		"${NVIDIAJitifyPath}/jitify.hpp"
-	DESTINATION 
-		${CMAKE_INSTALL_INCLUDEDIR}/jitify
-)
+# Install Third-party Headers (jitify only for CUDA builds)
+if(NOT USE_HIP)
+	install(
+		FILES
+			"${NVIDIAJitifyPath}/jitify.hpp"
+		DESTINATION
+			${CMAKE_INSTALL_INCLUDEDIR}/jitify
+	)
+endif()
 
 
 # --------------------------------------------------------- #
@@ -114,7 +124,7 @@ install(
 
 message(STATUS "${core_message} Generating Runtime Data helper (build)...")
 set(DEME_RUNTIME_DATA_DIRECTORY "${CMAKE_BINARY_DIR}")
-set(DEME_RUNTIME_INCLUDE_DIRECTORY "${CMAKE_BINARY_DIR}")
+set(DEME_RUNTIME_INCLUDE_DIRECTORY "${CMAKE_SOURCE_DIR}/src")
 configure_file(
 	utils/RuntimeData.cpp.in utils/RuntimeDataBuild.cpp
 	@ONLY
diff --git a/src/core/utils/CudaAllocator.hpp b/src/core/utils/CudaAllocator.hpp
index e05b8942..32a576aa 100644
--- a/src/core/utils/CudaAllocator.hpp
+++ b/src/core/utils/CudaAllocator.hpp
@@ -8,7 +8,7 @@
 
 #include <core/ApiVersion.h>
 
-#include <cuda_runtime_api.h>
+#include "cuda_to_hip.h"
 #include <climits>
 #include <iostream>
 #include <memory>
diff --git a/src/core/utils/GpuError.h b/src/core/utils/GpuError.h
index 97d213b5..c24bc315 100644
--- a/src/core/utils/GpuError.h
+++ b/src/core/utils/GpuError.h
@@ -9,7 +9,7 @@
 #include <iostream>
 #include <sstream>
 #include <exception>
-#include <cuda_runtime_api.h>
+#include "cuda_to_hip.h"
 
 #define DEME_GPU_CALL(res) \
     { gpu_assert((res), __FILE__, __LINE__); }
diff --git a/src/core/utils/GpuManager.h b/src/core/utils/GpuManager.h
index bf9cf611..d5a5d3c3 100644
--- a/src/core/utils/GpuManager.h
+++ b/src/core/utils/GpuManager.h
@@ -1,7 +1,7 @@
 #ifndef DEME_GPU_MANAGER_H
 #define DEME_GPU_MANAGER_H
 
-#include <cuda_runtime_api.h>
+#include "cuda_to_hip.h"
 #include <vector>
 #include <mutex>
 
diff --git a/src/core/utils/JitHelper.cpp b/src/core/utils/JitHelper.cpp
index 9779b8a1..ff0a4706 100644
--- a/src/core/utils/JitHelper.cpp
+++ b/src/core/utils/JitHelper.cpp
@@ -12,21 +12,30 @@
 #include <algorithm>
 #include <cstdlib>
 #include <stdexcept>
-
-#include <cuda_runtime_api.h>
-
-// Compile-time default CUDA architecture fallback.
-// Can be overridden at build time via -DDEME_DEFAULT_CUDA_ARCH_STR="compute_XY".
-// At runtime, the environment variable DEME_DEFAULT_CUDA_ARCH takes precedence.
+#include <cstring>
+
+#include "cuda_to_hip.h"
+
+// Compile-time default architecture fallback.
+// Can be overridden at build time via -DDEME_DEFAULT_CUDA_ARCH_STR="compute_XY" (CUDA)
+// or -DDEME_DEFAULT_HIP_ARCH_STR="gfx90a" (HIP).
+// At runtime, the environment variable DEME_DEFAULT_CUDA_ARCH (CUDA) or
+// DEME_DEFAULT_HIP_ARCH (HIP) takes precedence.
+#if defined(USE_HIP)
+#ifndef DEME_DEFAULT_HIP_ARCH_STR
+    #define DEME_DEFAULT_HIP_ARCH_STR "gfx90a"
+#endif
+#else
 #ifndef DEME_DEFAULT_CUDA_ARCH_STR
     #define DEME_DEFAULT_CUDA_ARCH_STR "compute_75"
 #endif
+#endif
 
 #include <core/ApiVersion.h>
 #include "RuntimeData.h"
 #include "JitHelper.h"
 
-jitify::JitCache* JitHelper::kcache = nullptr;
+deme::jit::ProgramCache* JitHelper::kcache = nullptr;
 
 const std::filesystem::path JitHelper::KERNEL_DIR = DEMERuntimeDataHelper::data_path / "kernel";
 const std::filesystem::path JitHelper::KERNEL_INCLUDE_DIR = DEMERuntimeDataHelper::include_path;
@@ -47,13 +56,12 @@ void JitHelper::Header::substitute(const std::string& symbol, const std::string&
     }
 }
 
-jitify::Program JitHelper::buildProgram(
+deme::jit::Program JitHelper::buildProgram(
     const std::string& name,
     const std::filesystem::path& source,
     std::unordered_map<std::string, std::string> substitutions,
-    // std::vector<JitHelper::Header> headers, // THIS PARAMETER PROBABLY WON'T EVER BE USED
     std::vector<std::string> flags) {
-    // Double ensure include paths for runtime headers + CUDA/CCCL (cuda::std)
+    // Double ensure include paths for runtime headers + CUDA/CCCL (cuda::std) or ROCm
     auto add_inc = [&](const std::filesystem::path& p) {
         if (p.empty())
             return;
@@ -68,15 +76,59 @@ jitify::Program JitHelper::buildProgram(
     // Project/runtime includes
     add_inc(KERNEL_INCLUDE_DIR);
 
+    // Also add the source tree for header includes (cuda_to_hip.h, etc.)
+    // KERNEL_INCLUDE_DIR is the build tree, but some headers live in src/
+    // The source tree is one level up from build, then into src/
+    std::error_code ec;
+    std::filesystem::path src_include = std::filesystem::canonical(KERNEL_INCLUDE_DIR / "..", ec);
+    if (!ec) {
+        add_inc(src_include / "src");
+    }
+
     // Common fallbacks
+#if defined(USE_HIP)
+    // Helper: find clang builtin headers by scanning lib/llvm/lib/clang/<version>/include
+    auto add_clang_builtins = [&](const std::filesystem::path& rocm_root) {
+        std::filesystem::path clang_base = rocm_root / "lib" / "llvm" / "lib" / "clang";
+        std::error_code scan_ec;
+        for (const auto& entry : std::filesystem::directory_iterator(clang_base, scan_ec)) {
+            if (entry.is_directory()) {
+                add_inc(entry.path() / "include");
+                break;  // use the first (and typically only) version dir
+            }
+        }
+    };
+
+    // ROCm include paths for hipRTC
+    if (const char* rocm_path = std::getenv("ROCM_PATH")) {
+        add_inc(std::filesystem::path(rocm_path) / "include");
+        add_inc(std::filesystem::path(rocm_path) / "include" / "hipcub");
+        add_inc(std::filesystem::path(rocm_path) / "include" / "rocprim");
+        // Clang builtin headers (stddef.h, etc.) for hiprtc -- scan for actual version dir
+        add_clang_builtins(std::filesystem::path(rocm_path));
+    }
+    add_inc("/opt/rocm/include");
+    add_inc("/opt/rocm/include/hipcub");
+    add_inc("/opt/rocm/include/rocprim");
+    // Clang builtin headers for hiprtc (needed for stddef.h, stdint.h, etc.)
+    add_clang_builtins(std::filesystem::path("/opt/rocm"));
+#else
     if (const char* cuda_home = std::getenv("CUDA_HOME")) {
         add_inc(std::filesystem::path(cuda_home) / "include");
         add_inc(std::filesystem::path(cuda_home) / "include" / "cccl");
     }
     add_inc("/usr/local/cuda/include");
     add_inc("/usr/local/cuda/include/cccl");
+#endif
 
+#if defined(USE_HIP)
+    // For hiprtc: start with hip_runtime.h to provide HIP types/functions in device code.
+    // The program name is passed separately to hiprtcCreateProgram.
+    std::string code = "#include <hip/hip_runtime.h>\n";
+#else
+    // For CUDA/jitify: the program name on the first line is a jitify requirement.
     std::string code = name + "\n";
+#endif
 
     code.append(JitHelper::loadSourceFile(source));
     // Apply the substitutions
@@ -84,64 +136,53 @@ jitify::Program JitHelper::buildProgram(
         code = std::regex_replace(code, std::regex(subst.first), subst.second);
     }
 
-    std::vector<std::string> header_code;
-    // THIS BLOCK IS ONLY NEEDED IF THE headers PARAMETER IS USED
-    /*
-    for (auto it = headers.begin(); it != headers.end(); it++) {
-        header_code.push_back(it->getSource());
-    }
-    */
-
     if (kcache == nullptr)
-        kcache = new jitify::JitCache();
+        kcache = new deme::jit::ProgramCache();
 
-    // Stage 1: Explicit architecture detection.
-    // Query the active CUDA device compute capability and build an -arch flag.
+    // Detect GPU architecture and add appropriate flags
+    std::string arch_flag;
     {
         int dev = 0;
         cudaDeviceProp prop;
         memset(&prop, 0, sizeof(prop));
-        if (cudaGetDevice(&dev) == cudaSuccess && cudaGetDeviceProperties(&prop, dev) == cudaSuccess &&
-            prop.major > 0 && prop.minor >= 0) {
-            std::string detected_arch = "compute_" + std::to_string(prop.major) + std::to_string(prop.minor);
-            std::vector<std::string> arch_flags = flags;
-            arch_flags.push_back("-arch=" + detected_arch);
-            return kcache->program(code, header_code, arch_flags);
+        if (cudaGetDevice(&dev) == cudaSuccess && cudaGetDeviceProperties(&prop, dev) == cudaSuccess) {
+#if defined(USE_HIP)
+            // HIP: use gcnArchName (e.g., "gfx90a")
+            std::string detected_arch = prop.gcnArchName;
+            // Strip any suffix after the base arch name (e.g., "gfx90a:sramecc+:xnack-" -> "gfx90a")
+            size_t colon_pos = detected_arch.find(':');
+            if (colon_pos != std::string::npos) {
+                detected_arch = detected_arch.substr(0, colon_pos);
+            }
+            arch_flag = "--gpu-architecture=" + detected_arch;
+#else
+            if (prop.major > 0 && prop.minor >= 0) {
+                std::string detected_arch = "compute_" + std::to_string(prop.major) + std::to_string(prop.minor);
+                arch_flag = "-arch=" + detected_arch;
+            }
+#endif
         }
     }
 
-    // Stage 2: Jitify auto-detect.
-    // Device detection failed or returned an invalid compute capability; compile
-    // without an explicit -arch flag and let Jitify detect the architecture.
-    try {
-        return kcache->program(code, header_code, flags);
-    } catch (const std::exception& e) {
-        const std::string err_msg = e.what();
-        // Only fall through to Stage 3 when the failure is architecture-related.
-        const bool is_arch_error =
-            (err_msg.find("arch") != std::string::npos || err_msg.find("compute_") != std::string::npos ||
-             err_msg.find("sm_") != std::string::npos);
-        if (!is_arch_error) {
-            throw;
-        }
-
-        // Stage 3: Hardcoded default fallback.
-        // Use the DEME_DEFAULT_CUDA_ARCH environment variable if set; otherwise fall
-        // back to the compile-time constant DEME_DEFAULT_CUDA_ARCH_STR.
+    // If device detection failed, use fallback
+    if (arch_flag.empty()) {
+#if defined(USE_HIP)
+        const char* env_arch = std::getenv("DEME_DEFAULT_HIP_ARCH");
+        const std::string fallback_arch =
+            (env_arch != nullptr && env_arch[0] != '\0') ? std::string(env_arch) : DEME_DEFAULT_HIP_ARCH_STR;
+        arch_flag = "--gpu-architecture=" + fallback_arch;
+#else
         const char* env_arch = std::getenv("DEME_DEFAULT_CUDA_ARCH");
         const std::string fallback_arch =
             (env_arch != nullptr && env_arch[0] != '\0') ? std::string(env_arch) : DEME_DEFAULT_CUDA_ARCH_STR;
-        std::vector<std::string> fallback_flags = flags;
-        fallback_flags.push_back("-arch=" + fallback_arch);
-        try {
-            return kcache->program(code, header_code, fallback_flags);
-        } catch (const std::exception& e3) {
-            std::string ctx = "Jitify compilation failed with fallback arch '";
-            ctx += fallback_arch;
-            ctx += (env_arch != nullptr && env_arch[0] != '\0') ? "' (from DEME_DEFAULT_CUDA_ARCH env var): "
-                                                                : "' (compile-time default): ";
-            ctx += e3.what();
-            throw std::runtime_error(ctx);
-        }
+        arch_flag = "-arch=" + fallback_arch;
+#endif
     }
+
+    std::vector<std::string> final_flags = flags;
+    final_flags.push_back(arch_flag);
+
+    // Use the unified JitKernel abstraction - no headers needed for our usage
+    std::vector<std::pair<std::string, std::string>> headers;
+    return kcache->program(name, code, headers, final_flags);
 }
diff --git a/src/core/utils/JitHelper.h b/src/core/utils/JitHelper.h
index 638f23cd..c3dee0d8 100644
--- a/src/core/utils/JitHelper.h
+++ b/src/core/utils/JitHelper.h
@@ -7,11 +7,12 @@
 #define DEME_JIT_HELPER_H
 
 #include <filesystem>
+#include <fstream>
 #include <string>
 #include <vector>
 #include <unordered_map>
 
-#include <jitify/jitify.hpp>
+#include "JitKernel.h"
 
 #if defined(_WIN32) || defined(_WIN64)
     #undef max
@@ -31,24 +32,17 @@ class JitHelper {
         std::string _source;
     };
 
-    static jitify::Program buildProgram(
+    static deme::jit::Program buildProgram(
         const std::string& name,
         const std::filesystem::path& source,
         std::unordered_map<std::string, std::string> substitutions = std::unordered_map<std::string, std::string>(),
         std::vector<std::string> flags = std::vector<std::string>());
 
-    //// I'm pretty sure C++17 auto-converts this
-    // static jitify::Program buildProgram(
-    // 	const std::string& name, const std::string& code,
-    // 	std::vector<Header> headers = 0,
-    // 	std::vector<std::string> flags = 0
-    // );
-
     static const std::filesystem::path KERNEL_DIR;
     static const std::filesystem::path KERNEL_INCLUDE_DIR;
 
   private:
-    static jitify::JitCache* kcache;
+    static deme::jit::ProgramCache* kcache;
 
     inline static std::string loadSourceFile(const std::filesystem::path& sourcefile) {
         std::string code;
diff --git a/src/core/utils/JitKernel.h b/src/core/utils/JitKernel.h
new file mode 100644
index 00000000..39e99ae2
--- /dev/null
+++ b/src/core/utils/JitKernel.h
@@ -0,0 +1,189 @@
+//  Copyright (c) 2021, SBEL GPU Development Team
+//  Copyright (c) 2021, University of Wisconsin - Madison
+//  Copyright (c) 2026, Advanced Micro Devices, Inc.
+//
+//  SPDX-License-Identifier: BSD-3-Clause
+
+// Unified runtime kernel compilation API for DEM-Engine.
+// Abstracts over jitify v1 (CUDA/NVRTC) and hiprtc (ROCm), enabling both
+// platforms to use the same fluent kernel launch syntax.
+
+#ifndef DEME_JIT_KERNEL_H
+#define DEME_JIT_KERNEL_H
+
+#include <string>
+#include <vector>
+#include <memory>
+#include <cstddef>
+#include <utility>
+
+#ifdef USE_HIP
+#include <hip/hip_runtime.h>
+using gpuStream_t = hipStream_t;
+#else
+#include <cuda_runtime.h>
+using gpuStream_t = cudaStream_t;
+#endif
+
+namespace deme {
+namespace jit {
+
+class Kernel;
+class Program;
+
+// Fluent kernel launch builder - matches existing jitify v1 usage pattern.
+// Usage: program->kernel("name")().instantiate().configure(...).launch(args...)
+class KernelLauncher {
+  public:
+    // For kernels without template parameters
+    KernelLauncher& instantiate();
+
+    // For kernels with template parameters.
+    // type_names: comma-separated type names as they appear in the kernel template.
+    // On hiprtc, these get baked into the mangled kernel name.
+    KernelLauncher& instantiate(const std::string& type_names);
+
+    // Configure launch parameters
+    KernelLauncher& configure(dim3 grid, dim3 block, size_t shared_mem = 0, gpuStream_t stream = 0);
+
+    // Launch with arguments - variadic template forwards to backend
+    // The implementation is in the .inl file for HIP, or calls jitify for CUDA
+    template <typename... Args>
+    void launch(Args... args);
+
+    // Non-templated launch for backend implementations
+    void launchRaw(void** kernel_args, size_t num_args);
+
+    // Kernel can create KernelLaunchers
+    explicit KernelLauncher(Kernel* k);
+
+  private:
+    friend class Kernel;
+
+    Kernel* kernel_;
+    dim3 grid_{1, 1, 1};
+    dim3 block_{1, 1, 1};
+    size_t shared_mem_ = 0;
+    gpuStream_t stream_ = 0;
+    std::string instantiation_;
+    bool instantiated_ = false;
+};
+
+// Handle to a kernel function within a compiled program.
+// Created by Program::kernel(name).
+class Kernel {
+  public:
+    ~Kernel();
+    Kernel(Kernel&&) noexcept;
+    Kernel& operator=(Kernel&&) noexcept;
+
+    // Deleted copy operations - kernel handles are move-only
+    Kernel(const Kernel&) = delete;
+    Kernel& operator=(const Kernel&) = delete;
+
+    // Start a launch chain. Returns a KernelLauncher for fluent chaining.
+    KernelLauncher operator()();
+
+    // Direct instantiate() method to match jitify v1 API.
+    // Usage: program->kernel("name").instantiate().configure(...).launch(...)
+    KernelLauncher& instantiate();
+    KernelLauncher& instantiate(const std::string& type_names);
+
+    // Get kernel name (for debugging/error messages)
+    const std::string& name() const;
+
+  private:
+    friend class Program;
+    friend class KernelLauncher;
+    Kernel();
+
+    struct Impl;
+    std::unique_ptr<Impl> impl_;
+    std::unique_ptr<KernelLauncher> launcher_;  // For instantiate() return reference
+};
+
+// A compiled GPU program containing one or more kernels.
+// Created by ProgramCache::program().
+class Program {
+  public:
+    ~Program();
+    Program(Program&&) noexcept;
+    Program& operator=(Program&&) noexcept;
+
+    // Deleted copy operations - programs are move-only
+    Program(const Program&) = delete;
+    Program& operator=(const Program&) = delete;
+
+    // Get a kernel by name. The kernel can then be launched via:
+    // program->kernel("name")().instantiate().configure(...).launch(...)
+    Kernel kernel(const std::string& name);
+
+    // Check if program is valid (compiled successfully)
+    explicit operator bool() const;
+
+  private:
+    friend class ProgramCache;
+    Program();
+
+    struct Impl;
+    std::unique_ptr<Impl> impl_;
+};
+
+// Caches compiled programs to avoid recompilation.
+// Thread-safe: multiple threads can call program() concurrently.
+class ProgramCache {
+  public:
+    ProgramCache();
+    ~ProgramCache();
+
+    // Compile a program.
+    // - name: identifier for the program (typically the kernel file name)
+    // - source: kernel source code (prepended with name for jitify compatibility)
+    // - headers: additional header code as (name, source) pairs
+    // - flags: compiler flags (-I, -D, -arch, etc.)
+    //
+    // Returns a new Program by value. Caching is not performed at this level;
+    // callers typically cache via shared_ptr<Program> at the application level.
+    Program program(const std::string& name,
+                    const std::string& source,
+                    const std::vector<std::pair<std::string, std::string>>& headers = {},
+                    const std::vector<std::string>& flags = {});
+
+    // Clear all cached programs. Invalidates all Program references.
+    void clear();
+
+  private:
+    struct Impl;
+    std::unique_ptr<Impl> impl_;
+};
+
+// Helper to build compiler flags in a platform-agnostic way.
+class CompilerFlags {
+  public:
+    // Add an include path (-I)
+    CompilerFlags& include(const std::string& path);
+
+    // Add a macro definition (-D)
+    CompilerFlags& define(const std::string& macro, const std::string& value = "");
+
+    // Set target architecture (compute_75 for CUDA, gfx90a for HIP)
+    // The flag format is handled automatically per platform.
+    CompilerFlags& arch(const std::string& arch);
+
+    // Add a raw flag (passed through unchanged)
+    CompilerFlags& flag(const std::string& raw_flag);
+
+    // Build the final flags vector
+    std::vector<std::string> build() const;
+
+  private:
+    std::vector<std::string> flags_;
+};
+
+}  // namespace jit
+}  // namespace deme
+
+// Include the template implementation (must be visible at all call sites)
+#include "JitKernel.inl"
+
+#endif  // DEME_JIT_KERNEL_H
diff --git a/src/core/utils/JitKernel.inl b/src/core/utils/JitKernel.inl
new file mode 100644
index 00000000..6984dd94
--- /dev/null
+++ b/src/core/utils/JitKernel.inl
@@ -0,0 +1,64 @@
+//  Copyright (c) 2021, SBEL GPU Development Team
+//  Copyright (c) 2021, University of Wisconsin - Madison
+//  Copyright (c) 2026, Advanced Micro Devices, Inc.
+//
+//  SPDX-License-Identifier: BSD-3-Clause
+
+// Template implementation for JitKernel.h
+// This file is included at the end of JitKernel.h
+
+#ifndef DEME_JIT_KERNEL_INL
+#define DEME_JIT_KERNEL_INL
+
+#include <cstring>
+#include <vector>
+#include <memory>
+
+namespace deme {
+namespace jit {
+
+namespace detail {
+
+// Helper to pack a single argument by copying to a buffer
+template <typename T>
+inline void packArg(std::vector<void*>& args, std::vector<std::unique_ptr<char[]>>& storage, T arg) {
+    auto buf = std::make_unique<char[]>(sizeof(T));
+    std::memcpy(buf.get(), &arg, sizeof(T));
+    args.push_back(buf.get());
+    storage.push_back(std::move(buf));
+}
+
+// Recursive pack for variadic args
+template <typename T, typename... Rest>
+inline void packArgs(std::vector<void*>& args, std::vector<std::unique_ptr<char[]>>& storage, T first, Rest... rest) {
+    packArg(args, storage, first);
+    if constexpr (sizeof...(rest) > 0) {
+        packArgs(args, storage, rest...);
+    }
+}
+
+// Base case: no args
+inline void packArgs(std::vector<void*>&, std::vector<std::unique_ptr<char[]>>&) {}
+
+}  // namespace detail
+
+template <typename... Args>
+void KernelLauncher::launch(Args... args) {
+    // Pack each argument into a heap buffer and collect pointers to them, then
+    // forward to the backend's non-template raw launcher (hipModuleLaunchKernel
+    // on ROCm, jitify's cuLaunchKernel wrapper on CUDA). Both runtimes take the
+    // same "array of pointers to argument values" convention.
+    std::vector<void*> kernel_args;
+    std::vector<std::unique_ptr<char[]>> arg_storage;
+
+    if constexpr (sizeof...(args) > 0) {
+        detail::packArgs(kernel_args, arg_storage, args...);
+    }
+
+    launchRaw(kernel_args.empty() ? nullptr : kernel_args.data(), kernel_args.size());
+}
+
+}  // namespace jit
+}  // namespace deme
+
+#endif  // DEME_JIT_KERNEL_INL
diff --git a/src/core/utils/JitKernel_cuda.cpp b/src/core/utils/JitKernel_cuda.cpp
new file mode 100644
index 00000000..c55126e7
--- /dev/null
+++ b/src/core/utils/JitKernel_cuda.cpp
@@ -0,0 +1,205 @@
+//  Copyright (c) 2021, SBEL GPU Development Team
+//  Copyright (c) 2021, University of Wisconsin - Madison
+//  Copyright (c) 2026, Advanced Micro Devices, Inc.
+//
+//  SPDX-License-Identifier: BSD-3-Clause
+
+// CUDA backend for JitKernel using jitify v1 (NVRTC)
+
+#ifndef USE_HIP
+
+#include "JitKernel.h"
+
+// Jitify is in the build tree at ${CMAKE_BINARY_DIR}/src/jitify/
+// Include path is added by CMake target_include_directories
+#include <jitify/jitify.hpp>
+#include <unordered_map>
+#include <stdexcept>
+#include <mutex>
+
+namespace deme {
+namespace jit {
+
+// ============================================================================
+// Kernel implementation
+// ============================================================================
+
+struct Kernel::Impl {
+    std::string name;
+    jitify::Program* program = nullptr;  // Non-owning reference to parent program
+    jitify::KernelInstantiation instantiation;
+    bool has_instantiation = false;
+};
+
+Kernel::Kernel() : impl_(std::make_unique<Impl>()) {}
+Kernel::~Kernel() = default;
+Kernel::Kernel(Kernel&&) noexcept = default;
+Kernel& Kernel::operator=(Kernel&&) noexcept = default;
+
+KernelLauncher Kernel::operator()() {
+    return KernelLauncher(this);
+}
+
+KernelLauncher& Kernel::instantiate() {
+    launcher_ = std::make_unique<KernelLauncher>(this);
+    return launcher_->instantiate();
+}
+
+KernelLauncher& Kernel::instantiate(const std::string& type_names) {
+    launcher_ = std::make_unique<KernelLauncher>(this);
+    return launcher_->instantiate(type_names);
+}
+
+const std::string& Kernel::name() const {
+    return impl_->name;
+}
+
+// ============================================================================
+// KernelLauncher implementation
+// ============================================================================
+
+KernelLauncher::KernelLauncher(Kernel* k) : kernel_(k) {}
+
+KernelLauncher& KernelLauncher::instantiate() {
+    kernel_->impl_->instantiation = kernel_->impl_->program->kernel(kernel_->impl_->name).instantiate();
+    kernel_->impl_->has_instantiation = true;
+    instantiated_ = true;
+    return *this;
+}
+
+KernelLauncher& KernelLauncher::instantiate(const std::string& type_names) {
+    // Parse comma-separated type names and instantiate with them.
+    // Jitify uses variadic templates for instantiation, but we can pass
+    // the type string directly to the kernel() call via name mangling.
+    // For CUDA/jitify, we use the instantiate(type_name) overload.
+    kernel_->impl_->instantiation = kernel_->impl_->program->kernel(kernel_->impl_->name).instantiate({type_names});
+    kernel_->impl_->has_instantiation = true;
+    instantiated_ = true;
+    instantiation_ = type_names;
+    return *this;
+}
+
+KernelLauncher& KernelLauncher::configure(dim3 grid, dim3 block, size_t shared_mem, gpuStream_t stream) {
+    grid_ = grid;
+    block_ = block;
+    shared_mem_ = shared_mem;
+    stream_ = stream;
+    return *this;
+}
+
+// ============================================================================
+// Program implementation
+// ============================================================================
+
+struct Program::Impl {
+    jitify::Program jit_program;
+};
+
+Program::Program() : impl_(std::make_unique<Impl>()) {}
+Program::~Program() = default;
+Program::Program(Program&&) noexcept = default;
+Program& Program::operator=(Program&&) noexcept = default;
+
+Program::operator bool() const {
+    return impl_ != nullptr;
+}
+
+Kernel Program::kernel(const std::string& name) {
+    Kernel k;
+    k.impl_->name = name;
+    k.impl_->program = &impl_->jit_program;
+    return k;
+}
+
+// ============================================================================
+// ProgramCache implementation
+// ============================================================================
+
+struct ProgramCache::Impl {
+    jitify::JitCache cache;
+};
+
+ProgramCache::ProgramCache() : impl_(std::make_unique<Impl>()) {}
+ProgramCache::~ProgramCache() = default;
+
+Program ProgramCache::program(const std::string& name,
+                              const std::string& source,
+                              const std::vector<std::pair<std::string, std::string>>& headers,
+                              const std::vector<std::string>& flags) {
+    // jitify::JitCache handles caching internally, so we just wrap the result
+    (void)name;  // Name is embedded in source for jitify
+
+    // Extract header sources (jitify v1 takes just the source strings)
+    std::vector<std::string> header_sources;
+    for (const auto& h : headers) {
+        header_sources.push_back(h.second);
+    }
+
+    Program prog;
+    prog.impl_->jit_program = impl_->cache.program(source, header_sources, flags);
+    return prog;
+}
+
+void ProgramCache::clear() {
+    // No-op: jitify::JitCache doesn't expose a clear method
+}
+
+// ============================================================================
+// CompilerFlags implementation
+// ============================================================================
+
+CompilerFlags& CompilerFlags::include(const std::string& path) {
+    flags_.push_back("-I" + path);
+    return *this;
+}
+
+CompilerFlags& CompilerFlags::define(const std::string& macro, const std::string& value) {
+    if (value.empty()) {
+        flags_.push_back("-D" + macro);
+    } else {
+        flags_.push_back("-D" + macro + "=" + value);
+    }
+    return *this;
+}
+
+CompilerFlags& CompilerFlags::arch(const std::string& arch) {
+    flags_.push_back("-arch=" + arch);
+    return *this;
+}
+
+CompilerFlags& CompilerFlags::flag(const std::string& raw_flag) {
+    flags_.push_back(raw_flag);
+    return *this;
+}
+
+std::vector<std::string> CompilerFlags::build() const {
+    return flags_;
+}
+
+// ============================================================================
+// KernelLauncher::launchRaw() - launch via jitify's cuLaunchKernel wrapper
+// ============================================================================
+
+// The templated launch() (in JitKernel.inl) packs each argument into a buffer
+// and hands the resulting array of pointers here, identically to the HIP
+// backend. jitify's launch takes the same "vector of pointers to argument
+// values" convention, so we forward straight to it.
+void KernelLauncher::launchRaw(void** kernel_args, size_t num_args) {
+    if (!kernel_->impl_->has_instantiation) {
+        throw std::runtime_error("Kernel '" + kernel_->impl_->name + "' not instantiated before launch");
+    }
+
+    // Skip launch if any grid dimension is zero (no work); keeps parity with the
+    // HIP backend, where a zero grid is an error rather than a silent no-op.
+    if (grid_.x == 0 || grid_.y == 0 || grid_.z == 0) {
+        return;
+    }
+
+    std::vector<void*> arg_ptrs(kernel_args, kernel_args + num_args);
+    kernel_->impl_->instantiation.configure(grid_, block_, shared_mem_, stream_).safe_launch(arg_ptrs);
+}
+
+}  // namespace jit
+}  // namespace deme
+
+#endif  // !USE_HIP
diff --git a/src/core/utils/JitKernel_hip.cpp b/src/core/utils/JitKernel_hip.cpp
new file mode 100644
index 00000000..3dba0cb3
--- /dev/null
+++ b/src/core/utils/JitKernel_hip.cpp
@@ -0,0 +1,406 @@
+//  Copyright (c) 2021, SBEL GPU Development Team
+//  Copyright (c) 2021, University of Wisconsin - Madison
+//  Copyright (c) 2026, Advanced Micro Devices, Inc.
+//
+//  SPDX-License-Identifier: BSD-3-Clause
+
+// HIP backend for JitKernel using hiprtc.
+//
+// Key design: hiprtc requires all name expressions (kernel names to resolve)
+// be registered BEFORE compilation via hiprtcAddNameExpression(). Since the
+// DEM-Engine API calls buildProgram() first, then kernel().instantiate() later,
+// we use lazy compilation: defer hiprtcCompileProgram until the first kernel
+// launch, by which point all kernel+instantiation requests have been collected.
+
+#ifdef USE_HIP
+
+#include "JitKernel.h"
+#include <hip/hiprtc.h>
+#include <hip/hip_runtime.h>
+#include <unordered_map>
+#include <unordered_set>
+#include <stdexcept>
+#include <mutex>
+#include <vector>
+#include <cstring>
+#include <algorithm>
+
+namespace deme {
+namespace jit {
+
+namespace {
+
+void checkHiprtc(hiprtcResult result, const char* msg) {
+    if (result != HIPRTC_SUCCESS) {
+        throw std::runtime_error(std::string(msg) + ": " + hiprtcGetErrorString(result));
+    }
+}
+
+void checkHip(hipError_t result, const char* msg) {
+    if (result != hipSuccess) {
+        throw std::runtime_error(std::string(msg) + ": " + hipGetErrorString(result));
+    }
+}
+
+// Build the name expression string for hiprtcAddNameExpression.
+// For a templated kernel like "modifyComponents" instantiated with "deme::DEMDataKT",
+// the expression is "modifyComponents<deme::DEMDataKT>".
+std::string buildNameExpression(const std::string& name, const std::string& instantiation) {
+    if (instantiation.empty()) {
+        return name;
+    }
+    return name + "<" + instantiation + ">";
+}
+
+}  // namespace
+
+// Forward declaration of ProgramImpl so Kernel::Impl can reference it
+struct ProgramImpl {
+    // Source and compilation options (stored for lazy/deferred compilation)
+    std::string name;
+    std::string source;
+    std::vector<std::pair<std::string, std::string>> headers;
+    std::vector<std::string> flags;
+
+    // Kernel name expressions that have been requested
+    // Key: name expression (e.g., "modifyComponents<deme::DEMDataKT>")
+    // Value: lowered/mangled name after compilation
+    std::unordered_map<std::string, std::string> loweredNames;
+
+    // Name expressions pending addition (requested but not yet compiled)
+    std::unordered_set<std::string> pendingNameExpressions;
+
+    // Compiled module (nullptr until first compilation)
+    hipModule_t module = nullptr;
+
+    // Compiled code buffer (kept for potential recompilation)
+    std::vector<char> codeBuffer;
+
+    // Whether we need to recompile (new name expressions added since last compile)
+    bool needsRecompile = true;
+
+    std::mutex mutex;
+
+    ~ProgramImpl() {
+        if (module) {
+            (void)hipModuleUnload(module);
+        }
+    }
+
+    // Compile or recompile with all registered name expressions
+    void ensureCompiled() {
+        std::lock_guard<std::mutex> lock(mutex);
+
+        if (!needsRecompile && module != nullptr) {
+            return;  // Already compiled with current name expressions
+        }
+
+        // Unload previous module if any
+        if (module) {
+            (void)hipModuleUnload(module);
+            module = nullptr;
+        }
+
+        // Create hiprtc program
+        hiprtcProgram prog;
+        std::vector<const char*> header_names;
+        std::vector<const char*> header_sources;
+        for (const auto& h : headers) {
+            header_names.push_back(h.first.c_str());
+            header_sources.push_back(h.second.c_str());
+        }
+
+        checkHiprtc(hiprtcCreateProgram(&prog, source.c_str(), name.c_str(),
+                                        static_cast<int>(header_sources.size()),
+                                        header_sources.data(), header_names.data()),
+                    "hiprtcCreateProgram");
+
+        // Register all name expressions (pending ones plus any previously resolved)
+        std::vector<std::string> allNameExpressions;
+        for (const auto& pending : pendingNameExpressions) {
+            allNameExpressions.push_back(pending);
+        }
+        for (const auto& resolved : loweredNames) {
+            // Re-register previously resolved names (needed for recompilation)
+            if (pendingNameExpressions.find(resolved.first) == pendingNameExpressions.end()) {
+                allNameExpressions.push_back(resolved.first);
+            }
+        }
+
+        for (const auto& expr : allNameExpressions) {
+            hiprtcResult res = hiprtcAddNameExpression(prog, expr.c_str());
+            if (res != HIPRTC_SUCCESS) {
+                hiprtcDestroyProgram(&prog);
+                throw std::runtime_error("hiprtcAddNameExpression failed for '" + expr +
+                                         "': " + hiprtcGetErrorString(res));
+            }
+        }
+
+        // Convert flags to const char* array
+        std::vector<const char*> options;
+        for (const auto& f : flags) {
+            options.push_back(f.c_str());
+        }
+
+        // Compile
+        hiprtcResult compileResult = hiprtcCompileProgram(prog, static_cast<int>(options.size()), options.data());
+
+        if (compileResult != HIPRTC_SUCCESS) {
+            size_t logSize;
+            hiprtcGetProgramLogSize(prog, &logSize);
+            std::string log(logSize, '\0');
+            hiprtcGetProgramLog(prog, log.data());
+            hiprtcDestroyProgram(&prog);
+            throw std::runtime_error("hiprtc compilation failed for '" + name + "':\n" + log);
+        }
+
+        // Get lowered names for all expressions
+        for (const auto& expr : allNameExpressions) {
+            const char* lowered = nullptr;
+            hiprtcResult res = hiprtcGetLoweredName(prog, expr.c_str(), &lowered);
+            if (res != HIPRTC_SUCCESS) {
+                hiprtcDestroyProgram(&prog);
+                throw std::runtime_error("hiprtcGetLoweredName failed for '" + expr +
+                                         "': " + hiprtcGetErrorString(res));
+            }
+            loweredNames[expr] = std::string(lowered);
+        }
+
+        // Clear pending set - all are now resolved
+        pendingNameExpressions.clear();
+
+        // Get compiled code
+        size_t codeSize;
+        checkHiprtc(hiprtcGetCodeSize(prog, &codeSize), "hiprtcGetCodeSize");
+        codeBuffer.resize(codeSize);
+        checkHiprtc(hiprtcGetCode(prog, codeBuffer.data()), "hiprtcGetCode");
+
+        hiprtcDestroyProgram(&prog);
+
+        // Load module
+        checkHip(hipModuleLoadData(&module, codeBuffer.data()), "hipModuleLoadData");
+
+        needsRecompile = false;
+    }
+
+    // Register a name expression (kernel + optional instantiation)
+    // Returns the lowered name, triggering compilation if needed
+    std::string getLoweredName(const std::string& nameExpr) {
+        {
+            std::lock_guard<std::mutex> lock(mutex);
+
+            // Check if already resolved
+            auto it = loweredNames.find(nameExpr);
+            if (it != loweredNames.end() && !needsRecompile) {
+                return it->second;
+            }
+
+            // Add to pending if not already there
+            if (loweredNames.find(nameExpr) == loweredNames.end()) {
+                pendingNameExpressions.insert(nameExpr);
+                needsRecompile = true;
+            }
+        }
+
+        // Compile (releases and re-acquires lock internally)
+        ensureCompiled();
+
+        // Now look up the lowered name
+        std::lock_guard<std::mutex> lock(mutex);
+        auto it = loweredNames.find(nameExpr);
+        if (it == loweredNames.end()) {
+            throw std::runtime_error("Internal error: lowered name not found for '" + nameExpr + "'");
+        }
+        return it->second;
+    }
+};
+
+// ============================================================================
+// Kernel implementation
+// ============================================================================
+
+struct Kernel::Impl {
+    std::string name;
+    std::string instantiation;
+    hipFunction_t func = nullptr;
+    ProgramImpl* program = nullptr;  // Non-owning reference to parent program
+};
+
+Kernel::Kernel() : impl_(std::make_unique<Impl>()) {}
+Kernel::~Kernel() = default;
+Kernel::Kernel(Kernel&&) noexcept = default;
+Kernel& Kernel::operator=(Kernel&&) noexcept = default;
+
+KernelLauncher Kernel::operator()() {
+    return KernelLauncher(this);
+}
+
+KernelLauncher& Kernel::instantiate() {
+    launcher_ = std::make_unique<KernelLauncher>(this);
+    return launcher_->instantiate();
+}
+
+KernelLauncher& Kernel::instantiate(const std::string& type_names) {
+    launcher_ = std::make_unique<KernelLauncher>(this);
+    return launcher_->instantiate(type_names);
+}
+
+const std::string& Kernel::name() const {
+    return impl_->name;
+}
+
+// ============================================================================
+// KernelLauncher implementation
+// ============================================================================
+
+KernelLauncher::KernelLauncher(Kernel* k) : kernel_(k) {}
+
+KernelLauncher& KernelLauncher::instantiate() {
+    return instantiate("");
+}
+
+KernelLauncher& KernelLauncher::instantiate(const std::string& type_names) {
+    kernel_->impl_->instantiation = type_names;
+    instantiation_ = type_names;
+
+    if (kernel_->impl_->program) {
+        // Build the name expression and register it
+        std::string nameExpr = buildNameExpression(kernel_->impl_->name, type_names);
+
+        // Get the lowered name (this triggers compilation if needed)
+        std::string loweredName = kernel_->impl_->program->getLoweredName(nameExpr);
+
+        // Ensure module is compiled
+        kernel_->impl_->program->ensureCompiled();
+
+        // Now get the function handle
+        hipError_t err = hipModuleGetFunction(&kernel_->impl_->func,
+                                               kernel_->impl_->program->module,
+                                               loweredName.c_str());
+        if (err != hipSuccess) {
+            throw std::runtime_error("hipModuleGetFunction failed for '" + nameExpr +
+                                     "' (lowered: " + loweredName + "): " + hipGetErrorString(err));
+        }
+    }
+    instantiated_ = true;
+    return *this;
+}
+
+KernelLauncher& KernelLauncher::configure(dim3 grid, dim3 block, size_t shared_mem, gpuStream_t stream) {
+    grid_ = grid;
+    block_ = block;
+    shared_mem_ = shared_mem;
+    stream_ = stream;
+    return *this;
+}
+
+// ============================================================================
+// Program implementation - wraps ProgramImpl
+// ============================================================================
+
+struct Program::Impl : public ProgramImpl {};
+
+Program::Program() : impl_(std::make_unique<Impl>()) {}
+Program::~Program() = default;
+Program::Program(Program&&) noexcept = default;
+Program& Program::operator=(Program&&) noexcept = default;
+
+Program::operator bool() const {
+    return impl_ != nullptr;
+}
+
+Kernel Program::kernel(const std::string& name) {
+    Kernel k;
+    k.impl_->name = name;
+    k.impl_->program = impl_.get();
+    return k;
+}
+
+// ============================================================================
+// ProgramCache implementation
+// ============================================================================
+
+struct ProgramCache::Impl {
+    // No caching at this level; callers cache via shared_ptr<Program>
+};
+
+ProgramCache::ProgramCache() : impl_(std::make_unique<Impl>()) {}
+ProgramCache::~ProgramCache() = default;
+
+Program ProgramCache::program(const std::string& name,
+                              const std::string& source,
+                              const std::vector<std::pair<std::string, std::string>>& headers,
+                              const std::vector<std::string>& flags) {
+    // Create a new Program with deferred compilation (lazy compilation on first kernel use)
+    Program result;
+    result.impl_->name = name;
+    result.impl_->source = source;
+    result.impl_->headers = headers;
+    result.impl_->flags = flags;
+    result.impl_->needsRecompile = true;
+    return result;
+}
+
+void ProgramCache::clear() {
+    // No-op: no caching at this level
+}
+
+// ============================================================================
+// CompilerFlags implementation
+// ============================================================================
+
+CompilerFlags& CompilerFlags::include(const std::string& path) {
+    flags_.push_back("-I" + path);
+    return *this;
+}
+
+CompilerFlags& CompilerFlags::define(const std::string& macro, const std::string& value) {
+    if (value.empty()) {
+        flags_.push_back("-D" + macro);
+    } else {
+        flags_.push_back("-D" + macro + "=" + value);
+    }
+    return *this;
+}
+
+CompilerFlags& CompilerFlags::arch(const std::string& arch) {
+    flags_.push_back("--gpu-architecture=" + arch);
+    return *this;
+}
+
+CompilerFlags& CompilerFlags::flag(const std::string& raw_flag) {
+    flags_.push_back(raw_flag);
+    return *this;
+}
+
+std::vector<std::string> CompilerFlags::build() const {
+    return flags_;
+}
+
+// ============================================================================
+// KernelLauncher::launchRaw() - non-templated launch for HIP
+// ============================================================================
+
+void KernelLauncher::launchRaw(void** kernel_args, size_t num_args) {
+    if (!kernel_->impl_->func) {
+        throw std::runtime_error("Kernel function '" + kernel_->impl_->name + "' not resolved");
+    }
+
+    // Skip launch if grid has zero dimension (no work to do).
+    // CUDA may silently accept grid=0, but HIP returns hipErrorInvalidValue.
+    if (grid_.x == 0 || grid_.y == 0 || grid_.z == 0) {
+        return;
+    }
+
+    hipError_t err =
+        hipModuleLaunchKernel(kernel_->impl_->func, grid_.x, grid_.y, grid_.z, block_.x, block_.y, block_.z,
+                              shared_mem_, stream_, kernel_args, nullptr);
+
+    checkHip(err, ("hipModuleLaunchKernel failed for '" + kernel_->impl_->name + "'").c_str());
+    (void)num_args;
+}
+
+}  // namespace jit
+}  // namespace deme
+
+#endif  // USE_HIP
diff --git a/src/core/utils/ManagedMemory.hpp b/src/core/utils/ManagedMemory.hpp
index b219dcc4..71bed2c7 100644
--- a/src/core/utils/ManagedMemory.hpp
+++ b/src/core/utils/ManagedMemory.hpp
@@ -13,7 +13,7 @@
 // Convenience functions to help with Managed Memory (allocated using ManagedAllocator, typically)
 namespace deme {
 
-#if CUDART_VERSION >= 13000
+#if !defined(USE_HIP) && CUDART_VERSION >= 13000
 inline cudaMemLocation make_device_location(int device) {
     cudaMemLocation loc{};
     loc.type = cudaMemLocationTypeDevice;
@@ -25,7 +25,10 @@ inline cudaMemLocation make_device_location(int device) {
 // Underlying implementation
 template <class T>
 inline void __migrate_impl(T* data, std::size_t size, int device, cudaStream_t stream = 0) {
-#if CUDART_VERSION >= 13000
+#if defined(USE_HIP)
+    // HIP uses the 4-arg form: hipMemPrefetchAsync(const void*, size_t, int dstDevice, hipStream_t)
+    cudaMemPrefetchAsync(static_cast<void*>(data), size * sizeof(T), device, stream);
+#elif CUDART_VERSION >= 13000
     // CUDA 13.0+: cudaMemPrefetchAsync(const void*, size_t, cudaMemLocation, unsigned int flags, cudaStream_t)
     auto loc = make_device_location(device);
     unsigned int flags = 0;
@@ -74,18 +77,30 @@ void migrate(const std::vector<T>& data, int device, cudaStream_t stream = 0) {
 
 // Aliases for cudaMemoryAdvise constants
 enum class ManagedAdvice {
+#if defined(USE_HIP)
+    READ_MOSTLY = hipMemAdviseSetReadMostly,
+    PREFERRED_LOC = hipMemAdviseSetPreferredLocation,
+    ACCESSED_BY = hipMemAdviseSetAccessedBy,
+    UNSET_READ_MOSTLY = hipMemAdviseUnsetReadMostly,
+    UNSET_PREFERRED_LOC = hipMemAdviseUnsetPreferredLocation,
+    UNSET_ACCESSED_BY = hipMemAdviseUnsetAccessedBy
+#else
     READ_MOSTLY = cudaMemAdviseSetReadMostly,
     PREFERRED_LOC = cudaMemAdviseSetPreferredLocation,
     ACCESSED_BY = cudaMemAdviseSetAccessedBy,
     UNSET_READ_MOSTLY = cudaMemAdviseUnsetReadMostly,
     UNSET_PREFERRED_LOC = cudaMemAdviseUnsetPreferredLocation,
     UNSET_ACCESSED_BY = cudaMemAdviseUnsetAccessedBy
+#endif
 };
 
 // Underlying implementation
 template <class T>
 void __advise_impl(const T* data, std::size_t size, ManagedAdvice advice, int device) {
-#if CUDART_VERSION >= 13000
+#if defined(USE_HIP)
+    // HIP uses the 4-arg form: hipMemAdvise(const void*, size_t, hipMemoryAdvise, int)
+    hipMemAdvise(static_cast<const void*>(data), size * sizeof(T), static_cast<hipMemoryAdvise>(advice), device);
+#elif CUDART_VERSION >= 13000
     cudaMemLocation loc{};
     loc.type = cudaMemLocationTypeDevice;
     loc.id = device;
diff --git a/src/core/utils/cuda_to_hip.h b/src/core/utils/cuda_to_hip.h
new file mode 100644
index 00000000..8c3c314c
--- /dev/null
+++ b/src/core/utils/cuda_to_hip.h
@@ -0,0 +1,123 @@
+//  Copyright (c) 2021, SBEL GPU Development Team
+//  Copyright (c) 2021, University of Wisconsin - Madison
+//  Copyright (c) 2026, Advanced Micro Devices, Inc.
+//
+//  SPDX-License-Identifier: BSD-3-Clause
+
+// CUDA-to-HIP compatibility header for DEM-Engine.
+// On ROCm this aliases CUDA runtime API symbols to their HIP equivalents.
+// On CUDA this is a passthrough to the standard CUDA runtime.
+
+#ifndef DEME_CUDA_TO_HIP_H
+#define DEME_CUDA_TO_HIP_H
+
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+
+#include <hip/hip_runtime.h>
+
+// Runtime API types
+#define cudaStream_t hipStream_t
+#define cudaEvent_t hipEvent_t
+#define cudaError_t hipError_t
+#define cudaDeviceProp hipDeviceProp_t
+
+// Error codes
+#define cudaSuccess hipSuccess
+#define cudaErrorInvalidValue hipErrorInvalidValue
+#define cudaErrorMemoryAllocation hipErrorMemoryAllocation
+#define cudaErrorNotSupported hipErrorNotSupported
+
+// Memory management
+#define cudaMalloc hipMalloc
+#define cudaFree hipFree
+#define cudaMallocManaged hipMallocManaged
+#define cudaMallocHost hipHostMalloc
+#define cudaHostAlloc hipHostMalloc
+#define cudaFreeHost hipHostFree
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemset hipMemset
+#define cudaMemsetAsync hipMemsetAsync
+#define cudaMemPrefetchAsync hipMemPrefetchAsync
+
+// Memory copy kinds
+#define cudaMemcpyHostToHost hipMemcpyHostToHost
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemcpyDefault hipMemcpyDefault
+
+// Memory attach flags
+#define cudaMemAttachGlobal hipMemAttachGlobal
+
+// Host alloc flags
+#define cudaHostAllocDefault hipHostMallocDefault
+
+// Stream management
+#define cudaStreamCreate hipStreamCreate
+#define cudaStreamDestroy hipStreamDestroy
+#define cudaStreamSynchronize hipStreamSynchronize
+
+// Device management
+#define cudaGetDevice hipGetDevice
+#define cudaSetDevice hipSetDevice
+#define cudaGetDeviceCount hipGetDeviceCount
+#define cudaGetDeviceProperties hipGetDeviceProperties
+#define cudaDeviceSynchronize hipDeviceSynchronize
+#define cudaDeviceReset hipDeviceReset
+
+// Error handling
+#define cudaGetLastError hipGetLastError
+#define cudaGetErrorString hipGetErrorString
+#define cudaPeekAtLastError hipPeekAtLastError
+
+// Pointer attributes
+#define cudaPointerAttributes hipPointerAttribute_t
+#define cudaPointerGetAttributes hipPointerGetAttributes
+
+// Memory types - use a namespace alias to preserve scoped access syntax.
+// In hiprtc JIT kernels, hipMemoryType may not be available (device-only runtime).
+// Provide a minimal fallback.
+#ifdef __HIPCC_RTC__
+    // hiprtc device-only context: define our own enum for compatibility
+    enum hipMemoryType { hipMemoryTypeUnregistered = 0, hipMemoryTypeHost = 1,
+                         hipMemoryTypeDevice = 2, hipMemoryTypeManaged = 3 };
+#endif
+
+namespace cudaMemoryType {
+    constexpr hipMemoryType cudaMemoryTypeUnregistered = hipMemoryTypeUnregistered;
+    constexpr hipMemoryType cudaMemoryTypeHost = hipMemoryTypeHost;
+    constexpr hipMemoryType cudaMemoryTypeDevice = hipMemoryTypeDevice;
+    constexpr hipMemoryType cudaMemoryTypeManaged = hipMemoryTypeManaged;
+}
+
+// Memory advise
+#define cudaMemAdvise hipMemAdvise
+#define cudaMemoryAdvise hipMemoryAdvise
+
+// CUB -> hipCUB
+#define cub hipcub
+
+// cuRAND device headers -> hipRAND (not needed in this shim, handled in API.h)
+
+// CUDA floating-point intrinsics with directed rounding.
+// HIP/AMD does not provide exact equivalents; use standard operations.
+// The _ru suffix means "round up" (toward +infinity), _rd means "round down",
+// _rn means "round to nearest", _rz means "round toward zero".
+// For most physics simulations, the difference is negligible.
+#if defined(__HIP_DEVICE_COMPILE__) || defined(__HIPCC__)
+    __device__ inline float __frcp_ru(float x) { return 1.0f / x; }
+    __device__ inline double __drcp_ru(double x) { return 1.0 / x; }
+    __device__ inline float __fmul_ru(float x, float y) { return x * y; }
+    __device__ inline double __dmul_ru(double x, double y) { return x * y; }
+    __device__ inline float __fadd_ru(float x, float y) { return x + y; }
+    __device__ inline double __dadd_ru(double x, double y) { return x + y; }
+#endif
+
+#else  // CUDA
+
+#include <cuda_runtime.h>
+
+#endif  // USE_HIP
+
+#endif  // DEME_CUDA_TO_HIP_H
diff --git a/src/kernel/CUDAMathHelpers.cuh b/src/kernel/CUDAMathHelpers.cuh
index bb523c3d..167bc53d 100644
--- a/src/kernel/CUDAMathHelpers.cuh
+++ b/src/kernel/CUDAMathHelpers.cuh
@@ -45,7 +45,7 @@
     #undef strtok_r
 #endif
 
-#include "cuda_runtime.h"
+#include <core/utils/cuda_to_hip.h>
 
 #ifndef EXIT_WAIVED
     #define EXIT_WAIVED 2
@@ -54,7 +54,7 @@
 using uint = unsigned int;
 using ushort = unsigned short;
 
-#ifndef __CUDACC__
+#if !defined(__CUDACC__) && !defined(__HIPCC_RTC__) && !defined(__HIP_DEVICE_COMPILE__)
     ////////////////////////////////////////////////////////////////////////////////
     // override implementations of CUDA functions
     ////////////////////////////////////////////////////////////////////////////////
@@ -241,8 +241,12 @@ inline __host__ __device__ int4 operator-(int4& a) {
 
 ////////////////////////////////////////////////////////////////////////////////
 // addition
+// Note: HIP provides operator+, operator+= for HIP_vector_type in
+// amd_hip_vector_types.h, so we guard vector-vector operators to avoid
+// ambiguity. Scalar versions may still be needed.
 ////////////////////////////////////////////////////////////////////////////////
 
+#ifndef __HIP_PLATFORM_AMD__
 inline __host__ __device__ float2 operator+(float2 a, float2 b) {
     return make_float2(a.x + b.x, a.y + b.y);
 }
@@ -250,6 +254,7 @@ inline __host__ __device__ void operator+=(float2& a, float2 b) {
     a.x += b.x;
     a.y += b.y;
 }
+#endif
 inline __host__ __device__ float2 operator+(float2 a, float b) {
     return make_float2(a.x + b, a.y + b);
 }
@@ -297,6 +302,7 @@ inline __host__ __device__ void operator+=(uint2& a, uint b) {
     a.y += b;
 }
 
+#ifndef __HIP_PLATFORM_AMD__
 inline __host__ __device__ float3 operator+(float3 a, float3 b) {
     return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
 }
@@ -305,6 +311,7 @@ inline __host__ __device__ void operator+=(float3& a, float3 b) {
     a.y += b.y;
     a.z += b.z;
 }
+#endif
 inline __host__ __device__ float3 operator+(float3 a, float b) {
     return make_float3(a.x + b, a.y + b, a.z + b);
 }
@@ -358,6 +365,7 @@ inline __host__ __device__ float3 operator+(float b, float3 a) {
     return make_float3(a.x + b, a.y + b, a.z + b);
 }
 
+#ifndef __HIP_PLATFORM_AMD__
 inline __host__ __device__ float4 operator+(float4 a, float4 b) {
     return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
 }
@@ -367,6 +375,7 @@ inline __host__ __device__ void operator+=(float4& a, float4 b) {
     a.z += b.z;
     a.w += b.w;
 }
+#endif
 inline __host__ __device__ float4 operator+(float4 a, float b) {
     return make_float4(a.x + b, a.y + b, a.z + b, a.w + b);
 }
@@ -428,6 +437,7 @@ inline __host__ __device__ void operator+=(uint4& a, uint b) {
 // subtract
 ////////////////////////////////////////////////////////////////////////////////
 
+#ifndef __HIP_PLATFORM_AMD__
 inline __host__ __device__ float2 operator-(float2 a, float2 b) {
     return make_float2(a.x - b.x, a.y - b.y);
 }
@@ -435,6 +445,7 @@ inline __host__ __device__ void operator-=(float2& a, float2 b) {
     a.x -= b.x;
     a.y -= b.y;
 }
+#endif
 inline __host__ __device__ float2 operator-(float2 a, float b) {
     return make_float2(a.x - b, a.y - b);
 }
@@ -482,6 +493,7 @@ inline __host__ __device__ void operator-=(uint2& a, uint b) {
     a.y -= b;
 }
 
+#ifndef __HIP_PLATFORM_AMD__
 inline __host__ __device__ float3 operator-(float3 a, float3 b) {
     return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
 }
@@ -490,6 +502,7 @@ inline __host__ __device__ void operator-=(float3& a, float3 b) {
     a.y -= b.y;
     a.z -= b.z;
 }
+#endif
 inline __host__ __device__ float3 operator-(float3 a, float b) {
     return make_float3(a.x - b, a.y - b, a.z - b);
 }
@@ -542,6 +555,7 @@ inline __host__ __device__ void operator-=(uint3& a, uint b) {
     a.z -= b;
 }
 
+#ifndef __HIP_PLATFORM_AMD__
 inline __host__ __device__ float4 operator-(float4 a, float4 b) {
     return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
 }
@@ -551,6 +565,7 @@ inline __host__ __device__ void operator-=(float4& a, float4 b) {
     a.z -= b.z;
     a.w -= b.w;
 }
+#endif
 inline __host__ __device__ float4 operator-(float4 a, float b) {
     return make_float4(a.x - b, a.y - b, a.z - b, a.w - b);
 }
@@ -607,8 +622,11 @@ inline __host__ __device__ void operator-=(uint4& a, uint b) {
 
 ////////////////////////////////////////////////////////////////////////////////
 // multiply
+// Note: HIP provides operator*(floatN, floatN) in amd_hip_vector_types.h
+// so we guard the vector-to-vector operators to avoid ambiguity
 ////////////////////////////////////////////////////////////////////////////////
 
+#ifndef __HIP_PLATFORM_AMD__
 inline __host__ __device__ float2 operator*(float2 a, float2 b) {
     return make_float2(a.x * b.x, a.y * b.y);
 }
@@ -616,6 +634,7 @@ inline __host__ __device__ void operator*=(float2& a, float2 b) {
     a.x *= b.x;
     a.y *= b.y;
 }
+#endif
 inline __host__ __device__ float2 operator*(float2 a, float b) {
     return make_float2(a.x * b, a.y * b);
 }
@@ -663,6 +682,7 @@ inline __host__ __device__ void operator*=(uint2& a, uint b) {
     a.y *= b;
 }
 
+#ifndef __HIP_PLATFORM_AMD__
 inline __host__ __device__ float3 operator*(float3 a, float3 b) {
     return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
 }
@@ -671,6 +691,7 @@ inline __host__ __device__ void operator*=(float3& a, float3 b) {
     a.y *= b.y;
     a.z *= b.z;
 }
+#endif
 inline __host__ __device__ float3 operator*(float3 a, float b) {
     return make_float3(a.x * b, a.y * b, a.z * b);
 }
@@ -723,6 +744,7 @@ inline __host__ __device__ void operator*=(uint3& a, uint b) {
     a.z *= b;
 }
 
+#ifndef __HIP_PLATFORM_AMD__
 inline __host__ __device__ float4 operator*(float4 a, float4 b) {
     return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
 }
@@ -732,6 +754,7 @@ inline __host__ __device__ void operator*=(float4& a, float4 b) {
     a.z *= b.z;
     a.w *= b.w;
 }
+#endif
 inline __host__ __device__ float4 operator*(float4 a, float b) {
     return make_float4(a.x * b, a.y * b, a.z * b, a.w * b);
 }
@@ -791,8 +814,10 @@ inline __host__ __device__ void operator*=(uint4& a, uint b) {
 
 ////////////////////////////////////////////////////////////////////////////////
 // divide
+// Note: HIP provides operator/(floatN, floatN) in amd_hip_vector_types.h
 ////////////////////////////////////////////////////////////////////////////////
 
+#ifndef __HIP_PLATFORM_AMD__
 inline __host__ __device__ float2 operator/(float2 a, float2 b) {
     return make_float2(a.x / b.x, a.y / b.y);
 }
@@ -800,6 +825,7 @@ inline __host__ __device__ void operator/=(float2& a, float2 b) {
     a.x /= b.x;
     a.y /= b.y;
 }
+#endif
 inline __host__ __device__ float2 operator/(float2 a, float b) {
     return make_float2(a.x / b, a.y / b);
 }
@@ -811,6 +837,7 @@ inline __host__ __device__ float2 operator/(float b, float2 a) {
     return make_float2(b / a.x, b / a.y);
 }
 
+#ifndef __HIP_PLATFORM_AMD__
 inline __host__ __device__ float3 operator/(float3 a, float3 b) {
     return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
 }
@@ -819,6 +846,7 @@ inline __host__ __device__ void operator/=(float3& a, float3 b) {
     a.y /= b.y;
     a.z /= b.z;
 }
+#endif
 inline __host__ __device__ float3 operator/(float3 a, float b) {
     return make_float3(a.x / b, a.y / b, a.z / b);
 }
@@ -831,6 +859,7 @@ inline __host__ __device__ float3 operator/(float b, float3 a) {
     return make_float3(b / a.x, b / a.y, b / a.z);
 }
 
+#ifndef __HIP_PLATFORM_AMD__
 inline __host__ __device__ float4 operator/(float4 a, float4 b) {
     return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
 }
@@ -840,6 +869,7 @@ inline __host__ __device__ void operator/=(float4& a, float4 b) {
     a.z /= b.z;
     a.w /= b.w;
 }
+#endif
 inline __host__ __device__ float4 operator/(float4 a, float b) {
     return make_float4(a.x / b, a.y / b, a.z / b, a.w / b);
 }
@@ -1453,11 +1483,20 @@ inline __host__ __device__ T2 to_real3(const T1& a) {
 }
 
 // Cause an error inside a kernel
-#define DEME_ABORT_KERNEL(...) \
-    {                          \
-        printf(__VA_ARGS__);   \
-        __threadfence();       \
-        asm volatile("trap;"); \
-    }
+#if defined(__HIP_PLATFORM_AMD__) || defined(USE_HIP)
+    #define DEME_ABORT_KERNEL(...) \
+        {                          \
+            printf(__VA_ARGS__);   \
+            __threadfence();       \
+            __builtin_trap();      \
+        }
+#else
+    #define DEME_ABORT_KERNEL(...) \
+        {                          \
+            printf(__VA_ARGS__);   \
+            __threadfence();       \
+            asm volatile("trap;"); \
+        }
+#endif
 
 #endif