From 3bd9a6259c84efc7a9f5d2a530d0e1aac35d91a9 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier <joel.lamy-poirier@servicenow.com>
Date: Mon, 15 Jun 2026 17:03:05 -0400
Subject: [PATCH 1/4] Benchmark kernel GPU time via profiler instead of event
 wall-clock
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The CUDA-event window around each call measured the GPU-timeline span,
including idle bubbles from per-call allocation and (for autograd variants)
Python/engine launch overhead between kernels. Those bubbles are benchmarking
artifacts — in a real run the CPU stays ahead of the GPU and the caching
allocator serves buffers without syncs — and they vary by variant, so they
unfairly penalized the Python-autograd Triton path versus the eager C++ engine.
They also made backward timing (derived as fwd_bwd - fwd) swing ~190%
run-to-run on a fixed shape.

Sum per-kernel device self-time from torch.profiler instead. Only CUDA-device
entries are summed (custom-autograd CPU nodes carry their children's device
time and would double-count); the L2-flush fill kernel is identified from a
flush-only profile and excluded. The resulting measurement reproduces to <0.5%
run-to-run and matches isolated-kernel timing.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 tools/benchmark/triton_kernels/__main__.py |  3 ++
 tools/benchmark/triton_kernels/runner.py   | 62 +++++++++++++---------
 2 files changed, 40 insertions(+), 25 deletions(-)

diff --git a/tools/benchmark/triton_kernels/__main__.py b/tools/benchmark/triton_kernels/__main__.py
index e09b92976..950307f22 100644
--- a/tools/benchmark/triton_kernels/__main__.py
+++ b/tools/benchmark/triton_kernels/__main__.py
@@ -33,6 +33,9 @@
 # mutated inputs" when using max-autotune. The fallback is correct; suppress noise.
 warnings.filterwarnings("ignore", message=".*[Ss]kipping (cuda|CUDA)[Gg]raphs.*")
 logging.getLogger("torch._inductor.cudagraph_trees").setLevel(logging.ERROR)
+# The per-measurement profiler session emits a one-time note about event cycles; the
+# runner reads key_averages right after each session, so it doesn't apply here.
+warnings.filterwarnings("ignore", message=".*Profiler clears events.*")
 
 _BENCHMARKS = {
     "entropy_loss": bench_entropy_loss,
diff --git a/tools/benchmark/triton_kernels/runner.py b/tools/benchmark/triton_kernels/runner.py
index 9a00c0cd0..952cef0ef 100644
--- a/tools/benchmark/triton_kernels/runner.py
+++ b/tools/benchmark/triton_kernels/runner.py
@@ -4,18 +4,19 @@
 Each benchmark file defines a list of `Case` objects (input shape/dtype
 sweep) and a list of `Variant` objects (implementations to compare — e.g.
 pytorch eager, pytorch compiled, Triton). The runner invokes each variant
-on each case, measures timing (median + mean + percentiles via CUDA events),
-measures peak/final memory, and compares outputs against an fp32 reference
-using RMS error. Results are printed as a table per case.
+on each case, measures GPU kernel time (summed per-kernel device time via the
+profiler), measures peak/final memory, and compares outputs against an fp32
+reference using RMS error. Results are printed as a table per case.
 """
 
 import dataclasses
 import math
-import statistics
 import time
 import typing
 
 import torch
+from torch.autograd import DeviceType
+from torch.profiler import ProfilerActivity, profile
 
 from fast_llm.utils import header
 from tools.benchmark.triton_kernels.gpu_specs import GpuSpec, detect_gpu_spec
@@ -163,8 +164,15 @@ def bench_fn(
     """Benchmark `fn` — it should be a no-arg callable that invokes the kernel
     being timed (close over inputs). Returns timing statistics in ms.
 
-    Mirrors `triton.testing.do_bench` logic but returns raw per-rep list so we
-    can compute {median, mean, min, max, std} from one set of runs.
+    Reports GPU kernel time: the profiler's summed per-kernel device self-time,
+    not a wall-clock window around `fn`. A CUDA-event window also counts GPU-idle
+    bubbles from per-call allocation and (for autograd variants) Python/engine
+    launch overhead — benchmarking artifacts that don't occur in a real run where
+    the CPU runs ahead and the allocator serves cached buffers, and which vary
+    by variant (the eager C++ engine starves the GPU far less than a Python
+    autograd Function). Summing device self-time isolates the work the GPU
+    actually does. Device time is deterministic to <1% run-to-run, so the
+    per-rep mean is reported across all stat fields.
     """
     if not torch.cuda.is_available():
         # CPU / Triton interpret: single timed run with wall clock. min_reps,
@@ -217,28 +225,32 @@ def bench_fn(
 
     num_reps = max(min_reps, min(max_reps, int(rep_ms / one_rep_ms)))
 
-    start_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_reps)]
-    end_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_reps)]
-    for i in range(num_reps):
-        if reset is not None:
-            reset()
-        # The L2 flush is enqueued before start_events[i] on the same stream, so
-        # the timed window starts after the zero completes — only fn() is timed.
+    # The L2 flush is itself a device kernel (a fill); profile it alone to learn
+    # its kernel key so it can be excluded from the measured sum below.
+    with profile(activities=[ProfilerActivity.CUDA]) as flush_prof:
         flush_buffer.zero_()
-        start_events[i].record()
-        fn()
-        end_events[i].record()
-    torch.cuda.synchronize()
+        torch.cuda.synchronize()
+    flush_keys = {k.key for k in flush_prof.key_averages() if k.device_type == DeviceType.CUDA}
+
+    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
+        for _ in range(num_reps):
+            if reset is not None:
+                reset()
+            # Cold L2 between reps so each kernel re-reads from DRAM. Its fill
+            # kernel is excluded from the sum via `flush_keys`.
+            flush_buffer.zero_()
+            fn()
+        torch.cuda.synchronize()
 
-    times = [start_events[i].elapsed_time(end_events[i]) for i in range(num_reps)]
-    return TimingStats(
-        median_ms=statistics.median(times),
-        mean_ms=statistics.fmean(times),
-        min_ms=min(times),
-        max_ms=max(times),
-        std_ms=statistics.pstdev(times) if len(times) > 1 else 0.0,
-        num_reps=num_reps,
+    # Sum only CUDA-device entries: custom-autograd CPU nodes carry their
+    # children's device time and would double-count.
+    kernel_us = sum(
+        k.self_device_time_total
+        for k in prof.key_averages()
+        if k.device_type == DeviceType.CUDA and k.key not in flush_keys
     )
+    per_rep_ms = kernel_us / num_reps / 1000
+    return TimingStats(per_rep_ms, per_rep_ms, per_rep_ms, per_rep_ms, 0.0, num_reps)
 
 
 # --------------------------------------------------------------------------- memory

From 784a3daeb6e10b7a6222ed1d885c08eeccf01be1 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier <joel.lamy-poirier@servicenow.com>
Date: Mon, 15 Jun 2026 18:39:37 -0400
Subject: [PATCH 2/4] Profile each rep separately and build a real timing
 distribution
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Addresses review of the profiler timing change:

- Move the L2 flush outside each rep's profiler context. The flush kernel is no
  longer recorded, so the fragile key-based exclusion (which could silently drop
  a benchmarked kernel's own int32 fill/memset, e.g. sparse/MoE accumulators) is
  gone entirely.
- Profile each rep in its own CUDA-only session (drops the unused CPU activity);
  a single fn() call has no autograd CPU node, so no double-counting to filter.
- Cap profiled reps at a small fixed count instead of up to max_reps=10_000.
- Sum each rep's device self-time into a per-rep sample, then compute real
  median/mean/min/max/std — so verbose min/max/std reflect measured spread
  instead of identical values, and the harness demonstrates stability rather
  than asserting it.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 tools/benchmark/triton_kernels/runner.py | 74 ++++++++++++------------
 1 file changed, 38 insertions(+), 36 deletions(-)

diff --git a/tools/benchmark/triton_kernels/runner.py b/tools/benchmark/triton_kernels/runner.py
index 952cef0ef..3717f914a 100644
--- a/tools/benchmark/triton_kernels/runner.py
+++ b/tools/benchmark/triton_kernels/runner.py
@@ -11,6 +11,7 @@
 
 import dataclasses
 import math
+import statistics
 import time
 import typing
 
@@ -153,31 +154,34 @@ class VariantResult:
 # --------------------------------------------------------------------------- timing
 
 
+# Per-rep samples for the timing distribution. Device kernel time is stable, so a
+# small fixed count gives tight stats without profiling thousands of reps.
+_MAX_PROFILE_REPS = 50
+
+
 def bench_fn(
     fn: typing.Callable[[], typing.Any],
     reset: typing.Callable[[], None] | None = None,
     warmup_ms: float = 25.0,
     rep_ms: float = 100.0,
     min_reps: int = 5,
-    max_reps: int = 10_000,
 ) -> TimingStats:
     """Benchmark `fn` — it should be a no-arg callable that invokes the kernel
     being timed (close over inputs). Returns timing statistics in ms.
 
-    Reports GPU kernel time: the profiler's summed per-kernel device self-time,
-    not a wall-clock window around `fn`. A CUDA-event window also counts GPU-idle
-    bubbles from per-call allocation and (for autograd variants) Python/engine
-    launch overhead — benchmarking artifacts that don't occur in a real run where
-    the CPU runs ahead and the allocator serves cached buffers, and which vary
-    by variant (the eager C++ engine starves the GPU far less than a Python
-    autograd Function). Summing device self-time isolates the work the GPU
-    actually does. Device time is deterministic to <1% run-to-run, so the
-    per-rep mean is reported across all stat fields.
+    Reports GPU kernel time: each rep sums the profiler's per-kernel device
+    self-time, and stats are computed over the per-rep samples. A CUDA-event
+    window around `fn` instead counts GPU-idle bubbles from per-call allocation
+    and (for autograd variants) Python/engine launch overhead — benchmarking
+    artifacts that don't occur in a real run where the CPU runs ahead and the
+    allocator serves cached buffers, and which vary by variant (the eager C++
+    engine starves the GPU far less than a Python autograd Function). Summing
+    device self-time isolates the work the GPU actually does.
     """
     if not torch.cuda.is_available():
         # CPU / Triton interpret: single timed run with wall clock. min_reps,
-        # max_reps, warmup_ms, rep_ms are ignored — this path is for smoke
-        # testing kernel correctness, not measurement.
+        # warmup_ms, rep_ms are ignored — this path is for smoke testing kernel
+        # correctness, not measurement.
         if reset is not None:
             reset()
         fn()  # warmup
@@ -223,34 +227,32 @@ def bench_fn(
     torch.cuda.synchronize()
     one_rep_ms = max(post_start.elapsed_time(post_end), 0.001)
 
-    num_reps = max(min_reps, min(max_reps, int(rep_ms / one_rep_ms)))
+    num_reps = max(min_reps, min(_MAX_PROFILE_REPS, int(rep_ms / one_rep_ms)))
 
-    # The L2 flush is itself a device kernel (a fill); profile it alone to learn
-    # its kernel key so it can be excluded from the measured sum below.
-    with profile(activities=[ProfilerActivity.CUDA]) as flush_prof:
+    # Profile each rep separately and sum its kernels' device self-time, building a
+    # per-rep sample distribution. The L2 flush stays outside the profiled region so
+    # its fill kernel isn't measured; it still runs before `fn` on the same stream,
+    # so each kernel reads cold. Only CUDA-device entries contribute — runtime/launch
+    # entries carry no device time, and a single fn() call has no autograd CPU node.
+    samples_ms: list[float] = []
+    for _ in range(num_reps):
+        if reset is not None:
+            reset()
         flush_buffer.zero_()
-        torch.cuda.synchronize()
-    flush_keys = {k.key for k in flush_prof.key_averages() if k.device_type == DeviceType.CUDA}
-
-    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
-        for _ in range(num_reps):
-            if reset is not None:
-                reset()
-            # Cold L2 between reps so each kernel re-reads from DRAM. Its fill
-            # kernel is excluded from the sum via `flush_keys`.
-            flush_buffer.zero_()
+        with profile(activities=[ProfilerActivity.CUDA]) as prof:
             fn()
-        torch.cuda.synchronize()
-
-    # Sum only CUDA-device entries: custom-autograd CPU nodes carry their
-    # children's device time and would double-count.
-    kernel_us = sum(
-        k.self_device_time_total
-        for k in prof.key_averages()
-        if k.device_type == DeviceType.CUDA and k.key not in flush_keys
+            torch.cuda.synchronize()
+        kernel_us = sum(k.self_device_time_total for k in prof.key_averages() if k.device_type == DeviceType.CUDA)
+        samples_ms.append(kernel_us / 1000)
+
+    return TimingStats(
+        median_ms=statistics.median(samples_ms),
+        mean_ms=statistics.fmean(samples_ms),
+        min_ms=min(samples_ms),
+        max_ms=max(samples_ms),
+        std_ms=statistics.pstdev(samples_ms) if len(samples_ms) > 1 else 0.0,
+        num_reps=num_reps,
     )
-    per_rep_ms = kernel_us / num_reps / 1000
-    return TimingStats(per_rep_ms, per_rep_ms, per_rep_ms, per_rep_ms, 0.0, num_reps)
 
 
 # --------------------------------------------------------------------------- memory

From ff5de84b7559bd2ba9824dd9da88d8aa3e1f1f48 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier <joel.lamy-poirier@servicenow.com>
Date: Tue, 16 Jun 2026 12:17:42 -0400
Subject: [PATCH 3/4] Sync the L2 flush before profiling each rep

Addresses review: the flush memset was enqueued async with no sync before the
profiler context opened, so excluding it from the device-time sum relied on
CUPTI-startup latency exceeding the ~76us 256MB memset rather than a guarantee.
Sync after the flush so it completes before the capture window opens; the
cold-L2 intent is preserved (the sync only waits for the eviction to finish; it
does not repopulate L2 with fn's data).

Also correct the rationale comment: what prevents the fwd_bwd autograd CPU node
from double-counting is that CUDA-only profiling records no CPU op nodes (those
carry their children's device time in self_device_time_total), with the
device_type == CUDA filter as backstop for zero-device-time runtime entries.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 tools/benchmark/triton_kernels/runner.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/tools/benchmark/triton_kernels/runner.py b/tools/benchmark/triton_kernels/runner.py
index 3717f914a..22da961a5 100644
--- a/tools/benchmark/triton_kernels/runner.py
+++ b/tools/benchmark/triton_kernels/runner.py
@@ -230,15 +230,17 @@ def bench_fn(
     num_reps = max(min_reps, min(_MAX_PROFILE_REPS, int(rep_ms / one_rep_ms)))
 
     # Profile each rep separately and sum its kernels' device self-time, building a
-    # per-rep sample distribution. The L2 flush stays outside the profiled region so
-    # its fill kernel isn't measured; it still runs before `fn` on the same stream,
-    # so each kernel reads cold. Only CUDA-device entries contribute — runtime/launch
-    # entries carry no device time, and a single fn() call has no autograd CPU node.
+    # per-rep sample distribution. The L2 flush stays outside the profiled region and
+    # is synced before the profiler opens, so its fill kernel can't land in the capture
+    # window while each kernel still reads cold. CUDA-only profiling records no CPU op
+    # nodes (which would otherwise carry their children's device time and double-count);
+    # the device_type == CUDA filter drops the zero-device-time runtime/launch entries.
     samples_ms: list[float] = []
     for _ in range(num_reps):
         if reset is not None:
             reset()
         flush_buffer.zero_()
+        torch.cuda.synchronize()
         with profile(activities=[ProfilerActivity.CUDA]) as prof:
             fn()
             torch.cuda.synchronize()

From 4c99bbb6baf203ff05d1567187ece8b1a9298195 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier <joel.lamy-poirier@servicenow.com>
Date: Tue, 16 Jun 2026 14:30:56 -0400
Subject: [PATCH 4/4] Use fully-qualified torch.profiler / torch.autograd
 references

Match the project's third-party import convention (import package.module, keep
fully qualified) and this file's existing torch.cuda.* style, rather than
`from torch.profiler import ...` / `from torch.autograd import ...`.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 tools/benchmark/triton_kernels/runner.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/benchmark/triton_kernels/runner.py b/tools/benchmark/triton_kernels/runner.py
index 22da961a5..0d8334fff 100644
--- a/tools/benchmark/triton_kernels/runner.py
+++ b/tools/benchmark/triton_kernels/runner.py
@@ -16,8 +16,6 @@
 import typing
 
 import torch
-from torch.autograd import DeviceType
-from torch.profiler import ProfilerActivity, profile
 
 from fast_llm.utils import header
 from tools.benchmark.triton_kernels.gpu_specs import GpuSpec, detect_gpu_spec
@@ -241,10 +239,12 @@ def bench_fn(
             reset()
         flush_buffer.zero_()
         torch.cuda.synchronize()
-        with profile(activities=[ProfilerActivity.CUDA]) as prof:
+        with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA]) as prof:
             fn()
             torch.cuda.synchronize()
-        kernel_us = sum(k.self_device_time_total for k in prof.key_averages() if k.device_type == DeviceType.CUDA)
+        kernel_us = sum(
+            k.self_device_time_total for k in prof.key_averages() if k.device_type == torch.autograd.DeviceType.CUDA
+        )
         samples_ms.append(kernel_us / 1000)
 
     return TimingStats(