From 3bd9a6259c84efc7a9f5d2a530d0e1aac35d91a9 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Mon, 15 Jun 2026 17:03:05 -0400 Subject: [PATCH 1/4] Benchmark kernel GPU time via profiler instead of event wall-clock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CUDA-event window around each call measured the GPU-timeline span, including idle bubbles from per-call allocation and (for autograd variants) Python/engine launch overhead between kernels. Those bubbles are benchmarking artifacts — in a real run the CPU stays ahead of the GPU and the caching allocator serves buffers without syncs — and they vary by variant, so they unfairly penalized the Python-autograd Triton path versus the eager C++ engine. They also made backward timing (derived as fwd_bwd - fwd) swing ~190% run-to-run on a fixed shape. Sum per-kernel device self-time from torch.profiler instead. Only CUDA-device entries are summed (custom-autograd CPU nodes carry their children's device time and would double-count); the L2-flush fill kernel is identified from a flush-only profile and excluded. Reproduces to <0.5% run-to-run and matches isolated-kernel timing. Co-Authored-By: Claude Opus 4.8 --- tools/benchmark/triton_kernels/__main__.py | 3 ++ tools/benchmark/triton_kernels/runner.py | 62 +++++++++++++--------- 2 files changed, 40 insertions(+), 25 deletions(-) diff --git a/tools/benchmark/triton_kernels/__main__.py b/tools/benchmark/triton_kernels/__main__.py index e09b92976..950307f22 100644 --- a/tools/benchmark/triton_kernels/__main__.py +++ b/tools/benchmark/triton_kernels/__main__.py @@ -33,6 +33,9 @@ # mutated inputs" when using max-autotune. The fallback is correct; suppress noise. warnings.filterwarnings("ignore", message=".*[Ss]kipping (cuda|CUDA)[Gg]raphs.*") logging.getLogger("torch._inductor.cudagraph_trees").setLevel(logging.ERROR) +# The per-measurement profiler session emits a one-time note about event cycles; the +# runner reads key_averages right after each session, so it doesn't apply here. +warnings.filterwarnings("ignore", message=".*Profiler clears events.*") _BENCHMARKS = { "entropy_loss": bench_entropy_loss, diff --git a/tools/benchmark/triton_kernels/runner.py b/tools/benchmark/triton_kernels/runner.py index 9a00c0cd0..952cef0ef 100644 --- a/tools/benchmark/triton_kernels/runner.py +++ b/tools/benchmark/triton_kernels/runner.py @@ -4,18 +4,19 @@ Each benchmark file defines a list of `Case` objects (input shape/dtype sweep) and a list of `Variant` objects (implementations to compare — e.g. pytorch eager, pytorch compiled, Triton). The runner invokes each variant -on each case, measures timing (median + mean + percentiles via CUDA events), -measures peak/final memory, and compares outputs against an fp32 reference -using RMS error. Results are printed as a table per case. +on each case, measures GPU kernel time (summed per-kernel device time via the +profiler), measures peak/final memory, and compares outputs against an fp32 +reference using RMS error. Results are printed as a table per case. """ import dataclasses import math -import statistics import time import typing import torch +from torch.autograd import DeviceType +from torch.profiler import ProfilerActivity, profile from fast_llm.utils import header from tools.benchmark.triton_kernels.gpu_specs import GpuSpec, detect_gpu_spec @@ -163,8 +164,15 @@ def bench_fn( """Benchmark `fn` — it should be a no-arg callable that invokes the kernel being timed (close over inputs). Returns timing statistics in ms. - Mirrors `triton.testing.do_bench` logic but returns raw per-rep list so we - can compute {median, mean, min, max, std} from one set of runs. + Reports GPU kernel time: the profiler's summed per-kernel device self-time, + not a wall-clock window around `fn`. A CUDA-event window also counts GPU-idle + bubbles from per-call allocation and (for autograd variants) Python/engine + launch overhead — benchmarking artifacts that don't occur in a real run where + the CPU runs ahead and the allocator serves cached buffers, and which vary + by variant (the eager C++ engine starves the GPU far less than a Python + autograd Function). Summing device self-time isolates the work the GPU + actually does. Device time is deterministic to <1% run-to-run, so the + per-rep mean is reported across all stat fields. """ if not torch.cuda.is_available(): # CPU / Triton interpret: single timed run with wall clock. min_reps, @@ -217,28 +225,32 @@ def bench_fn( num_reps = max(min_reps, min(max_reps, int(rep_ms / one_rep_ms))) - start_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_reps)] - end_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_reps)] - for i in range(num_reps): - if reset is not None: - reset() - # The L2 flush is enqueued before start_events[i] on the same stream, so - # the timed window starts after the zero completes — only fn() is timed. + # The L2 flush is itself a device kernel (a fill); profile it alone to learn + # its kernel key so it can be excluded from the measured sum below. + with profile(activities=[ProfilerActivity.CUDA]) as flush_prof: flush_buffer.zero_() - start_events[i].record() - fn() - end_events[i].record() - torch.cuda.synchronize() + torch.cuda.synchronize() + flush_keys = {k.key for k in flush_prof.key_averages() if k.device_type == DeviceType.CUDA} + + with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: + for _ in range(num_reps): + if reset is not None: + reset() + # Cold L2 between reps so each kernel re-reads from DRAM. Its fill + # kernel is excluded from the sum via `flush_keys`. + flush_buffer.zero_() + fn() + torch.cuda.synchronize() - times = [start_events[i].elapsed_time(end_events[i]) for i in range(num_reps)] - return TimingStats( - median_ms=statistics.median(times), - mean_ms=statistics.fmean(times), - min_ms=min(times), - max_ms=max(times), - std_ms=statistics.pstdev(times) if len(times) > 1 else 0.0, - num_reps=num_reps, + # Sum only CUDA-device entries: custom-autograd CPU nodes carry their + # children's device time and would double-count. + kernel_us = sum( + k.self_device_time_total + for k in prof.key_averages() + if k.device_type == DeviceType.CUDA and k.key not in flush_keys ) + per_rep_ms = kernel_us / num_reps / 1000 + return TimingStats(per_rep_ms, per_rep_ms, per_rep_ms, per_rep_ms, 0.0, num_reps) # --------------------------------------------------------------------------- memory From 784a3daeb6e10b7a6222ed1d885c08eeccf01be1 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Mon, 15 Jun 2026 18:39:37 -0400 Subject: [PATCH 2/4] Profile each rep separately and build a real timing distribution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses review of the profiler timing change: - Move the L2 flush outside each rep's profiler context. The flush kernel is no longer recorded, so the fragile key-based exclusion (which could silently drop a benchmarked kernel's own int32 fill/memset, e.g. sparse/MoE accumulators) is gone entirely. - Profile each rep in its own CUDA-only session (drops the unused CPU activity); a single fn() call has no autograd CPU node, so no double-counting to filter. - Cap profiled reps at a small fixed count instead of up to max_reps=10_000. - Sum each rep's device self-time into a per-rep sample, then compute real median/mean/min/max/std — so verbose min/max/std reflect measured spread instead of identical values, and the harness demonstrates stability rather than asserting it. Co-Authored-By: Claude Opus 4.8 --- tools/benchmark/triton_kernels/runner.py | 74 ++++++++++++------------ 1 file changed, 38 insertions(+), 36 deletions(-) diff --git a/tools/benchmark/triton_kernels/runner.py b/tools/benchmark/triton_kernels/runner.py index 952cef0ef..3717f914a 100644 --- a/tools/benchmark/triton_kernels/runner.py +++ b/tools/benchmark/triton_kernels/runner.py @@ -11,6 +11,7 @@ import dataclasses import math +import statistics import time import typing @@ -153,31 +154,34 @@ class VariantResult: # --------------------------------------------------------------------------- timing +# Per-rep samples for the timing distribution. Device kernel time is stable, so a +# small fixed count gives tight stats without profiling thousands of reps. +_MAX_PROFILE_REPS = 50 + + def bench_fn( fn: typing.Callable[[], typing.Any], reset: typing.Callable[[], None] | None = None, warmup_ms: float = 25.0, rep_ms: float = 100.0, min_reps: int = 5, - max_reps: int = 10_000, ) -> TimingStats: """Benchmark `fn` — it should be a no-arg callable that invokes the kernel being timed (close over inputs). Returns timing statistics in ms. - Reports GPU kernel time: the profiler's summed per-kernel device self-time, - not a wall-clock window around `fn`. A CUDA-event window also counts GPU-idle - bubbles from per-call allocation and (for autograd variants) Python/engine - launch overhead — benchmarking artifacts that don't occur in a real run where - the CPU runs ahead and the allocator serves cached buffers, and which vary - by variant (the eager C++ engine starves the GPU far less than a Python - autograd Function). Summing device self-time isolates the work the GPU - actually does. Device time is deterministic to <1% run-to-run, so the - per-rep mean is reported across all stat fields. + Reports GPU kernel time: each rep sums the profiler's per-kernel device + self-time, and stats are computed over the per-rep samples. A CUDA-event + window around `fn` instead counts GPU-idle bubbles from per-call allocation + and (for autograd variants) Python/engine launch overhead — benchmarking + artifacts that don't occur in a real run where the CPU runs ahead and the + allocator serves cached buffers, and which vary by variant (the eager C++ + engine starves the GPU far less than a Python autograd Function). Summing + device self-time isolates the work the GPU actually does. """ if not torch.cuda.is_available(): # CPU / Triton interpret: single timed run with wall clock. min_reps, - # max_reps, warmup_ms, rep_ms are ignored — this path is for smoke - # testing kernel correctness, not measurement. + # warmup_ms, rep_ms are ignored — this path is for smoke testing kernel + # correctness, not measurement. if reset is not None: reset() fn() # warmup @@ -223,34 +227,32 @@ def bench_fn( torch.cuda.synchronize() one_rep_ms = max(post_start.elapsed_time(post_end), 0.001) - num_reps = max(min_reps, min(max_reps, int(rep_ms / one_rep_ms))) + num_reps = max(min_reps, min(_MAX_PROFILE_REPS, int(rep_ms / one_rep_ms))) - # The L2 flush is itself a device kernel (a fill); profile it alone to learn - # its kernel key so it can be excluded from the measured sum below. - with profile(activities=[ProfilerActivity.CUDA]) as flush_prof: + # Profile each rep separately and sum its kernels' device self-time, building a + # per-rep sample distribution. The L2 flush stays outside the profiled region so + # its fill kernel isn't measured; it still runs before `fn` on the same stream, + # so each kernel reads cold. Only CUDA-device entries contribute — runtime/launch + # entries carry no device time, and a single fn() call has no autograd CPU node. + samples_ms: list[float] = [] + for _ in range(num_reps): + if reset is not None: + reset() flush_buffer.zero_() - torch.cuda.synchronize() - flush_keys = {k.key for k in flush_prof.key_averages() if k.device_type == DeviceType.CUDA} - - with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof: - for _ in range(num_reps): - if reset is not None: - reset() - # Cold L2 between reps so each kernel re-reads from DRAM. Its fill - # kernel is excluded from the sum via `flush_keys`. - flush_buffer.zero_() + with profile(activities=[ProfilerActivity.CUDA]) as prof: fn() - torch.cuda.synchronize() - - # Sum only CUDA-device entries: custom-autograd CPU nodes carry their - # children's device time and would double-count. - kernel_us = sum( - k.self_device_time_total - for k in prof.key_averages() - if k.device_type == DeviceType.CUDA and k.key not in flush_keys + torch.cuda.synchronize() + kernel_us = sum(k.self_device_time_total for k in prof.key_averages() if k.device_type == DeviceType.CUDA) + samples_ms.append(kernel_us / 1000) + + return TimingStats( + median_ms=statistics.median(samples_ms), + mean_ms=statistics.fmean(samples_ms), + min_ms=min(samples_ms), + max_ms=max(samples_ms), + std_ms=statistics.pstdev(samples_ms) if len(samples_ms) > 1 else 0.0, + num_reps=num_reps, ) - per_rep_ms = kernel_us / num_reps / 1000 - return TimingStats(per_rep_ms, per_rep_ms, per_rep_ms, per_rep_ms, 0.0, num_reps) # --------------------------------------------------------------------------- memory From ff5de84b7559bd2ba9824dd9da88d8aa3e1f1f48 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 16 Jun 2026 12:17:42 -0400 Subject: [PATCH 3/4] Sync the L2 flush before profiling each rep Addresses review: the flush memset was enqueued async with no sync before the profiler context opened, so excluding it from the device-time sum relied on CUPTI-startup latency exceeding the ~76us 256MB memset rather than a guarantee. Sync after the flush so it completes before the capture window opens; the cold-L2 intent is preserved (the sync waits for the eviction, doesn't repopulate with fn's data). Also correct the rationale comment: what prevents the fwd_bwd autograd CPU node from double-counting is that CUDA-only profiling records no CPU op nodes (those carry their children's device time in self_device_time_total), with the device_type == CUDA filter as backstop for zero-device-time runtime entries. Co-Authored-By: Claude Opus 4.8 --- tools/benchmark/triton_kernels/runner.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/benchmark/triton_kernels/runner.py b/tools/benchmark/triton_kernels/runner.py index 3717f914a..22da961a5 100644 --- a/tools/benchmark/triton_kernels/runner.py +++ b/tools/benchmark/triton_kernels/runner.py @@ -230,15 +230,17 @@ def bench_fn( num_reps = max(min_reps, min(_MAX_PROFILE_REPS, int(rep_ms / one_rep_ms))) # Profile each rep separately and sum its kernels' device self-time, building a - # per-rep sample distribution. The L2 flush stays outside the profiled region so - # its fill kernel isn't measured; it still runs before `fn` on the same stream, - # so each kernel reads cold. Only CUDA-device entries contribute — runtime/launch - # entries carry no device time, and a single fn() call has no autograd CPU node. + # per-rep sample distribution. The L2 flush stays outside the profiled region and + # is synced before the profiler opens, so its fill kernel can't land in the capture + # window while each kernel still reads cold. CUDA-only profiling records no CPU op + # nodes (which would otherwise carry their children's device time and double-count); + # the device_type == CUDA filter drops the zero-device-time runtime/launch entries. samples_ms: list[float] = [] for _ in range(num_reps): if reset is not None: reset() flush_buffer.zero_() + torch.cuda.synchronize() with profile(activities=[ProfilerActivity.CUDA]) as prof: fn() torch.cuda.synchronize() From 4c99bbb6baf203ff05d1567187ece8b1a9298195 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 16 Jun 2026 14:30:56 -0400 Subject: [PATCH 4/4] Use fully-qualified torch.profiler / torch.autograd references Match the project's third-party import convention (import package.module, keep fully qualified) and this file's existing torch.cuda.* style, rather than `from torch.profiler import ...` / `from torch.autograd import ...`. Co-Authored-By: Claude Opus 4.8 --- tools/benchmark/triton_kernels/runner.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/benchmark/triton_kernels/runner.py b/tools/benchmark/triton_kernels/runner.py index 22da961a5..0d8334fff 100644 --- a/tools/benchmark/triton_kernels/runner.py +++ b/tools/benchmark/triton_kernels/runner.py @@ -16,8 +16,6 @@ import typing import torch -from torch.autograd import DeviceType -from torch.profiler import ProfilerActivity, profile from fast_llm.utils import header from tools.benchmark.triton_kernels.gpu_specs import GpuSpec, detect_gpu_spec @@ -241,10 +239,12 @@ def bench_fn( reset() flush_buffer.zero_() torch.cuda.synchronize() - with profile(activities=[ProfilerActivity.CUDA]) as prof: + with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA]) as prof: fn() torch.cuda.synchronize() - kernel_us = sum(k.self_device_time_total for k in prof.key_averages() if k.device_type == DeviceType.CUDA) + kernel_us = sum( + k.self_device_time_total for k in prof.key_averages() if k.device_type == torch.autograd.DeviceType.CUDA + ) samples_ms.append(kernel_us / 1000) return TimingStats(