diff --git a/perf-tests/build.gradle b/perf-tests/build.gradle
new file mode 100644
index 00000000..001973a2
--- /dev/null
+++ b/perf-tests/build.gradle
@@ -0,0 +1,65 @@
+plugins {
+    id 'java'
+}
+
+java {
+    toolchain {
+        languageVersion = JavaLanguageVersion.of(17)
+        vendor = JvmVendorSpec.ADOPTIUM
+    }
+}
+
+repositories {
+    mavenCentral()
+}
+
+def langchainVersion = '1.8.0'
+
+ext {
+    otelVersion = rootProject.ext.otelVersion
+    junitVersion = rootProject.ext.junitVersion
+    slf4jVersion = rootProject.ext.slf4jVersion
+}
+
+dependencies {
+    testImplementation project(":braintrust-sdk")
+    testImplementation project(":braintrust-sdk:instrumentation:langchain_1_8_0")
+    testImplementation project(":braintrust-java-agent:instrumenter")
+    testImplementation(testFixtures(project(":test-harness")))
+
+    testImplementation "io.opentelemetry:opentelemetry-api:${otelVersion}"
+    testImplementation "io.opentelemetry:opentelemetry-sdk:${otelVersion}"
+    testImplementation "io.opentelemetry:opentelemetry-sdk-trace:${otelVersion}"
+    testImplementation "io.opentelemetry:opentelemetry-sdk-logs:${otelVersion}"
+    testImplementation "io.opentelemetry:opentelemetry-sdk-metrics:${otelVersion}"
+
+    testImplementation "dev.langchain4j:langchain4j:${langchainVersion}"
+    testImplementation "dev.langchain4j:langchain4j-http-client:${langchainVersion}"
+    testImplementation "dev.langchain4j:langchain4j-open-ai:${langchainVersion}"
+
+    testImplementation 'net.bytebuddy:byte-buddy-agent:1.17.5'
+
+    testImplementation "org.junit.jupiter:junit-jupiter:${junitVersion}"
+    testRuntimeOnly 'org.junit.platform:junit-platform-launcher'
+    testRuntimeOnly "org.slf4j:slf4j-simple:${slf4jVersion}"
+}
+
+// Disable the default test task so perf tests don't run during ./gradlew test or check.
+// Run explicitly with: ./gradlew :perf-tests:perfTest
+test {
+    enabled = false
+}
+
+task perfTest(type: Test) {
+    useJUnitPlatform()
+    workingDir = rootProject.projectDir
+
+    // Disable JUnit's per-test timeout — perf tests can take a while
+    systemProperty 'junit.jupiter.execution.timeout.default', 'disabled'
+
+    testLogging {
+        events "passed", "skipped", "failed"
+        showStandardStreams = true
+        exceptionFormat "full"
+    }
+}
diff --git a/perf-tests/src/test/java/dev/braintrust/perf/PerfResult.java b/perf-tests/src/test/java/dev/braintrust/perf/PerfResult.java
new file mode 100644
index 00000000..1b208c5b
--- /dev/null
+++ b/perf-tests/src/test/java/dev/braintrust/perf/PerfResult.java
@@ -0,0 +1,27 @@
+package dev.braintrust.perf;
+
+/**
+ * Captures the result of a single performance test run.
+ *
+ * @param config the configuration that produced this result
+ * @param payloadBytes total bytes of the OTLP HTTP request body captured at the wire
+ * @param spanCount number of spans that were exported (as observed by the server)
+ * @param requestCount number of HTTP requests received by the capture server
+ */
+public record PerfResult(PerfRunConfig config, long payloadBytes, int spanCount, int requestCount) {
+
+    /** Bytes per span (approximate). */
+    public double bytesPerSpan() {
+        return spanCount > 0 ? (double) payloadBytes / spanCount : 0;
+    }
+
+    public double payloadMB() {
+        return payloadBytes / (1024.0 * 1024.0);
+    }
+
+    public String summary() {
+        return String.format(
+                "[%s] %d request(s), %d span(s), %.3f MB total (%.1f bytes/span)",
+                config.name(), requestCount, spanCount, payloadMB(), bytesPerSpan());
+    }
+}
diff --git a/perf-tests/src/test/java/dev/braintrust/perf/PerfRunConfig.java b/perf-tests/src/test/java/dev/braintrust/perf/PerfRunConfig.java
new file mode 100644
index 00000000..8d883a01
--- /dev/null
+++ b/perf-tests/src/test/java/dev/braintrust/perf/PerfRunConfig.java
@@ -0,0 +1,24 @@
+package dev.braintrust.perf;
+
+/**
+ * Describes the configuration for a single performance test run.
+ *
+ * <p>Each field controls the shape of the multi-turn conversation that will be generated and
+ * exported. Add new fields here as you add new scenarios (e.g. tool use, streaming, etc.).
+ *
+ * @param name human-readable label for this configuration
+ * @param turns number of conversational turns (user messages sent to the AI service)
+ * @param includeImageAttachment whether to include an image attachment in the first user message
+ */
+public record PerfRunConfig(String name, int turns, boolean includeImageAttachment) {
+
+    /** A multi-turn conversation with an image attachment on the first turn. */
+    public static PerfRunConfig multiTurnWithAttachment() {
+        return new PerfRunConfig("multi-turn-with-attachment", 10, true);
+    }
+
+    /** A multi-turn conversation with text only (no attachments). */
+    public static PerfRunConfig multiTurnTextOnly() {
+        return new PerfRunConfig("multi-turn-text-only", 3, false);
+    }
+}
diff --git a/perf-tests/src/test/java/dev/braintrust/perf/TracePayloadSizeTest.java b/perf-tests/src/test/java/dev/braintrust/perf/TracePayloadSizeTest.java
new file mode 100644
index 00000000..ad6cdb70
--- /dev/null
+++ b/perf-tests/src/test/java/dev/braintrust/perf/TracePayloadSizeTest.java
@@ -0,0 +1,316 @@
+package dev.braintrust.perf;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+import com.sun.net.httpserver.HttpServer;
+import dev.braintrust.TestHarness;
+import dev.braintrust.instrumentation.Instrumenter;
+import dev.langchain4j.data.message.*;
+import dev.langchain4j.memory.chat.MessageWindowChatMemory;
+import dev.langchain4j.model.chat.ChatModel;
+import dev.langchain4j.model.openai.OpenAiChatModel;
+import java.awt.image.BufferedImage;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.net.InetSocketAddress;
+import java.util.ArrayList;
+import java.util.Base64;
+import java.util.List;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+import javax.imageio.ImageIO;
+import net.bytebuddy.agent.ByteBuddyAgent;
+import org.junit.jupiter.api.*;
+
+/**
+ * Measures the wire-format (OTLP protobuf over HTTP) payload size produced by {@code
+ * BraintrustSpanExporter} for realistic multi-turn LangChain4j conversations.
+ *
+ * <p>The test stands up a plain HTTP capture server on {@code /otel/v1/traces}, configures the
+ * Braintrust SDK to export there (while OpenAI calls go through the TestHarness VCR proxy), runs a
+ * multi-turn conversation, flushes, and records how many bytes the capture server received.
+ *
+ * <p>To add new scenarios, create additional {@link PerfRunConfig} instances and call {@link
+ * #runScenario}.
+ */
+public class TracePayloadSizeTest {
+
+    /**
+     * Base64-encoded JPEG generated at class load time. 512x512 with random pixel noise to simulate
+     * a realistic photo payload (~150KB raw, ~210K base64 chars).
+     */
+    private static final String TEST_IMAGE_BASE64 = generateTestImageBase64(512, 512);
+
+    private HttpServer captureServer;
+    private int capturePort;
+
+    /** Accumulated byte count across all requests for a single scenario. */
+    private AtomicLong capturedBytes;
+
+    /** Number of HTTP requests received. */
+    private AtomicInteger requestCount;
+
+    /**
+     * Latch that fires when at least one export request arrives. We use this to know when the
+     * exporter has sent data, then wait a grace period for any trailing batches.
+     */
+    private CountDownLatch exportLatch;
+
+    /** All results collected during the test class, printed in @AfterEach for comparison. */
+    private final List<PerfResult> results = new ArrayList<>();
+
+    @BeforeAll
+    static void installInstrumentation() {
+        var instrumentation = ByteBuddyAgent.install();
+        Instrumenter.install(instrumentation, TracePayloadSizeTest.class.getClassLoader());
+    }
+
+    @BeforeEach
+    void setUp() throws IOException {
+        capturedBytes = new AtomicLong();
+        requestCount = new AtomicInteger();
+        exportLatch = new CountDownLatch(1);
+
+        captureServer = HttpServer.create(new InetSocketAddress("localhost", 0), 0);
+        capturePort = captureServer.getAddress().getPort();
+
+        // ── OTLP trace export endpoint (the one we're measuring) ──
+        captureServer.createContext(
+                "/otel/v1/traces",
+                exchange -> {
+                    try {
+                        byte[] body = exchange.getRequestBody().readAllBytes();
+                        capturedBytes.addAndGet(body.length);
+                        requestCount.incrementAndGet();
+                        exportLatch.countDown();
+                        exchange.sendResponseHeaders(200, 0);
+                    } finally {
+                        exchange.close();
+                    }
+                });
+
+        // ── Attachment upload flow stubs ──
+        // The SDK's AttachmentProcessor extracts base64 data URIs from span attributes,
+        // replaces them with attachment references, and uploads the data via:
+        //   1. POST /api/apikey/login -> resolve org ID
+        //   2. POST /attachment -> get a signed upload URL
+        //   3. PUT /s3-upload -> upload data to the signed URL
+        //   4. POST /attachment/status -> report upload status
+
+        var loginResponse =
+                "{\"org_info\": [{\"id\": \"00000000-0000-0000-0000-000000000000\","
+                        + " \"name\": \"perf-test-org\"}]}";
+        captureServer.createContext(
+                "/api/apikey/login",
+                exchange -> {
+                    try {
+                        exchange.getRequestBody().readAllBytes(); // drain
+                        // Explicit charset: bare getBytes() uses the platform default
+                        var body = loginResponse.getBytes(java.nio.charset.StandardCharsets.UTF_8);
+                        exchange.getResponseHeaders().set("Content-Type", "application/json");
+                        exchange.sendResponseHeaders(200, body.length);
+                        exchange.getResponseBody().write(body);
+                    } finally {
+                        exchange.close();
+                    }
+                });
+
+        var signedUrl = "http://localhost:" + capturePort + "/s3-upload";
+        var attachmentResponse = "{\"signedUrl\": \"" + signedUrl + "\", \"headers\": {}}";
+        captureServer.createContext(
+                "/attachment",
+                exchange -> {
+                    try {
+                        exchange.getRequestBody().readAllBytes(); // drain
+                        // Explicit charset: bare getBytes() uses the platform default
+                        var body =
+                                attachmentResponse.getBytes(java.nio.charset.StandardCharsets.UTF_8);
+                        exchange.getResponseHeaders().set("Content-Type", "application/json");
+                        exchange.sendResponseHeaders(200, body.length);
+                        exchange.getResponseBody().write(body);
+                    } finally {
+                        exchange.close();
+                    }
+                });
+
+        captureServer.createContext(
+                "/s3-upload",
+                exchange -> {
+                    try {
+                        exchange.getRequestBody().readAllBytes(); // drain
+                        exchange.sendResponseHeaders(200, -1);
+                    } finally {
+                        exchange.close();
+                    }
+                });
+
+        captureServer.start();
+    }
+
+    @AfterEach
+    void tearDown() {
+        if (captureServer != null) {
+            captureServer.stop(0);
+        }
+        if (!results.isEmpty()) {
+            System.out.println("\n=== Perf Results ===");
+            for (PerfResult r : results) {
+                System.out.println(r.summary());
+            }
+            System.out.println("====================\n");
+        }
+    }
+
+    @Test
+    void multiTurnWithAttachment() throws Exception {
+        var result = runScenario(PerfRunConfig.multiTurnWithAttachment());
+        results.add(result);
+        System.out.println(result.summary());
+
+        assertTrue(result.requestCount() > 0, "Expected at least one HTTP request");
+        assertTrue(result.payloadBytes() > 0, "Expected non-empty payload");
+        assertTrue(result.spanCount() > 0, "Expected at least one span");
+    }
+
+    @Test
+    @Disabled
+    void multiTurnTextOnly() throws Exception {
+        var result = runScenario(PerfRunConfig.multiTurnTextOnly());
+        results.add(result);
+        System.out.println(result.summary());
+
+        assertTrue(result.requestCount() > 0, "Expected at least one HTTP request");
+        assertTrue(result.payloadBytes() > 0, "Expected non-empty payload");
+        assertTrue(result.spanCount() > 0, "Expected at least one span");
+    }
+
+    // ─── Core ───────────────────────────────────────────────────────────────────
+
+    /**
+     * Runs a single scenario: sets up the SDK with the capture server as the Braintrust API
+     * endpoint, builds a LangChain4j ChatModel via the VCR-proxied OpenAI, runs a multi-turn
+     * conversation, flushes, and returns the measured result.
+     */
+    private PerfResult runScenario(PerfRunConfig config) throws Exception {
+        capturedBytes.set(0);
+        requestCount.set(0);
+        exportLatch = new CountDownLatch(1);
+
+        // TestHarness sets up:
+        //   - VCR proxy for OpenAI (testHarness.openAiBaseUrl())
+        //   - Braintrust SDK with BraintrustSpanExporter pointing at config.apiUrl()
+        //   - UnitTestSpanExporter for in-memory span capture
+        //
+        // We override apiUrl to point at our capture server so the exporter sends there.
+        var testHarness =
+                TestHarness.setup(
+                        cfg ->
+                                cfg.apiUrl("http://localhost:" + capturePort)
+                                        .autoConvertAIAttachments(true)
+                                        .compressOtelPayload(true));
+
+        // Build the OpenAI-backed ChatModel. ByteBuddy auto-instrumentation intercepts
+        // OpenAiChatModel.Builder.build() and wraps the internal HttpClient with
+        // WrappedHttpClient, which creates OTel spans for each LLM call.
+        ChatModel model =
+                OpenAiChatModel.builder()
+                        .apiKey(testHarness.openAiApiKey())
+                        .baseUrl(testHarness.openAiBaseUrl())
+                        .modelName("gpt-4o-mini")
+                        .temperature(0.0)
+                        .build();
+
+        // Chat memory to accumulate conversation history across turns
+        var memory = MessageWindowChatMemory.withMaxMessages(20);
+
+        try {
+            runConversation(testHarness, model, memory, config);
+
+            // Flush spans through BatchSpanProcessor → BraintrustSpanExporter → capture server
+            var flushResult =
+                    testHarness
+                            .openTelemetry()
+                            .getSdkTracerProvider()
+                            .forceFlush()
+                            .join(30, TimeUnit.SECONDS);
+
+            assertTrue(flushResult.isDone());
+            assertTrue(flushResult.isSuccess());
+
+            boolean received = exportLatch.await(15, TimeUnit.SECONDS);
+            assertTrue(received, "Timed out waiting for span export for: " + config.name());
+
+            // Grace period for any trailing batch exports to be seen by the server
+            Thread.sleep(1_000);
+
+            // Get span count from the in-memory exporter
+            var spans = testHarness.awaitExportedSpans();
+            int spanCount = spans.size();
+
+            return new PerfResult(config, capturedBytes.get(), spanCount, requestCount.get());
+        } finally {
+            testHarness.openTelemetry().getSdkTracerProvider().shutdown().join(5, TimeUnit.SECONDS);
+        }
+    }
+
+    /** Runs a multi-turn conversation under a single root span. */
+    private void runConversation(
+            TestHarness testHarness,
+            ChatModel model,
+            MessageWindowChatMemory memory,
+            PerfRunConfig config) {
+        var tracer = testHarness.openTelemetry().getTracer("perf-test");
+        var rootSpan = tracer.spanBuilder("multi-turn-conversation").startSpan();
+
+        try (var ignored = rootSpan.makeCurrent()) {
+            String[] userPrompts = {
+                "Tell me a story about a wise cracking talking dog.", "tell me another story",
+            };
+
+            for (int turn = 0; turn < config.turns(); turn++) {
+                UserMessage userMessage;
+                var userPrompt = userPrompts[Math.min(turn, userPrompts.length - 1)];
+
+                if (turn == 0 && config.includeImageAttachment()) {
+                    // First turn includes an image attachment alongside the text
+                    userMessage =
+                            UserMessage.from(
+                                    TextContent.from(
+                                            userPrompt + " -- take inspiration from this picture"),
+                                    ImageContent.from(TEST_IMAGE_BASE64, "image/jpeg"));
+                } else {
+                    userMessage = UserMessage.from(userPrompt);
+                }
+
+                memory.add(userMessage);
+
+                var response = model.chat(memory.messages());
+                var aiMessage = response.aiMessage();
+
+                memory.add(aiMessage);
+            }
+        } finally {
+            rootSpan.end();
+        }
+    }
+
+    /**
+     * Generates a JPEG image with random pixel noise and returns it as a base64 string. A fixed
+     * seed ensures the output is deterministic across runs.
+     */
+    private static String generateTestImageBase64(int width, int height) {
+        var img = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB);
+        var rng = new java.util.Random(42);
+        for (int y = 0; y < height; y++) {
+            for (int x = 0; x < width; x++) {
+                // Bound is exclusive, so 0x1000000 covers the full 24-bit RGB range
+                // (0xFFFFFF would exclude the maximum color value itself).
+                img.setRGB(x, y, rng.nextInt(0x1000000));
+            }
+        }
+        try {
+            var baos = new ByteArrayOutputStream();
+            ImageIO.write(img, "JPEG", baos);
+            return Base64.getEncoder().encodeToString(baos.toByteArray());
+        } catch (IOException e) {
+            throw new RuntimeException("Failed to generate test image", e);
+        }
+    }
+}
diff --git a/settings.gradle b/settings.gradle
index f4d4f199..436211e6 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -27,3 +27,4 @@ include 'braintrust-java-agent:smoke-test:wildfly'
 include 'btx'
 include 'braintrust-api'
 include 'braintrust-otel-extension'
+include 'perf-tests'