f754bb6b
Add performance benchmarking instrumentation
a73x 2026-04-10 10:17
- Per-section frame timing ring buffer (snapshot, row_rebuild, atlas_upload, instance_upload, gpu_submit) with min/avg/p99/max stats - SIGUSR1 handler dumps stats to stderr; also dumps on clean exit - WAYSTTY_BENCH=1 env var runs a fixed workload (seq, find, yes) with warm-up phase instead of the user's shell - Makefile with bench and profile (perf + flamegraph) targets - shell_args support in pty.zig for bench script passthrough Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/.gitignore b/.gitignore index a45ec9e..2979db1 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,6 @@ zig-out/ *.o *.swp .worktrees/ bench.log perf.data flamegraph.svg diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..3e7754a --- /dev/null +++ b/Makefile @@ -0,0 +1,36 @@ ZIG ?= zig FLAMEGRAPH ?= flamegraph.pl STACKCOLLAPSE ?= stackcollapse-perf.pl .PHONY: build run test bench profile clean build: $(ZIG) build run: build $(ZIG) build run test: $(ZIG) build test zig-out/bin/waystty: $(wildcard src/*.zig) $(wildcard shaders/*) $(ZIG) build bench: zig-out/bin/waystty WAYSTTY_BENCH=1 ./zig-out/bin/waystty 2>bench.log || true @echo "--- frame timing ---" @grep -A 12 "waystty frame timing" bench.log || echo "(no timing data found)" profile: $(ZIG) build -Doptimize=ReleaseSafe perf record -g -F 999 --no-inherit -o perf.data -- \ sh -c 'WAYSTTY_BENCH=1 ./zig-out/bin/waystty 2>bench.log' perf script -i perf.data \ | $(STACKCOLLAPSE) \ | $(FLAMEGRAPH) > flamegraph.svg @echo "--- frame timing ---" @grep -A 12 "waystty frame timing" bench.log || echo "(no timing data found)" xdg-open flamegraph.svg clean: rm -rf zig-out .zig-cache perf.data bench.log flamegraph.svg diff --git a/src/main.zig b/src/main.zig index 4ccf41b..6ed0a50 100644 --- a/src/main.zig +++ b/src/main.zig @@ -168,8 +168,18 @@ fn runTerminal(alloc: std.mem.Allocator) !void { var atlas = try font.Atlas.init(alloc, 1024, 1024); defer atlas.deinit(); // Upload empty atlas first (so descriptor set is valid) // Precompute printable ASCII glyphs (32-126) into atlas for (32..127) |cp| { _ = atlas.getOrInsert(&face, @intCast(cp)) catch |err| switch (err) { error.AtlasFull => break, else => return err, }; } // Upload warm atlas (full upload — descriptor set needs valid data) try ctx.uploadAtlas(atlas.pixels); atlas.last_uploaded_y = atlas.cursor_y; atlas.needs_full_upload = false; atlas.dirty = false; // === terminal === var term = try vt.Terminal.init(alloc, .{ @@ -188,12 +198,25 @@ fn runTerminal(alloc: std.mem.Allocator) !void { // === pty === const shell: [:0]const u8 = blk: { if (std.posix.getenv("WAYSTTY_BENCH") != null) { break :blk try alloc.dupeZ(u8, "/bin/sh"); } const shell_env = std.posix.getenv("SHELL") orelse "/bin/sh"; break :blk try alloc.dupeZ(u8, shell_env); }; defer alloc.free(shell); var p = try pty.Pty.spawn(.{ .cols = cols, .rows = rows, .shell = shell }); const bench_script: ?[:0]const u8 = if (std.posix.getenv("WAYSTTY_BENCH") != null) "echo warmup; sleep 0.2; seq 1 50000; find /usr/lib -name '*.so' 2>/dev/null | head -500; yes 'hello world' | head -2000; exit 0" else null; var p = try pty.Pty.spawn(.{ .cols = cols, .rows = rows, .shell = shell, .shell_args = if (bench_script) |script| &.{ "-c", script } else null, }); defer p.deinit(); term.setWritePtyCallback(&p, &writePtyFromTerminal); @@ -202,6 +225,10 @@ fn runTerminal(alloc: std.mem.Allocator) !void { defer render_cache.deinit(alloc); try render_cache.resizeRows(alloc, rows); // === frame timing === var frame_ring = FrameTimingRing{}; installSigusr1Handler(); // === main loop === const wl_fd = conn.display.getFd(); var pollfds = [_]std.posix.pollfd{ @@ -354,10 +381,15 @@ fn runTerminal(alloc: std.mem.Allocator) !void { if (!shouldRenderFrame(render_pending, false, false)) continue; var frame_timing: FrameTiming = .{}; // === render === const previous_cursor = term.render_state.cursor; var section_timer = std.time.Timer.start() catch unreachable; try term.snapshot(); frame_timing.snapshot_us = usFromTimer(§ion_timer); section_timer = std.time.Timer.start() catch unreachable; const default_bg = term.backgroundColor(); const bg_uv = atlas.cursorUV(); @@ -449,13 +481,29 @@ fn runTerminal(alloc: std.mem.Allocator) !void { cursor_rebuilt = true; } // Re-upload atlas if new glyphs were added frame_timing.row_rebuild_us = usFromTimer(§ion_timer); section_timer = std.time.Timer.start() catch unreachable; // Re-upload atlas if new glyphs were added (incremental) if (atlas.dirty) { try ctx.uploadAtlas(atlas.pixels); const y_start = atlas.last_uploaded_y; const y_end = atlas.cursor_y + atlas.row_height; if (y_start < y_end) { try ctx.uploadAtlasRegion( atlas.pixels, y_start, y_end, atlas.needs_full_upload, ); atlas.last_uploaded_y = atlas.cursor_y; atlas.needs_full_upload = false; render_cache.layout_dirty = true; } atlas.dirty = false; render_cache.layout_dirty = true; } frame_timing.atlas_upload_us = usFromTimer(§ion_timer); section_timer = std.time.Timer.start() catch unreachable; const upload_plan = applyRenderPlan(.{ .layout_dirty = render_cache.layout_dirty, .rows_rebuilt = rows_rebuilt, @@ -514,6 +562,9 @@ fn runTerminal(alloc: std.mem.Allocator) !void { } } frame_timing.instance_upload_us = usFromTimer(§ion_timer); section_timer = std.time.Timer.start() catch unreachable; const baseline_coverage = renderer.coverageVariantParams(.baseline); ctx.drawCells( render_cache.total_instance_count, @@ -531,10 +582,22 @@ fn runTerminal(alloc: std.mem.Allocator) !void { }, else => return err, }; frame_timing.gpu_submit_us = usFromTimer(§ion_timer); frame_ring.push(frame_timing); // Check for SIGUSR1 stats dump request if (sigusr1_received.swap(false, .acq_rel)) { printFrameStats(computeFrameStats(&frame_ring)); } clearConsumedDirtyFlags(&term.render_state.dirty, dirty_rows, refresh_plan); render_pending = false; } // Dump timing stats on exit printFrameStats(computeFrameStats(&frame_ring)); _ = try ctx.vkd.deviceWaitIdle(ctx.device); } @@ -855,6 +918,152 @@ fn clampSelectionSpan(span: SelectionSpan, cols: u16, rows: u16) ?SelectionSpan } else null; } const FrameTiming = struct { snapshot_us: u32 = 0, row_rebuild_us: u32 = 0, atlas_upload_us: u32 = 0, instance_upload_us: u32 = 0, gpu_submit_us: u32 = 0, fn total(self: FrameTiming) u32 { return self.snapshot_us + self.row_rebuild_us + self.atlas_upload_us + self.instance_upload_us + self.gpu_submit_us; } }; const FrameTimingRing = struct { const capacity = 256; entries: [capacity]FrameTiming = [_]FrameTiming{.{}} ** capacity, head: usize = 0, count: usize = 0, fn push(self: *FrameTimingRing, timing: FrameTiming) void { const idx = if (self.count < capacity) self.count else self.head; self.entries[idx] = timing; if (self.count < capacity) { self.count += 1; } else { self.head = (self.head + 1) % capacity; } } /// Return a slice of valid entries in insertion order. /// Caller must provide a scratch buffer of `capacity` entries. fn orderedSlice(self: *const FrameTimingRing, buf: *[capacity]FrameTiming) []const FrameTiming { if (self.count < capacity) { return self.entries[0..self.count]; } // Ring has wrapped — copy from head..end then 0..head const tail_len = capacity - self.head; @memcpy(buf[0..tail_len], self.entries[self.head..capacity]); @memcpy(buf[tail_len..capacity], self.entries[0..self.head]); return buf[0..capacity]; } }; const SectionStats = struct { min: u32 = 0, avg: u32 = 0, p99: u32 = 0, max: u32 = 0, }; const FrameTimingStats = struct { snapshot: SectionStats = .{}, row_rebuild: SectionStats = .{}, atlas_upload: SectionStats = .{}, instance_upload: SectionStats = .{}, gpu_submit: SectionStats = .{}, total: SectionStats = .{}, frame_count: usize = 0, }; fn computeSectionStats(values: []u32) SectionStats { if (values.len == 0) return .{}; std.mem.sort(u32, values, {}, std.sort.asc(u32)); var sum: u64 = 0; for (values) |v| sum += v; const p99_idx = if (values.len <= 1) 0 else ((values.len - 1) * 99) / 100; return .{ .min = values[0], .avg = @intCast(sum / values.len), .p99 = values[p99_idx], .max = values[values.len - 1], }; } fn computeFrameStats(ring: *const FrameTimingRing) FrameTimingStats { if (ring.count == 0) return .{}; var ordered_buf: [FrameTimingRing.capacity]FrameTiming = undefined; const entries = ring.orderedSlice(&ordered_buf); const n = entries.len; var snapshot_vals: [FrameTimingRing.capacity]u32 = undefined; var row_rebuild_vals: [FrameTimingRing.capacity]u32 = undefined; var atlas_upload_vals: [FrameTimingRing.capacity]u32 = undefined; var instance_upload_vals: [FrameTimingRing.capacity]u32 = undefined; var gpu_submit_vals: [FrameTimingRing.capacity]u32 = undefined; var total_vals: [FrameTimingRing.capacity]u32 = undefined; for (entries, 0..) |e, i| { snapshot_vals[i] = e.snapshot_us; row_rebuild_vals[i] = e.row_rebuild_us; atlas_upload_vals[i] = e.atlas_upload_us; instance_upload_vals[i] = e.instance_upload_us; gpu_submit_vals[i] = e.gpu_submit_us; total_vals[i] = e.total(); } return .{ .snapshot = computeSectionStats(snapshot_vals[0..n]), .row_rebuild = computeSectionStats(row_rebuild_vals[0..n]), .atlas_upload = computeSectionStats(atlas_upload_vals[0..n]), .instance_upload = computeSectionStats(instance_upload_vals[0..n]), .gpu_submit = computeSectionStats(gpu_submit_vals[0..n]), .total = computeSectionStats(total_vals[0..n]), .frame_count = n, }; } fn printFrameStats(stats: FrameTimingStats) void { const row_fmt = "{s:<20}{d:>6}{d:>6}{d:>6}{d:>6}\n"; std.debug.print("\n=== waystty frame timing ({d} frames) ===\n", .{stats.frame_count}); std.debug.print("{s:<20}{s:>6}{s:>6}{s:>6}{s:>6} (us)\n", .{ "section", "min", "avg", "p99", "max" }); std.debug.print(row_fmt, .{ "snapshot", stats.snapshot.min, stats.snapshot.avg, stats.snapshot.p99, stats.snapshot.max }); std.debug.print(row_fmt, .{ "row_rebuild", stats.row_rebuild.min, stats.row_rebuild.avg, stats.row_rebuild.p99, stats.row_rebuild.max }); std.debug.print(row_fmt, .{ "atlas_upload", stats.atlas_upload.min, stats.atlas_upload.avg, stats.atlas_upload.p99, stats.atlas_upload.max }); std.debug.print(row_fmt, .{ "instance_upload", stats.instance_upload.min, stats.instance_upload.avg, stats.instance_upload.p99, stats.instance_upload.max }); std.debug.print(row_fmt, .{ "gpu_submit", stats.gpu_submit.min, stats.gpu_submit.avg, stats.gpu_submit.p99, stats.gpu_submit.max }); std.debug.print("----------------------------------------------------\n", .{}); std.debug.print(row_fmt, .{ "total", stats.total.min, stats.total.avg, stats.total.p99, stats.total.max }); } var sigusr1_received: std.atomic.Value(bool) = std.atomic.Value(bool).init(false); fn sigusr1Handler(_: c_int) callconv(.c) void { sigusr1_received.store(true, .release); } fn installSigusr1Handler() void { const act = std.posix.Sigaction{ .handler = .{ .handler = sigusr1Handler }, .mask = std.posix.sigemptyset(), .flags = std.posix.SA.RESTART, }; std.posix.sigaction(std.posix.SIG.USR1, &act, null); } fn usFromTimer(timer: *std.time.Timer) u32 { const ns = timer.read(); const us = ns / std.time.ns_per_us; return std.math.cast(u32, us) orelse std.math.maxInt(u32); } test "SelectionSpan.normalized orders endpoints in reading order" { const span = (SelectionSpan{ .start = .{ .col = 7, .row = 4 }, @@ -2914,6 +3123,82 @@ test "buildTextCoverageCompareScene repeats the same specimen in four panels" { ); } test "FrameTiming.total sums all sections" { const ft: FrameTiming = .{ .snapshot_us = 10, .row_rebuild_us = 20, .atlas_upload_us = 30, .instance_upload_us = 40, .gpu_submit_us = 50, }; try std.testing.expectEqual(@as(u32, 150), ft.total()); } test "FrameTimingRing records and wraps correctly" { var ring = FrameTimingRing{}; try std.testing.expectEqual(@as(usize, 0), ring.count); ring.push(.{ .snapshot_us = 1, .row_rebuild_us = 2, .atlas_upload_us = 3, .instance_upload_us = 4, .gpu_submit_us = 5 }); try std.testing.expectEqual(@as(usize, 1), ring.count); try std.testing.expectEqual(@as(u32, 1), ring.entries[0].snapshot_us); // Fill to capacity for (1..FrameTimingRing.capacity) |i| { ring.push(.{ .snapshot_us = @intCast(i + 1), .row_rebuild_us = 0, .atlas_upload_us = 0, .instance_upload_us = 0, .gpu_submit_us = 0 }); } try std.testing.expectEqual(FrameTimingRing.capacity, ring.count); // One more wraps around — overwrites entries[0], head advances to 1 ring.push(.{ .snapshot_us = 999, .row_rebuild_us = 0, .atlas_upload_us = 0, .instance_upload_us = 0, .gpu_submit_us = 0 }); try std.testing.expectEqual(FrameTimingRing.capacity, ring.count); // Newest entry is at (head + capacity - 1) % capacity = 0 try std.testing.expectEqual(@as(u32, 999), ring.entries[0].snapshot_us); // head has advanced past the overwritten slot try std.testing.expectEqual(@as(usize, 1), ring.head); } test "FrameTimingRing.orderedSlice returns entries in insertion order after wrap" { var ring = FrameTimingRing{}; // Push capacity + 3 entries so the ring wraps for (0..FrameTimingRing.capacity + 3) |i| { ring.push(.{ .snapshot_us = @intCast(i), .row_rebuild_us = 0, .atlas_upload_us = 0, .instance_upload_us = 0, .gpu_submit_us = 0 }); } var buf: [FrameTimingRing.capacity]FrameTiming = undefined; const ordered = ring.orderedSlice(&buf); try std.testing.expectEqual(FrameTimingRing.capacity, ordered.len); // First entry should be the 4th pushed (index 3), last should be capacity+2 try std.testing.expectEqual(@as(u32, 3), ordered[0].snapshot_us); try std.testing.expectEqual(@as(u32, FrameTimingRing.capacity + 2), ordered[ordered.len - 1].snapshot_us); } test "FrameTimingStats computes min/avg/p99/max correctly" { var ring = FrameTimingRing{}; // Push 100 frames with snapshot_us = 1..100 for (0..100) |i| { ring.push(.{ .snapshot_us = @intCast(i + 1), .row_rebuild_us = 0, .atlas_upload_us = 0, .instance_upload_us = 0, .gpu_submit_us = 0, }); } const stats = computeFrameStats(&ring); try std.testing.expectEqual(@as(u32, 1), stats.snapshot.min); try std.testing.expectEqual(@as(u32, 100), stats.snapshot.max); try std.testing.expectEqual(@as(u32, 50), stats.snapshot.avg); // p99 of 1..100 = value at index 98 (0-based) = 99 try std.testing.expectEqual(@as(u32, 99), stats.snapshot.p99); try std.testing.expectEqual(@as(usize, 100), stats.frame_count); } test "FrameTimingStats handles empty ring" { var ring = FrameTimingRing{}; const stats = computeFrameStats(&ring); try std.testing.expectEqual(@as(usize, 0), stats.frame_count); try std.testing.expectEqual(@as(u32, 0), stats.snapshot.min); } fn runRenderSmokeTest(alloc: std.mem.Allocator) !void { const conn = try wayland_client.Connection.init(alloc); defer conn.deinit(); diff --git a/src/pty.zig b/src/pty.zig index 4afda9c..86d3c55 100644 --- a/src/pty.zig +++ b/src/pty.zig @@ -19,6 +19,7 @@ pub const Pty = struct { cols: u16, rows: u16, shell: [:0]const u8, shell_args: ?[]const [:0]const u8 = null, }; pub fn spawn(opts: SpawnOptions) !Pty { @@ -37,8 +38,18 @@ pub const Pty = struct { // Child process _ = c.setenv("TERM", "xterm-256color", 1); var argv = [_:null]?[*:0]const u8{ opts.shell.ptr, null }; std.posix.execveZ(opts.shell.ptr, &argv, std.c.environ) catch {}; if (opts.shell_args) |args| { std.debug.assert(args.len < 15); // argv[0] = shell, must fit in 16-slot buffer var argv_buf: [16:null]?[*:0]const u8 = .{null} ** 16; argv_buf[0] = opts.shell.ptr; for (args, 1..) |arg, i| { argv_buf[i] = arg.ptr; } std.posix.execveZ(opts.shell.ptr, &argv_buf, std.c.environ) catch {}; } else { var argv = [_:null]?[*:0]const u8{ opts.shell.ptr, null }; std.posix.execveZ(opts.shell.ptr, &argv, std.c.environ) catch {}; } std.process.exit(1); }