f754bb6b

Add performance benchmarking instrumentation

a73x 2026-04-10 10:17

- Per-section frame timing ring buffer (snapshot, row_rebuild,
  atlas_upload, instance_upload, gpu_submit) with min/avg/p99/max stats
- SIGUSR1 handler dumps stats to stderr; also dumps on clean exit
- WAYSTTY_BENCH=1 env var runs a fixed workload (seq, find, yes)
  with warm-up phase instead of the user's shell
- Makefile with bench and profile (perf + flamegraph) targets
- shell_args support in pty.zig for bench script passthrough

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

diff --git a/.gitignore b/.gitignore
index a45ec9e..2979db1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,6 @@ zig-out/
*.o
*.swp
.worktrees/
bench.log
perf.data
flamegraph.svg
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..3e7754a
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,36 @@
ZIG ?= zig
FLAMEGRAPH ?= flamegraph.pl
STACKCOLLAPSE ?= stackcollapse-perf.pl

.PHONY: build run test bench profile clean

build:
	$(ZIG) build

run: build
	$(ZIG) build run

test:
	$(ZIG) build test

zig-out/bin/waystty: $(wildcard src/*.zig) $(wildcard shaders/*)
	$(ZIG) build

bench: zig-out/bin/waystty
	WAYSTTY_BENCH=1 ./zig-out/bin/waystty 2>bench.log || true
	@echo "--- frame timing ---"
	@grep -A 12 "waystty frame timing" bench.log || echo "(no timing data found)"

profile:
	$(ZIG) build -Doptimize=ReleaseSafe
	perf record -g -F 999 --no-inherit -o perf.data -- \
		sh -c 'WAYSTTY_BENCH=1 ./zig-out/bin/waystty 2>bench.log'
	perf script -i perf.data \
		| $(STACKCOLLAPSE) \
		| $(FLAMEGRAPH) > flamegraph.svg
	@echo "--- frame timing ---"
	@grep -A 12 "waystty frame timing" bench.log || echo "(no timing data found)"
	xdg-open flamegraph.svg

clean:
	rm -rf zig-out .zig-cache perf.data bench.log flamegraph.svg
diff --git a/src/main.zig b/src/main.zig
index 4ccf41b..6ed0a50 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -168,8 +168,18 @@ fn runTerminal(alloc: std.mem.Allocator) !void {
    var atlas = try font.Atlas.init(alloc, 1024, 1024);
    defer atlas.deinit();

    // Upload empty atlas first (so descriptor set is valid)
    // Precompute printable ASCII glyphs (32-126) into atlas
    for (32..127) |cp| {
        _ = atlas.getOrInsert(&face, @intCast(cp)) catch |err| switch (err) {
            error.AtlasFull => break,
            else => return err,
        };
    }
    // Upload warm atlas (full upload — descriptor set needs valid data)
    try ctx.uploadAtlas(atlas.pixels);
    atlas.last_uploaded_y = atlas.cursor_y;
    atlas.needs_full_upload = false;
    atlas.dirty = false;

    // === terminal ===
    var term = try vt.Terminal.init(alloc, .{
@@ -188,12 +198,25 @@ fn runTerminal(alloc: std.mem.Allocator) !void {

    // === pty ===
    const shell: [:0]const u8 = blk: {
        if (std.posix.getenv("WAYSTTY_BENCH") != null) {
            break :blk try alloc.dupeZ(u8, "/bin/sh");
        }
        const shell_env = std.posix.getenv("SHELL") orelse "/bin/sh";
        break :blk try alloc.dupeZ(u8, shell_env);
    };
    defer alloc.free(shell);

    var p = try pty.Pty.spawn(.{ .cols = cols, .rows = rows, .shell = shell });
    const bench_script: ?[:0]const u8 = if (std.posix.getenv("WAYSTTY_BENCH") != null)
        "echo warmup; sleep 0.2; seq 1 50000; find /usr/lib -name '*.so' 2>/dev/null | head -500; yes 'hello world' | head -2000; exit 0"
    else
        null;

    var p = try pty.Pty.spawn(.{
        .cols = cols,
        .rows = rows,
        .shell = shell,
        .shell_args = if (bench_script) |script| &.{ "-c", script } else null,
    });
    defer p.deinit();
    term.setWritePtyCallback(&p, &writePtyFromTerminal);

@@ -202,6 +225,10 @@ fn runTerminal(alloc: std.mem.Allocator) !void {
    defer render_cache.deinit(alloc);
    try render_cache.resizeRows(alloc, rows);

    // === frame timing ===
    var frame_ring = FrameTimingRing{};
    installSigusr1Handler();

    // === main loop ===
    const wl_fd = conn.display.getFd();
    var pollfds = [_]std.posix.pollfd{
@@ -354,10 +381,15 @@ fn runTerminal(alloc: std.mem.Allocator) !void {

        if (!shouldRenderFrame(render_pending, false, false)) continue;

        var frame_timing: FrameTiming = .{};

        // === render ===
        const previous_cursor = term.render_state.cursor;
        var section_timer = std.time.Timer.start() catch unreachable;
        try term.snapshot();
        frame_timing.snapshot_us = usFromTimer(&section_timer);

        section_timer = std.time.Timer.start() catch unreachable;
        const default_bg = term.backgroundColor();
        const bg_uv = atlas.cursorUV();

@@ -449,13 +481,29 @@ fn runTerminal(alloc: std.mem.Allocator) !void {
            cursor_rebuilt = true;
        }

        // Re-upload atlas if new glyphs were added
        frame_timing.row_rebuild_us = usFromTimer(&section_timer);

        section_timer = std.time.Timer.start() catch unreachable;
        // Re-upload atlas if new glyphs were added (incremental)
        if (atlas.dirty) {
            try ctx.uploadAtlas(atlas.pixels);
            const y_start = atlas.last_uploaded_y;
            const y_end = atlas.cursor_y + atlas.row_height;
            if (y_start < y_end) {
                try ctx.uploadAtlasRegion(
                    atlas.pixels,
                    y_start,
                    y_end,
                    atlas.needs_full_upload,
                );
                atlas.last_uploaded_y = atlas.cursor_y;
                atlas.needs_full_upload = false;
                render_cache.layout_dirty = true;
            }
            atlas.dirty = false;
            render_cache.layout_dirty = true;
        }
        frame_timing.atlas_upload_us = usFromTimer(&section_timer);

        section_timer = std.time.Timer.start() catch unreachable;
        const upload_plan = applyRenderPlan(.{
            .layout_dirty = render_cache.layout_dirty,
            .rows_rebuilt = rows_rebuilt,
@@ -514,6 +562,9 @@ fn runTerminal(alloc: std.mem.Allocator) !void {
            }
        }

        frame_timing.instance_upload_us = usFromTimer(&section_timer);

        section_timer = std.time.Timer.start() catch unreachable;
        const baseline_coverage = renderer.coverageVariantParams(.baseline);
        ctx.drawCells(
            render_cache.total_instance_count,
@@ -531,10 +582,22 @@ fn runTerminal(alloc: std.mem.Allocator) !void {
            },
            else => return err,
        };
        frame_timing.gpu_submit_us = usFromTimer(&section_timer);

        frame_ring.push(frame_timing);

        // Check for SIGUSR1 stats dump request
        if (sigusr1_received.swap(false, .acq_rel)) {
            printFrameStats(computeFrameStats(&frame_ring));
        }

        clearConsumedDirtyFlags(&term.render_state.dirty, dirty_rows, refresh_plan);
        render_pending = false;
    }

    // Dump timing stats on exit
    printFrameStats(computeFrameStats(&frame_ring));

    _ = try ctx.vkd.deviceWaitIdle(ctx.device);
}

@@ -855,6 +918,152 @@ fn clampSelectionSpan(span: SelectionSpan, cols: u16, rows: u16) ?SelectionSpan 
    } else null;
}

const FrameTiming = struct {
    snapshot_us: u32 = 0,
    row_rebuild_us: u32 = 0,
    atlas_upload_us: u32 = 0,
    instance_upload_us: u32 = 0,
    gpu_submit_us: u32 = 0,

    fn total(self: FrameTiming) u32 {
        return self.snapshot_us +
            self.row_rebuild_us +
            self.atlas_upload_us +
            self.instance_upload_us +
            self.gpu_submit_us;
    }
};

const FrameTimingRing = struct {
    const capacity = 256;

    entries: [capacity]FrameTiming = [_]FrameTiming{.{}} ** capacity,
    head: usize = 0,
    count: usize = 0,

    fn push(self: *FrameTimingRing, timing: FrameTiming) void {
        const idx = if (self.count < capacity) self.count else self.head;
        self.entries[idx] = timing;
        if (self.count < capacity) {
            self.count += 1;
        } else {
            self.head = (self.head + 1) % capacity;
        }
    }

    /// Return a slice of valid entries in insertion order.
    /// Caller must provide a scratch buffer of `capacity` entries.
    fn orderedSlice(self: *const FrameTimingRing, buf: *[capacity]FrameTiming) []const FrameTiming {
        if (self.count < capacity) {
            return self.entries[0..self.count];
        }
        // Ring has wrapped — copy from head..end then 0..head
        const tail_len = capacity - self.head;
        @memcpy(buf[0..tail_len], self.entries[self.head..capacity]);
        @memcpy(buf[tail_len..capacity], self.entries[0..self.head]);
        return buf[0..capacity];
    }
};

const SectionStats = struct {
    min: u32 = 0,
    avg: u32 = 0,
    p99: u32 = 0,
    max: u32 = 0,
};

const FrameTimingStats = struct {
    snapshot: SectionStats = .{},
    row_rebuild: SectionStats = .{},
    atlas_upload: SectionStats = .{},
    instance_upload: SectionStats = .{},
    gpu_submit: SectionStats = .{},
    total: SectionStats = .{},
    frame_count: usize = 0,
};

fn computeSectionStats(values: []u32) SectionStats {
    if (values.len == 0) return .{};
    std.mem.sort(u32, values, {}, std.sort.asc(u32));
    var sum: u64 = 0;
    for (values) |v| sum += v;
    const p99_idx = if (values.len <= 1) 0 else ((values.len - 1) * 99) / 100;
    return .{
        .min = values[0],
        .avg = @intCast(sum / values.len),
        .p99 = values[p99_idx],
        .max = values[values.len - 1],
    };
}

fn computeFrameStats(ring: *const FrameTimingRing) FrameTimingStats {
    if (ring.count == 0) return .{};

    var ordered_buf: [FrameTimingRing.capacity]FrameTiming = undefined;
    const entries = ring.orderedSlice(&ordered_buf);
    const n = entries.len;

    var snapshot_vals: [FrameTimingRing.capacity]u32 = undefined;
    var row_rebuild_vals: [FrameTimingRing.capacity]u32 = undefined;
    var atlas_upload_vals: [FrameTimingRing.capacity]u32 = undefined;
    var instance_upload_vals: [FrameTimingRing.capacity]u32 = undefined;
    var gpu_submit_vals: [FrameTimingRing.capacity]u32 = undefined;
    var total_vals: [FrameTimingRing.capacity]u32 = undefined;

    for (entries, 0..) |e, i| {
        snapshot_vals[i] = e.snapshot_us;
        row_rebuild_vals[i] = e.row_rebuild_us;
        atlas_upload_vals[i] = e.atlas_upload_us;
        instance_upload_vals[i] = e.instance_upload_us;
        gpu_submit_vals[i] = e.gpu_submit_us;
        total_vals[i] = e.total();
    }

    return .{
        .snapshot = computeSectionStats(snapshot_vals[0..n]),
        .row_rebuild = computeSectionStats(row_rebuild_vals[0..n]),
        .atlas_upload = computeSectionStats(atlas_upload_vals[0..n]),
        .instance_upload = computeSectionStats(instance_upload_vals[0..n]),
        .gpu_submit = computeSectionStats(gpu_submit_vals[0..n]),
        .total = computeSectionStats(total_vals[0..n]),
        .frame_count = n,
    };
}

fn printFrameStats(stats: FrameTimingStats) void {
    const row_fmt = "{s:<20}{d:>6}{d:>6}{d:>6}{d:>6}\n";
    std.debug.print("\n=== waystty frame timing ({d} frames) ===\n", .{stats.frame_count});
    std.debug.print("{s:<20}{s:>6}{s:>6}{s:>6}{s:>6}  (us)\n", .{ "section", "min", "avg", "p99", "max" });
    std.debug.print(row_fmt, .{ "snapshot",        stats.snapshot.min,        stats.snapshot.avg,        stats.snapshot.p99,        stats.snapshot.max });
    std.debug.print(row_fmt, .{ "row_rebuild",     stats.row_rebuild.min,     stats.row_rebuild.avg,     stats.row_rebuild.p99,     stats.row_rebuild.max });
    std.debug.print(row_fmt, .{ "atlas_upload",    stats.atlas_upload.min,    stats.atlas_upload.avg,    stats.atlas_upload.p99,    stats.atlas_upload.max });
    std.debug.print(row_fmt, .{ "instance_upload", stats.instance_upload.min, stats.instance_upload.avg, stats.instance_upload.p99, stats.instance_upload.max });
    std.debug.print(row_fmt, .{ "gpu_submit",      stats.gpu_submit.min,      stats.gpu_submit.avg,      stats.gpu_submit.p99,      stats.gpu_submit.max });
    std.debug.print("----------------------------------------------------\n", .{});
    std.debug.print(row_fmt, .{ "total",           stats.total.min,           stats.total.avg,           stats.total.p99,           stats.total.max });
}

var sigusr1_received: std.atomic.Value(bool) = std.atomic.Value(bool).init(false);

fn sigusr1Handler(_: c_int) callconv(.c) void {
    sigusr1_received.store(true, .release);
}

fn installSigusr1Handler() void {
    const act = std.posix.Sigaction{
        .handler = .{ .handler = sigusr1Handler },
        .mask = std.posix.sigemptyset(),
        .flags = std.posix.SA.RESTART,
    };
    std.posix.sigaction(std.posix.SIG.USR1, &act, null);
}

fn usFromTimer(timer: *std.time.Timer) u32 {
    const ns = timer.read();
    const us = ns / std.time.ns_per_us;
    return std.math.cast(u32, us) orelse std.math.maxInt(u32);
}

test "SelectionSpan.normalized orders endpoints in reading order" {
    const span = (SelectionSpan{
        .start = .{ .col = 7, .row = 4 },
@@ -2914,6 +3123,82 @@ test "buildTextCoverageCompareScene repeats the same specimen in four panels" {
    );
}

test "FrameTiming.total sums all sections" {
    const ft: FrameTiming = .{
        .snapshot_us = 10,
        .row_rebuild_us = 20,
        .atlas_upload_us = 30,
        .instance_upload_us = 40,
        .gpu_submit_us = 50,
    };
    try std.testing.expectEqual(@as(u32, 150), ft.total());
}

test "FrameTimingRing records and wraps correctly" {
    var ring = FrameTimingRing{};
    try std.testing.expectEqual(@as(usize, 0), ring.count);

    ring.push(.{ .snapshot_us = 1, .row_rebuild_us = 2, .atlas_upload_us = 3, .instance_upload_us = 4, .gpu_submit_us = 5 });
    try std.testing.expectEqual(@as(usize, 1), ring.count);
    try std.testing.expectEqual(@as(u32, 1), ring.entries[0].snapshot_us);

    // Fill to capacity
    for (1..FrameTimingRing.capacity) |i| {
        ring.push(.{ .snapshot_us = @intCast(i + 1), .row_rebuild_us = 0, .atlas_upload_us = 0, .instance_upload_us = 0, .gpu_submit_us = 0 });
    }
    try std.testing.expectEqual(FrameTimingRing.capacity, ring.count);

    // One more wraps around — overwrites entries[0], head advances to 1
    ring.push(.{ .snapshot_us = 999, .row_rebuild_us = 0, .atlas_upload_us = 0, .instance_upload_us = 0, .gpu_submit_us = 0 });
    try std.testing.expectEqual(FrameTimingRing.capacity, ring.count);
    // Newest entry is at (head + capacity - 1) % capacity = 0
    try std.testing.expectEqual(@as(u32, 999), ring.entries[0].snapshot_us);
    // head has advanced past the overwritten slot
    try std.testing.expectEqual(@as(usize, 1), ring.head);
}

test "FrameTimingRing.orderedSlice returns entries in insertion order after wrap" {
    var ring = FrameTimingRing{};
    // Push capacity + 3 entries so the ring wraps
    for (0..FrameTimingRing.capacity + 3) |i| {
        ring.push(.{ .snapshot_us = @intCast(i), .row_rebuild_us = 0, .atlas_upload_us = 0, .instance_upload_us = 0, .gpu_submit_us = 0 });
    }
    var buf: [FrameTimingRing.capacity]FrameTiming = undefined;
    const ordered = ring.orderedSlice(&buf);
    try std.testing.expectEqual(FrameTimingRing.capacity, ordered.len);
    // First entry should be the 4th pushed (index 3), last should be capacity+2
    try std.testing.expectEqual(@as(u32, 3), ordered[0].snapshot_us);
    try std.testing.expectEqual(@as(u32, FrameTimingRing.capacity + 2), ordered[ordered.len - 1].snapshot_us);
}

test "FrameTimingStats computes min/avg/p99/max correctly" {
    var ring = FrameTimingRing{};
    // Push 100 frames with snapshot_us = 1..100
    for (0..100) |i| {
        ring.push(.{
            .snapshot_us = @intCast(i + 1),
            .row_rebuild_us = 0,
            .atlas_upload_us = 0,
            .instance_upload_us = 0,
            .gpu_submit_us = 0,
        });
    }
    const stats = computeFrameStats(&ring);
    try std.testing.expectEqual(@as(u32, 1), stats.snapshot.min);
    try std.testing.expectEqual(@as(u32, 100), stats.snapshot.max);
    try std.testing.expectEqual(@as(u32, 50), stats.snapshot.avg);
    // p99 of 1..100 = value at index 98 (0-based) = 99
    try std.testing.expectEqual(@as(u32, 99), stats.snapshot.p99);
    try std.testing.expectEqual(@as(usize, 100), stats.frame_count);
}

test "FrameTimingStats handles empty ring" {
    var ring = FrameTimingRing{};
    const stats = computeFrameStats(&ring);
    try std.testing.expectEqual(@as(usize, 0), stats.frame_count);
    try std.testing.expectEqual(@as(u32, 0), stats.snapshot.min);
}

fn runRenderSmokeTest(alloc: std.mem.Allocator) !void {
    const conn = try wayland_client.Connection.init(alloc);
    defer conn.deinit();
diff --git a/src/pty.zig b/src/pty.zig
index 4afda9c..86d3c55 100644
--- a/src/pty.zig
+++ b/src/pty.zig
@@ -19,6 +19,7 @@ pub const Pty = struct {
        cols: u16,
        rows: u16,
        shell: [:0]const u8,
        shell_args: ?[]const [:0]const u8 = null,
    };

    pub fn spawn(opts: SpawnOptions) !Pty {
@@ -37,8 +38,18 @@ pub const Pty = struct {
            // Child process
            _ = c.setenv("TERM", "xterm-256color", 1);

            var argv = [_:null]?[*:0]const u8{ opts.shell.ptr, null };
            std.posix.execveZ(opts.shell.ptr, &argv, std.c.environ) catch {};
            if (opts.shell_args) |args| {
                std.debug.assert(args.len < 15); // argv[0] = shell, must fit in 16-slot buffer
                var argv_buf: [16:null]?[*:0]const u8 = .{null} ** 16;
                argv_buf[0] = opts.shell.ptr;
                for (args, 1..) |arg, i| {
                    argv_buf[i] = arg.ptr;
                }
                std.posix.execveZ(opts.shell.ptr, &argv_buf, std.c.environ) catch {};
            } else {
                var argv = [_:null]?[*:0]const u8{ opts.shell.ptr, null };
                std.posix.execveZ(opts.shell.ptr, &argv, std.c.environ) catch {};
            }
            std.process.exit(1);
        }