When nanoseconds matter and every allocation counts, Zig shines as a systems programming language designed for performance. Unlike higher-level languages that hide costs behind abstractions, Zig gives you direct control while remaining readable and maintainable.
Zig was designed by Andrew Kelley after years of working on performance-critical systems. Its philosophy is simple: no hidden control flow, no hidden allocations, no hidden costs.
Key performance advantages:
- Zero-overhead abstractions: Generics and comptime generate optimal code
- LLVM backend: Access to world-class optimizations
- No hidden allocations: Every allocation is explicit and controllable
- Predictable performance: No garbage collector, no runtime surprises
- Cache-friendly design: Packed structs, explicit alignment control
Explicit Memory Layout Control
For cache-friendly data structures, Zig offers fine-grained control:
const std = @import("std");

/// Bit-packed layout: fields live in a single backing integer, so there is
/// no padding between them.
const PackedVertex = packed struct {
    x: f32,
    y: f32,
    z: f32,
    color: u32, // RGBA packed into 4 bytes
};

/// C-ABI-compatible layout: field order and offsets are guaranteed, which
/// makes it safe to hand to C code or copy to GPU buffers.
const ExternVertex = extern struct {
    position: [3]f32,
    normal: [3]f32,
    uv: [2]f32,
};

/// Over-aligned storage: the 16-byte alignment suits aligned 128-bit
/// SIMD loads and stores.
const AlignedVector = struct {
    data: [4]f32 align(16),
};

pub fn main() void {
    std.debug.print("PackedVertex size: {d} bytes\n", .{@sizeOf(PackedVertex)});
    std.debug.print("ExternVertex size: {d} bytes\n", .{@sizeOf(ExternVertex)});
    std.debug.print("AlignedVector alignment: {d}\n", .{@alignOf(AlignedVector)});
}
SIMD Vectorization
Zig exposes SIMD intrinsics as first-class types:
const std = @import("std");

// 256-bit vector: 8 x f32. @Vector is the builtin SIMD vector type; the old
// `std.meta.Vector` alias has been removed from the standard library, so the
// dead alias that used to sit here is gone.
const Vec8f = @Vector(8, f32);

/// Element-wise multiply followed by a horizontal add across all 8 lanes.
fn dotProduct8(a: Vec8f, b: Vec8f) f32 {
    const product = a * b;
    return @reduce(.Add, product);
}

/// Normalize each 8-float chunk in place to unit length.
/// The 32-byte slice alignment permits aligned 256-bit loads/stores.
fn normalizeVectors(vectors: []align(32) [8]f32) void {
    for (vectors) |*v| {
        const vec: Vec8f = v.*; // array -> vector coercion
        const squared = vec * vec;
        const sum = @reduce(.Add, squared);
        const inv_len = 1.0 / @sqrt(sum);
        v.* = vec * @as(Vec8f, @splat(inv_len));
    }
}

pub fn main() void {
    const a: Vec8f = .{ 1, 2, 3, 4, 5, 6, 7, 8 };
    const b: Vec8f = .{ 8, 7, 6, 5, 4, 3, 2, 1 };
    const dot = dotProduct8(a, b);
    std.debug.print("Dot product: {d}\n", .{dot});
}
Custom Allocators
Zig’s allocator interface enables specialized allocation strategies:
const std = @import("std");

/// Arena allocator: fast bump allocation, bulk deallocation.
fn processWithArena() !void {
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit(); // Free everything at once
    const allocator = arena.allocator();

    // Fast allocations - no individual bookkeeping. The slice binding is
    // never reassigned, so it must be `const` (an unmutated `var` is a
    // compile error); the elements stay mutable through the slice.
    const items = try allocator.alloc(u64, 10000);
    for (items, 0..) |*item, i| {
        item.* = i * i;
    }
    // No need to free individual allocations - the arena owns them all.
}

/// Fixed buffer allocator: zero heap usage.
fn processStackOnly() !void {
    var buffer: [4096]u8 = undefined;
    var fba = std.heap.FixedBufferAllocator.init(&buffer);
    const allocator = fba.allocator();

    // All allocations come from the stack buffer.
    var list = std.ArrayList(u32).init(allocator);
    // A no-op for the FBA's memory, but pairing every init with a deinit
    // keeps the code correct if the allocator is ever swapped out.
    defer list.deinit();
    try list.appendSlice(&.{ 1, 2, 3, 4, 5 });
}

/// Pool allocator: fast fixed-size allocations backed by a free list.
/// Released nodes are recycled before the backing allocator is touched.
const ObjectPool = struct {
    const Node = struct {
        data: [64]u8,
        next: ?*Node,
    };

    free_list: ?*Node,
    backing: std.mem.Allocator,

    pub fn init(backing: std.mem.Allocator) ObjectPool {
        return .{ .free_list = null, .backing = backing };
    }

    /// Reuse a recycled node when available; otherwise allocate a new one.
    pub fn acquire(self: *ObjectPool) !*Node {
        if (self.free_list) |node| {
            self.free_list = node.next;
            return node;
        }
        return try self.backing.create(Node);
    }

    /// Return a node to the pool; the next acquire() hands it back out.
    pub fn release(self: *ObjectPool, node: *Node) void {
        node.next = self.free_list;
        self.free_list = node;
    }
};
Lock-Free Data Structures
Zig provides atomic operations for concurrent programming:
const std = @import("std");
const Atomic = std.atomic.Value;

/// Lock-free single-producer, single-consumer ring buffer.
/// One slot is sacrificed to tell "full" apart from "empty", so at most
/// `capacity - 1` items are in flight at once.
fn SpscQueue(comptime T: type, comptime capacity: usize) type {
    return struct {
        const Self = @This();

        buffer: [capacity]T = undefined,
        head: Atomic(usize) = Atomic(usize).init(0), // consumer cursor
        tail: Atomic(usize) = Atomic(usize).init(0), // producer cursor

        /// Producer side. Returns false when the queue is full.
        pub fn push(self: *Self, item: T) bool {
            const write_pos = self.tail.load(.acquire);
            const bumped = (write_pos + 1) % capacity;
            // Catching up to head would make a full queue look empty.
            if (bumped == self.head.load(.acquire)) {
                return false; // Queue full
            }
            self.buffer[write_pos] = item;
            // Release: the slot write must be visible before the new tail.
            self.tail.store(bumped, .release);
            return true;
        }

        /// Consumer side. Returns null when the queue is empty.
        pub fn pop(self: *Self) ?T {
            const read_pos = self.head.load(.acquire);
            if (read_pos == self.tail.load(.acquire)) {
                return null; // Queue empty
            }
            const item = self.buffer[read_pos];
            // Release: the slot may be overwritten only after this lands.
            self.head.store((read_pos + 1) % capacity, .release);
            return item;
        }
    };
}

pub fn main() void {
    var queue = SpscQueue(u64, 1024){};
    _ = queue.push(42);
    _ = queue.push(123);
    while (queue.pop()) |value| {
        std.debug.print("Popped: {d}\n", .{value});
    }
}
Zero-Copy Parsing
For high-performance parsers, avoid allocations entirely:
const std = @import("std");

/// A token's category plus the exact characters that produced it.
const Token = struct {
    kind: enum { identifier, number, operator, eof },
    text: []const u8, // Points into original input
};

/// Zero-allocation lexer: every token's text borrows from `input`.
const Lexer = struct {
    input: []const u8,
    pos: usize = 0,

    pub fn init(input: []const u8) Lexer {
        return .{ .input = input };
    }

    /// Produce the next token, or `.eof` once the input is exhausted.
    pub fn next(self: *Lexer) Token {
        self.skipWhitespace();
        if (self.pos >= self.input.len) {
            return .{ .kind = .eof, .text = "" };
        }
        const start = self.pos;
        const first = self.input[self.pos];
        if (std.ascii.isAlphabetic(first)) {
            // Identifier: letter followed by letters/digits.
            self.consumeWhile(std.ascii.isAlphanumeric);
            return .{ .kind = .identifier, .text = self.input[start..self.pos] };
        }
        if (std.ascii.isDigit(first)) {
            self.consumeWhile(std.ascii.isDigit);
            return .{ .kind = .number, .text = self.input[start..self.pos] };
        }
        // Anything else is a single-character operator.
        self.pos += 1;
        return .{ .kind = .operator, .text = self.input[start..self.pos] };
    }

    /// Advance the cursor while `pred` holds for the current character.
    fn consumeWhile(self: *Lexer, comptime pred: fn (u8) bool) void {
        while (self.pos < self.input.len and pred(self.input[self.pos])) {
            self.pos += 1;
        }
    }

    fn skipWhitespace(self: *Lexer) void {
        self.consumeWhile(std.ascii.isWhitespace);
    }
};

pub fn main() void {
    const source = "foo + bar123 * 42";
    var lexer = Lexer.init(source);
    while (true) {
        const tok = lexer.next();
        if (tok.kind == .eof) break;
        std.debug.print("{s}: \"{s}\"\n", .{@tagName(tok.kind), tok.text});
    }
}
Compile-Time Code Generation
Generate specialized code paths at compile time:
const std = @import("std");

/// Returns a namespace with an N x N f64 matrix multiply whose loops are
/// fully unrolled at compile time - ideal for small, fixed-size matrices.
fn MatrixMultiply(comptime N: usize) type {
    return struct {
        const Matrix = [N][N]f64;

        /// Classic triple loop, unrolled via `inline for` so the generated
        /// code is straight-line arithmetic with no loop overhead.
        pub fn multiply(a: Matrix, b: Matrix) Matrix {
            var result: Matrix = undefined;
            inline for (0..N) |row| {
                inline for (0..N) |col| {
                    var acc: f64 = 0;
                    inline for (0..N) |k| {
                        acc += a[row][k] * b[k][col];
                    }
                    result[row][col] = acc;
                }
            }
            return result;
        }
    };
}

pub fn main() void {
    const Mat4 = MatrixMultiply(4);
    const identity: Mat4.Matrix = .{
        .{ 1, 0, 0, 0 },
        .{ 0, 1, 0, 0 },
        .{ 0, 0, 1, 0 },
        .{ 0, 0, 0, 1 },
    };
    const result = Mat4.multiply(identity, identity);
    _ = result;
    std.debug.print("4x4 matrix multiply generated at comptime\n", .{});
}
Memory-Mapped I/O
Direct hardware access for embedded and systems programming:
const std = @import("std");

// Memory-mapped peripheral register block.
//
// NOTE(review): this was a `packed struct`, which is a bug for MMIO: a packed
// struct is backed by a single integer, so a volatile access to any one field
// loads/stores the WHOLE backing word - e.g. writing `data` would also read
// `interrupt_status`, which on many peripherals has read-to-clear side
// effects. `extern struct` gives each register its own independently
// addressed byte with a fixed, C-compatible offset.
const GPIO = extern struct {
    data: u8,
    direction: u8,
    interrupt_enable: u8,
    interrupt_status: u8,
};

/// Cast the peripheral's fixed hardware address to a typed volatile pointer.
/// (Address assumed from the original snippet - confirm against the SoC's
/// memory map.)
fn getGPIO() *volatile GPIO {
    return @ptrFromInt(0x4000_0000);
}

/// Configure all pins as outputs, mask interrupts, and clear the outputs.
fn initGPIO() void {
    const gpio = getGPIO();
    // All operations are volatile - not optimized away or reordered.
    gpio.direction = 0xFF; // All pins output
    gpio.interrupt_enable = 0; // Disable interrupts
    gpio.data = 0; // Clear outputs
}

/// Drive a single pin high or low (read-modify-write on the data register).
fn setPin(pin: u3, value: bool) void {
    const gpio = getGPIO();
    const mask = @as(u8, 1) << pin;
    if (value) {
        gpio.data |= mask;
    } else {
        gpio.data &= ~mask;
    }
}
Benchmarking and Profiling
Built-in timing primitives for performance measurement:
const std = @import("std");

/// Time `func` over one million calls and report the mean ns per call.
/// `.never_inline` keeps the optimizer from folding the call away entirely.
fn benchmark(comptime name: []const u8, comptime func: anytype) void {
    const iterations = 1_000_000;
    var timer = std.time.Timer.start() catch unreachable;
    var remaining: usize = iterations;
    while (remaining > 0) : (remaining -= 1) {
        _ = @call(.never_inline, func, .{});
    }
    const elapsed = timer.read();
    const ns_per_op = elapsed / iterations;
    std.debug.print("{s}: {d} ns/op\n", .{ name, ns_per_op });
}

/// Iterative Fibonacci with F(0) = 0 and F(1) = 1.
fn fibonacciIterative(n: u64) u64 {
    if (n <= 1) return n;
    var a: u64 = 0; // F(i - 1)
    var b: u64 = 1; // F(i)
    var i: u64 = 2;
    while (i <= n) : (i += 1) {
        const sum = a + b;
        a = b;
        b = sum;
    }
    return b;
}

pub fn main() void {
    benchmark("fib(30)", struct {
        fn call() u64 {
            return fibonacciIterative(30);
        }
    }.call);
}
Inline Assembly for Critical Paths
When you need ultimate control:
const std = @import("std");

/// Read the x86 time-stamp counter. RDTSC leaves the low 32 bits in EAX and
/// the high 32 bits in EDX; recombine them into a single u64.
fn rdtsc() u64 {
    var lo: u32 = undefined;
    var hi: u32 = undefined;
    asm volatile ("rdtsc"
        : "={eax}" (lo),
          "={edx}" (hi)
    );
    return (@as(u64, hi) << 32) | lo;
}

/// Hint the CPU to pull the cache line at `ptr` into all cache levels.
fn prefetch(ptr: [*]const u8) void {
    asm volatile ("prefetcht0 (%[ptr])"
        :
        : [ptr] "r" (ptr)
    );
}

pub fn main() void {
    const begin = rdtsc();
    // Critical section under measurement.
    var total: u64 = 0;
    for (0..1000) |i| {
        total += i;
    }
    const finish = rdtsc();
    std.debug.print("Cycles: {d}, Sum: {d}\n", .{ finish - begin, total });
}
Build Optimization Flags
Optimize your release builds:
// build.zig
const std = @import("std");

/// Standard build entry point: wires up CLI-selectable target/optimize
/// options and an executable tuned for release performance.
pub fn build(b: *std.Build) void {
    // Let `-Dtarget=...` and `-Doptimize=...` flow in from the command line.
    const target = b.standardTargetOptions(.{});
    const optimize = b.standardOptimizeOption(.{});

    const app = b.addExecutable(.{
        .name = "my_app",
        .root_source_file = b.path("src/main.zig"),
        .target = target,
        .optimize = optimize,
    });

    // Performance-critical settings.
    // NOTE(review): the exact location of these knobs (step field vs.
    // addExecutable/module option) shifts between Zig releases - verify
    // against the toolchain version this project targets.
    app.want_lto = true; // Link-time optimization
    app.strip = true; // Strip debug symbols
    app.single_threaded = true; // If not using threads
    b.installArtifact(app);
}
Build with maximum optimization:
zig build -Doptimize=ReleaseFast
Conclusion
Zig provides the tools for building truly high-performance systems:
- Explicit control: No hidden costs or allocations
- SIMD support: First-class vector types
- Custom allocators: Tailored memory strategies
- Comptime: Zero-cost abstractions via code generation
- Low-level access: Inline assembly and memory-mapped I/O
The language proves that safety and performance aren’t mutually exclusive—you can have readable, maintainable code that runs as fast as hand-tuned C.
At Sajima Solutions, we use Zig for performance-critical components where every cycle counts. Contact us to discuss how we can optimize your systems for maximum performance.