From 4a511f86e1c841f051ed8c8d7cd4ec96920445d0 Mon Sep 17 00:00:00 2001
From: Karl Seguin <k@openmymind.io>
Date: Mon, 25 Aug 2025 08:24:24 +0800
Subject: [PATCH] Remove open_data_file

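On Linux, open_data_file fails to compile because of invalid import paths
(for example, @import("../superblock.zig")). It's not clear why Zig 0.15 cares,
since the function is never used, but it does. This is a small pre-emptive
change for Zig 0.15: remove open_data_file from both io/darwin.zig and
io/linux.zig. Along the same lines, superblock.zig now imports its siblings
without the "../" prefix and drops the checks that imported
"../testing/storage.zig".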
---
 io/darwin.zig  | 102 ------------------
 io/linux.zig   | 288 -------------------------------------------------
 superblock.zig |  18 +---
 3 files changed, 3 insertions(+), 405 deletions(-)

diff --git a/io/darwin.zig b/io/darwin.zig
index ae427c0..73247f4 100644
--- a/io/darwin.zig
+++ b/io/darwin.zig
@@ -827,108 +827,6 @@ pub const IO = struct {
     pub const fd_t = posix.fd_t;
     pub const INVALID_FILE: fd_t = -1;
 
-    /// Opens or creates a journal file:
-    /// - For reading and writing.
-    /// - For Direct I/O (required on darwin).
-    /// - Obtains an advisory exclusive lock to the file descriptor.
-    /// - Allocates the file contiguously on disk if this is supported by the file system.
-    /// - Ensures that the file data (and file inode in the parent directory) is durable on disk.
-    ///   The caller is responsible for ensuring that the parent directory inode is durable.
-    /// - Verifies that the file size matches the expected file size before returning.
-    pub fn open_data_file(
-        self: *IO,
-        dir_fd: fd_t,
-        relative_path: []const u8,
-        size: u64,
-        method: enum { create, create_or_open, open, open_read_only },
-        direct_io: DirectIO,
-    ) !fd_t {
-        _ = self;
-
-        assert(relative_path.len > 0);
-        assert(size % constants.sector_size == 0);
-
-        // TODO Use O_EXCL when opening as a block device to obtain a mandatory exclusive lock.
-        // This is much stronger than an advisory exclusive lock, and is required on some platforms.
-
-        // Normally, O_DSYNC enables us to omit fsync() calls in the data plane, since we sync to
-        // the disk on every write, but that's not the case for Darwin:
-        // https://x.com/TigerBeetleDB/status/1536628729031581697
-        // To work around this, fs_sync() is explicitly called after writing in do_operation.
-        var flags: posix.O = .{
-            .CLOEXEC = true,
-            .ACCMODE = if (method == .open_read_only) .RDONLY else .RDWR,
-            .DSYNC = true,
-        };
-        var mode: posix.mode_t = 0;
-
-        // TODO Document this and investigate whether this is in fact correct to set here.
-        if (@hasField(posix.O, "LARGEFILE")) flags.LARGEFILE = true;
-
-        switch (method) {
-            .create => {
-                flags.CREAT = true;
-                flags.EXCL = true;
-                mode = 0o666;
-                log.info("creating \"{s}\"...", .{relative_path});
-            },
-            .create_or_open => {
-                flags.CREAT = true;
-                mode = 0o666;
-                log.info("opening or creating \"{s}\"...", .{relative_path});
-            },
-            .open, .open_read_only => {
-                log.info("opening \"{s}\"...", .{relative_path});
-            },
-        }
-
-        // This is critical as we rely on O_DSYNC for fsync() whenever we write to the file:
-        assert(flags.DSYNC);
-
-        // Be careful with openat(2): "If pathname is absolute, then dirfd is ignored." (man page)
-        assert(!std.fs.path.isAbsolute(relative_path));
-        const fd = try posix.openat(dir_fd, relative_path, flags, mode);
-        // TODO Return a proper error message when the path exists or does not exist (init/start).
-        errdefer posix.close(fd);
-
-        // TODO Check that the file is actually a file.
-
-        // On darwin assume that Direct I/O is always supported.
-        // Use F_NOCACHE to disable the page cache as O_DIRECT doesn't exist.
-        if (direct_io != .direct_io_disabled) {
-            _ = try posix.fcntl(fd, posix.F.NOCACHE, 1);
-        }
-
-        // Obtain an advisory exclusive lock that works only if all processes actually use flock().
-        // LOCK_NB means that we want to fail the lock without waiting if another process has it.
-        posix.flock(fd, posix.LOCK.EX | posix.LOCK.NB) catch |err| switch (err) {
-            error.WouldBlock => @panic("another process holds the data file lock"),
-            else => return err,
-        };
-
-        // Ask the file system to allocate contiguous sectors for the file (if possible):
-        // If the file system does not support `fallocate()`, then this could mean more seeks or a
-        // panic if we run out of disk space (ENOSPC).
-        if (method == .create) try fs_allocate(fd, size);
-
-        // The best fsync strategy is always to fsync before reading because this prevents us from
-        // making decisions on data that was never durably written by a previously crashed process.
-        // We therefore always fsync when we open the path, also to wait for any pending O_DSYNC.
-        // Thanks to Alex Miller from FoundationDB for diving into our source and pointing this out.
-        try fs_sync(fd);
-
-        // We fsync the parent directory to ensure that the file inode is durably written.
-        // The caller is responsible for the parent directory inode stored under the grandparent.
-        // We always do this when opening because we don't know if this was done before crashing.
-        try fs_sync(dir_fd);
-
-        // TODO Document that `size` is now `data_file_size_min` from `main.zig`.
-        const stat = try posix.fstat(fd);
-        if (stat.size < size) @panic("data file inode size was truncated or corrupted");
-
-        return fd;
-    }
-
     /// Darwin's fsync() syscall does not flush past the disk cache. We must use F_FULLFSYNC
     /// instead.
     /// https://twitter.com/TigerBeetleDB/status/1422491736224436225
diff --git a/io/linux.zig b/io/linux.zig
index f734b33..c2c079a 100644
--- a/io/linux.zig
+++ b/io/linux.zig
@@ -1518,294 +1518,6 @@ pub const IO = struct {
     pub const fd_t = posix.fd_t;
     pub const INVALID_FILE: fd_t = -1;
 
-    /// Opens or creates a journal file:
-    /// - For reading and writing.
-    /// - For Direct I/O (if possible in development mode, but required in production mode).
-    /// - Obtains an advisory exclusive lock to the file descriptor.
-    /// - Allocates the file contiguously on disk if this is supported by the file system.
-    /// - Ensures that the file data (and file inode in the parent directory) is durable on disk.
-    ///   The caller is responsible for ensuring that the parent directory inode is durable.
-    /// - Verifies that the file size matches the expected file size before returning.
-    pub fn open_data_file(
-        self: *IO,
-        dir_fd: fd_t,
-        relative_path: []const u8,
-        size: u64,
-        method: enum { create, create_or_open, open, open_read_only },
-        direct_io: DirectIO,
-    ) !fd_t {
-        _ = self;
-
-        assert(relative_path.len > 0);
-        assert(size % constants.sector_size == 0);
-        // Be careful with openat(2): "If pathname is absolute, then dirfd is ignored." (man page)
-        assert(!std.fs.path.isAbsolute(relative_path));
-
-        var flags: posix.O = .{
-            .CLOEXEC = true,
-            .ACCMODE = if (method == .open_read_only) .RDONLY else .RDWR,
-            .DSYNC = true,
-        };
-        var mode: posix.mode_t = 0;
-
-        const kind: enum { file, block_device } = blk: {
-            const stat = posix.fstatat(
-                dir_fd,
-                relative_path,
-                0,
-            ) catch |err| switch (err) {
-                error.FileNotFound => {
-                    if (method == .create or method == .create_or_open) {
-                        // It's impossible to distinguish creating a new file and opening a new
-                        // block device with the current API. So if it's possible that we should
-                        // create a file we try that instead of failing here.
-                        break :blk .file;
-                    } else {
-                        @panic("Path does not exist.");
-                    }
-                },
-                else => |err_| return err_,
-            };
-            if (posix.S.ISBLK(stat.mode)) {
-                break :blk .block_device;
-            } else {
-                if (!posix.S.ISREG(stat.mode)) {
-                    @panic("file path does not point to block device or regular file.");
-                }
-                break :blk .file;
-            }
-        };
-
-        // This is not strictly necessary on 64bit systems but it's harmless.
-        // This will avoid errors with handling large files on certain configurations
-        // of 32bit kernels. In all other cases, it's a noop.
-        // See: <https://github.com/torvalds/linux/blob/ab27740f76654ed58dd32ac0ba0031c18a6dea3b/fs/open.c#L1602>
-        if (@hasField(posix.O, "LARGEFILE")) flags.LARGEFILE = true;
-
-        switch (kind) {
-            .block_device => {
-                if (direct_io != .direct_io_disabled) {
-                    // Block devices should always support Direct IO.
-                    flags.DIRECT = true;
-                    // Use O_EXCL when opening as a block device to obtain an advisory exclusive
-                    // lock. Normally, you can't do this for files you don't create, but for
-                    // block devices this guarantees:
-                    //     - that there are no mounts using this block device
-                    //     - that no new mounts can use this block device while we have it open
-                    //
-                    // However it doesn't prevent other processes with root from opening without
-                    // O_EXCL and writing (mount is just a special case that always checks O_EXCL).
-                    //
-                    // This should be stronger than flock(2) locks, which work on a separate system.
-                    // The relevant kernel code (as of v6.7) is here:
-                    // <https://github.com/torvalds/linux/blob/7da71072e1d6967c0482abcbb5991ffb5953fdf2/block/bdev.c#L932>
-                    flags.EXCL = true;
-                }
-                log.info("opening block device \"{s}\"...", .{relative_path});
-            },
-            .file => {
-                var direct_io_supported = false;
-                const dir_on_tmpfs = try fs_is_tmpfs(dir_fd);
-
-                if (dir_on_tmpfs) {
-                    log.warn(
-                        "tmpfs is not durable, and your data will be lost on reboot",
-                        .{},
-                    );
-                }
-
-                // Special case. tmpfs doesn't support Direct I/O. Normally we would panic
-                // here (see below) but being able to benchmark production workloads
-                // on tmpfs is very useful for removing
-                // disk speed from the equation.
-                if (direct_io != .direct_io_disabled and !dir_on_tmpfs) {
-                    direct_io_supported = try fs_supports_direct_io(dir_fd);
-                    if (direct_io_supported) {
-                        flags.DIRECT = true;
-                    } else if (direct_io == .direct_io_optional) {
-                        log.warn("This file system does not support Direct I/O.", .{});
-                    } else {
-                        assert(direct_io == .direct_io_required);
-                        // We require Direct I/O for safety to handle fsync failure correctly, and
-                        // therefore panic in production if it is not supported.
-                        log.err("This file system does not support Direct I/O.", .{});
-                        log.err("TigerBeetle uses Direct I/O to bypass the kernel page cache, " ++
-                            "to ensure that data is durable when writes complete.", .{});
-                        log.err("If this is a production replica, Direct I/O is required.", .{});
-                        log.err("If this is a development/testing replica, " ++
-                            "re-run with --development set to bypass this error.", .{});
-                        @panic("file system does not support Direct I/O");
-                    }
-                }
-
-                switch (method) {
-                    .create => {
-                        flags.CREAT = true;
-                        flags.EXCL = true;
-                        mode = 0o666;
-                        log.info("creating \"{s}\"...", .{relative_path});
-                    },
-                    .create_or_open => {
-                        flags.CREAT = true;
-                        mode = 0o666;
-                        log.info("opening or creating \"{s}\"...", .{relative_path});
-                    },
-                    .open, .open_read_only => {
-                        log.info("opening \"{s}\"...", .{relative_path});
-                    },
-                }
-            },
-        }
-
-        // This is critical as we rely on O_DSYNC for fsync() whenever we write to the file:
-        assert(flags.DSYNC);
-
-        const fd = try posix.openat(dir_fd, relative_path, flags, mode);
-        // TODO Return a proper error message when the path exists or does not exist (init/start).
-        errdefer posix.close(fd);
-
-        {
-            // Make sure we're getting the type of file descriptor we expect.
-            const stat = try posix.fstat(fd);
-            switch (kind) {
-                .file => assert(posix.S.ISREG(stat.mode)),
-                .block_device => assert(posix.S.ISBLK(stat.mode)),
-            }
-        }
-
-        // Obtain an advisory exclusive lock that works only if all processes actually use flock().
-        // LOCK_NB means that we want to fail the lock without waiting if another process has it.
-        //
-        // This is wrapped inside a retry loop with a sleep because of the interaction between
-        // io_uring semantics and flock: flocks are held per fd, but io_uring will keep a reference
-        // to the fd alive even once a process has been terminated, until all async operations have
-        // been completed.
-        //
-        // This means that when killing and starting a tigerbeetle process in an automated way, you
-        // can see "another process holds the data file lock" errors, even though the process really
-        // has terminated.
-        for (0..2) |_| {
-            posix.flock(fd, posix.LOCK.EX | posix.LOCK.NB) catch |err| switch (err) {
-                error.WouldBlock => {
-                    std.time.sleep(50 * std.time.ns_per_ms);
-                    continue;
-                },
-                else => return err,
-            };
-            break;
-        } else {
-            posix.flock(fd, posix.LOCK.EX | posix.LOCK.NB) catch |err| switch (err) {
-                error.WouldBlock => @panic("another process holds the data file lock"),
-                else => return err,
-            };
-        }
-
-        // Ask the file system to allocate contiguous sectors for the file (if possible):
-        // If the file system does not support `fallocate()`, then this could mean more seeks or a
-        // panic if we run out of disk space (ENOSPC).
-        if (method == .create and kind == .file) {
-            log.info("allocating {}...", .{std.fmt.fmtIntSizeBin(size)});
-            fs_allocate(fd, size) catch |err| switch (err) {
-                error.OperationNotSupported => {
-                    log.warn("file system does not support fallocate(), an ENOSPC will panic", .{});
-                    log.info("allocating by writing to the last sector " ++
-                        "of the file instead...", .{});
-
-                    const sector_size = constants.sector_size;
-                    const sector: [sector_size]u8 align(sector_size) = [_]u8{0} ** sector_size;
-
-                    // Handle partial writes where the physical sector is
-                    // less than a logical sector:
-                    const write_offset = size - sector.len;
-                    var written: usize = 0;
-                    while (written < sector.len) {
-                        written += try posix.pwrite(fd, sector[written..], write_offset + written);
-                    }
-                },
-                else => |e| return e,
-            };
-        }
-
-        // The best fsync strategy is always to fsync before reading because this prevents us from
-        // making decisions on data that was never durably written by a previously crashed process.
-        // We therefore always fsync when we open the path, also to wait for any pending O_DSYNC.
-        // Thanks to Alex Miller from FoundationDB for diving into our source and pointing this out.
-        try posix.fsync(fd);
-
-        // We fsync the parent directory to ensure that the file inode is durably written.
-        // The caller is responsible for the parent directory inode stored under the grandparent.
-        // We always do this when opening because we don't know if this was done before crashing.
-        try posix.fsync(dir_fd);
-
-        switch (kind) {
-            .file => {
-                if ((try posix.fstat(fd)).size < size) {
-                    @panic("data file inode size was truncated or corrupted");
-                }
-            },
-            .block_device => {
-                const BLKGETSIZE64 = os.linux.IOCTL.IOR(0x12, 114, usize);
-                var block_device_size: usize = 0;
-
-                switch (os.linux.E.init(os.linux.ioctl(
-                    fd,
-                    BLKGETSIZE64,
-                    @intFromPtr(&block_device_size),
-                ))) {
-                    .SUCCESS => {},
-
-                    // These are the only errors that are supposed to be possible from ioctl(2).
-                    .BADF => return error.InvalidFileDescriptor,
-                    .NOTTY => return error.BadRequest,
-                    .FAULT => return error.InvalidAddress,
-                    else => |err| return stdx.unexpected_errno("open_file:ioctl", err),
-                }
-
-                if (block_device_size < size) {
-                    std.debug.panic(
-                        "The block device used is too small ({} available/{} needed).",
-                        .{
-                            std.fmt.fmtIntSizeBin(block_device_size),
-                            std.fmt.fmtIntSizeBin(size),
-                        },
-                    );
-                }
-
-                if (method == .create or method == .create_or_open) {
-                    // Check that the first superblock_zone_size bytes are 0.
-                    // - It'll ensure that the block device is not directly TigerBeetle.
-                    // - It'll be very likely to catch any cases where there's an existing
-                    //   other filesystem.
-                    // - In the case of there being a partition table (eg, two partitions,
-                    //   one starting at 0MiB, one at 1024MiB) and the operator tries to format
-                    //   the raw disk (/dev/sda) while a partition later is
-                    //   TigerBeetle (/dev/sda2) it'll be blocked by the MBR/GPT existing.
-                    const superblock_zone_size =
-                        @import("../superblock.zig").superblock_zone_size;
-                    var read_buf: [superblock_zone_size]u8 align(constants.sector_size) = undefined;
-
-                    // We can do this without worrying about retrying partial reads because on
-                    // linux, read(2) on block devices can not be interrupted by signals.
-                    // See signal(7).
-                    assert(superblock_zone_size == try posix.read(fd, &read_buf));
-                    if (!std.mem.allEqual(u8, &read_buf, 0)) {
-                        std.debug.panic(
-                            "Superblock on block device not empty. " ++
-                                "If this is the correct block device to use, " ++
-                                "please zero the first {} using a tool like dd.",
-                            .{std.fmt.fmtIntSizeBin(superblock_zone_size)},
-                        );
-                    }
-                    // Reset position in the block device to compensate for read(2).
-                    try posix.lseek_CUR(fd, -superblock_zone_size);
-                    assert(try posix.lseek_CUR_get(fd) == 0);
-                }
-            },
-        }
-
-        return fd;
-    }
-
     /// Detects whether the underlying file system for a given directory fd is tmpfs. This is used
     /// to relax our Direct I/O check - running on tmpfs for benchmarking is useful.
     fn fs_is_tmpfs(dir_fd: fd_t) !bool {
diff --git a/superblock.zig b/superblock.zig
index eb52cc0..2a1a239 100644
--- a/superblock.zig
+++ b/superblock.zig
@@ -33,9 +33,9 @@ const maybe = stdx.maybe;
 const mem = std.mem;
 const meta = std.meta;
 
-const constants = @import("../constants.zig");
-const stdx = @import("../stdx.zig");
-const vsr = @import("../vsr.zig");
+const constants = @import("constants.zig");
+const stdx = @import("stdx.zig");
+const vsr = @import("vsr.zig");
 const log = std.log.scoped(.superblock);
 
 pub const Quorums = @import("superblock_quorums.zig").QuorumsType(.{
@@ -1572,12 +1572,6 @@ pub fn SuperBlockType(comptime Storage: type) type {
                     @tagName(context.caller),
                 });
 
-                if (Storage == @import("../testing/storage.zig").Storage) {
-                    // We should have finished all pending superblock io before starting any more.
-                    superblock.storage.assert_no_pending_reads(.superblock);
-                    superblock.storage.assert_no_pending_writes(.superblock);
-                }
-
                 if (context.caller == .open) {
                     superblock.read_working(context, .open);
                 } else {
@@ -1594,12 +1588,6 @@ pub fn SuperBlockType(comptime Storage: type) type {
                 @tagName(context.caller),
             });
 
-            if (Storage == @import("../testing/storage.zig").Storage) {
-                // We should have finished all pending io by now.
-                superblock.storage.assert_no_pending_reads(.superblock);
-                superblock.storage.assert_no_pending_writes(.superblock);
-            }
-
             switch (context.caller) {
                 .format => {},
                 .open => {