From 4a511f86e1c841f051ed8c8d7cd4ec96920445d0 Mon Sep 17 00:00:00 2001 From: Karl Seguin Date: Mon, 25 Aug 2025 08:24:24 +0800 Subject: [PATCH] Remove open_data_file On linux, this can't compile - it relates to invalid import paths. Not sure why Zig 0.15 cares - since it's never touched, but it does. Just a small pre-emptive change for Zig 0.15. --- io/darwin.zig | 102 ------------------ io/linux.zig | 288 ------------------------------------------------- superblock.zig | 18 +--- 3 files changed, 3 insertions(+), 405 deletions(-) diff --git a/io/darwin.zig b/io/darwin.zig index ae427c0..73247f4 100644 --- a/io/darwin.zig +++ b/io/darwin.zig @@ -827,108 +827,6 @@ pub const IO = struct { pub const fd_t = posix.fd_t; pub const INVALID_FILE: fd_t = -1; - /// Opens or creates a journal file: - /// - For reading and writing. - /// - For Direct I/O (required on darwin). - /// - Obtains an advisory exclusive lock to the file descriptor. - /// - Allocates the file contiguously on disk if this is supported by the file system. - /// - Ensures that the file data (and file inode in the parent directory) is durable on disk. - /// The caller is responsible for ensuring that the parent directory inode is durable. - /// - Verifies that the file size matches the expected file size before returning. - pub fn open_data_file( - self: *IO, - dir_fd: fd_t, - relative_path: []const u8, - size: u64, - method: enum { create, create_or_open, open, open_read_only }, - direct_io: DirectIO, - ) !fd_t { - _ = self; - - assert(relative_path.len > 0); - assert(size % constants.sector_size == 0); - - // TODO Use O_EXCL when opening as a block device to obtain a mandatory exclusive lock. - // This is much stronger than an advisory exclusive lock, and is required on some platforms. - - // Normally, O_DSYNC enables us to omit fsync() calls in the data plane, since we sync to - // the disk on every write, but that's not the case for Darwin: - // https://x.com/TigerBeetleDB/status/1536628729031581697 - // To work around this, fs_sync() is explicitly called after writing in do_operation. - var flags: posix.O = .{ - .CLOEXEC = true, - .ACCMODE = if (method == .open_read_only) .RDONLY else .RDWR, - .DSYNC = true, - }; - var mode: posix.mode_t = 0; - - // TODO Document this and investigate whether this is in fact correct to set here. - if (@hasField(posix.O, "LARGEFILE")) flags.LARGEFILE = true; - - switch (method) { - .create => { - flags.CREAT = true; - flags.EXCL = true; - mode = 0o666; - log.info("creating \"{s}\"...", .{relative_path}); - }, - .create_or_open => { - flags.CREAT = true; - mode = 0o666; - log.info("opening or creating \"{s}\"...", .{relative_path}); - }, - .open, .open_read_only => { - log.info("opening \"{s}\"...", .{relative_path}); - }, - } - - // This is critical as we rely on O_DSYNC for fsync() whenever we write to the file: - assert(flags.DSYNC); - - // Be careful with openat(2): "If pathname is absolute, then dirfd is ignored." (man page) - assert(!std.fs.path.isAbsolute(relative_path)); - const fd = try posix.openat(dir_fd, relative_path, flags, mode); - // TODO Return a proper error message when the path exists or does not exist (init/start). - errdefer posix.close(fd); - - // TODO Check that the file is actually a file. - - // On darwin assume that Direct I/O is always supported. - // Use F_NOCACHE to disable the page cache as O_DIRECT doesn't exist. - if (direct_io != .direct_io_disabled) { - _ = try posix.fcntl(fd, posix.F.NOCACHE, 1); - } - - // Obtain an advisory exclusive lock that works only if all processes actually use flock(). - // LOCK_NB means that we want to fail the lock without waiting if another process has it. - posix.flock(fd, posix.LOCK.EX | posix.LOCK.NB) catch |err| switch (err) { - error.WouldBlock => @panic("another process holds the data file lock"), - else => return err, - }; - - // Ask the file system to allocate contiguous sectors for the file (if possible): - // If the file system does not support `fallocate()`, then this could mean more seeks or a - // panic if we run out of disk space (ENOSPC). - if (method == .create) try fs_allocate(fd, size); - - // The best fsync strategy is always to fsync before reading because this prevents us from - // making decisions on data that was never durably written by a previously crashed process. - // We therefore always fsync when we open the path, also to wait for any pending O_DSYNC. - // Thanks to Alex Miller from FoundationDB for diving into our source and pointing this out. - try fs_sync(fd); - - // We fsync the parent directory to ensure that the file inode is durably written. - // The caller is responsible for the parent directory inode stored under the grandparent. - // We always do this when opening because we don't know if this was done before crashing. - try fs_sync(dir_fd); - - // TODO Document that `size` is now `data_file_size_min` from `main.zig`. - const stat = try posix.fstat(fd); - if (stat.size < size) @panic("data file inode size was truncated or corrupted"); - - return fd; - } - /// Darwin's fsync() syscall does not flush past the disk cache. We must use F_FULLFSYNC /// instead. /// https://twitter.com/TigerBeetleDB/status/1422491736224436225 diff --git a/io/linux.zig b/io/linux.zig index f734b33..c2c079a 100644 --- a/io/linux.zig +++ b/io/linux.zig @@ -1518,294 +1518,6 @@ pub const IO = struct { pub const fd_t = posix.fd_t; pub const INVALID_FILE: fd_t = -1; - /// Opens or creates a journal file: - /// - For reading and writing. - /// - For Direct I/O (if possible in development mode, but required in production mode). - /// - Obtains an advisory exclusive lock to the file descriptor. - /// - Allocates the file contiguously on disk if this is supported by the file system. - /// - Ensures that the file data (and file inode in the parent directory) is durable on disk. - /// The caller is responsible for ensuring that the parent directory inode is durable. - /// - Verifies that the file size matches the expected file size before returning. - pub fn open_data_file( - self: *IO, - dir_fd: fd_t, - relative_path: []const u8, - size: u64, - method: enum { create, create_or_open, open, open_read_only }, - direct_io: DirectIO, - ) !fd_t { - _ = self; - - assert(relative_path.len > 0); - assert(size % constants.sector_size == 0); - // Be careful with openat(2): "If pathname is absolute, then dirfd is ignored." (man page) - assert(!std.fs.path.isAbsolute(relative_path)); - - var flags: posix.O = .{ - .CLOEXEC = true, - .ACCMODE = if (method == .open_read_only) .RDONLY else .RDWR, - .DSYNC = true, - }; - var mode: posix.mode_t = 0; - - const kind: enum { file, block_device } = blk: { - const stat = posix.fstatat( - dir_fd, - relative_path, - 0, - ) catch |err| switch (err) { - error.FileNotFound => { - if (method == .create or method == .create_or_open) { - // It's impossible to distinguish creating a new file and opening a new - // block device with the current API. So if it's possible that we should - // create a file we try that instead of failing here. - break :blk .file; - } else { - @panic("Path does not exist."); - } - }, - else => |err_| return err_, - }; - if (posix.S.ISBLK(stat.mode)) { - break :blk .block_device; - } else { - if (!posix.S.ISREG(stat.mode)) { - @panic("file path does not point to block device or regular file."); - } - break :blk .file; - } - }; - - // This is not strictly necessary on 64bit systems but it's harmless. - // This will avoid errors with handling large files on certain configurations - // of 32bit kernels. In all other cases, it's a noop. - // See: - if (@hasField(posix.O, "LARGEFILE")) flags.LARGEFILE = true; - - switch (kind) { - .block_device => { - if (direct_io != .direct_io_disabled) { - // Block devices should always support Direct IO. - flags.DIRECT = true; - // Use O_EXCL when opening as a block device to obtain an advisory exclusive - // lock. Normally, you can't do this for files you don't create, but for - // block devices this guarantees: - // - that there are no mounts using this block device - // - that no new mounts can use this block device while we have it open - // - // However it doesn't prevent other processes with root from opening without - // O_EXCL and writing (mount is just a special case that always checks O_EXCL). - // - // This should be stronger than flock(2) locks, which work on a separate system. - // The relevant kernel code (as of v6.7) is here: - // - flags.EXCL = true; - } - log.info("opening block device \"{s}\"...", .{relative_path}); - }, - .file => { - var direct_io_supported = false; - const dir_on_tmpfs = try fs_is_tmpfs(dir_fd); - - if (dir_on_tmpfs) { - log.warn( - "tmpfs is not durable, and your data will be lost on reboot", - .{}, - ); - } - - // Special case. tmpfs doesn't support Direct I/O. Normally we would panic - // here (see below) but being able to benchmark production workloads - // on tmpfs is very useful for removing - // disk speed from the equation. - if (direct_io != .direct_io_disabled and !dir_on_tmpfs) { - direct_io_supported = try fs_supports_direct_io(dir_fd); - if (direct_io_supported) { - flags.DIRECT = true; - } else if (direct_io == .direct_io_optional) { - log.warn("This file system does not support Direct I/O.", .{}); - } else { - assert(direct_io == .direct_io_required); - // We require Direct I/O for safety to handle fsync failure correctly, and - // therefore panic in production if it is not supported. - log.err("This file system does not support Direct I/O.", .{}); - log.err("TigerBeetle uses Direct I/O to bypass the kernel page cache, " ++ - "to ensure that data is durable when writes complete.", .{}); - log.err("If this is a production replica, Direct I/O is required.", .{}); - log.err("If this is a development/testing replica, " ++ - "re-run with --development set to bypass this error.", .{}); - @panic("file system does not support Direct I/O"); - } - } - - switch (method) { - .create => { - flags.CREAT = true; - flags.EXCL = true; - mode = 0o666; - log.info("creating \"{s}\"...", .{relative_path}); - }, - .create_or_open => { - flags.CREAT = true; - mode = 0o666; - log.info("opening or creating \"{s}\"...", .{relative_path}); - }, - .open, .open_read_only => { - log.info("opening \"{s}\"...", .{relative_path}); - }, - } - }, - } - - // This is critical as we rely on O_DSYNC for fsync() whenever we write to the file: - assert(flags.DSYNC); - - const fd = try posix.openat(dir_fd, relative_path, flags, mode); - // TODO Return a proper error message when the path exists or does not exist (init/start). - errdefer posix.close(fd); - - { - // Make sure we're getting the type of file descriptor we expect. - const stat = try posix.fstat(fd); - switch (kind) { - .file => assert(posix.S.ISREG(stat.mode)), - .block_device => assert(posix.S.ISBLK(stat.mode)), - } - } - - // Obtain an advisory exclusive lock that works only if all processes actually use flock(). - // LOCK_NB means that we want to fail the lock without waiting if another process has it. - // - // This is wrapped inside a retry loop with a sleep because of the interaction between - // io_uring semantics and flock: flocks are held per fd, but io_uring will keep a reference - // to the fd alive even once a process has been terminated, until all async operations have - // been completed. - // - // This means that when killing and starting a tigerbeetle process in an automated way, you - // can see "another process holds the data file lock" errors, even though the process really - // has terminated. - for (0..2) |_| { - posix.flock(fd, posix.LOCK.EX | posix.LOCK.NB) catch |err| switch (err) { - error.WouldBlock => { - std.time.sleep(50 * std.time.ns_per_ms); - continue; - }, - else => return err, - }; - break; - } else { - posix.flock(fd, posix.LOCK.EX | posix.LOCK.NB) catch |err| switch (err) { - error.WouldBlock => @panic("another process holds the data file lock"), - else => return err, - }; - } - - // Ask the file system to allocate contiguous sectors for the file (if possible): - // If the file system does not support `fallocate()`, then this could mean more seeks or a - // panic if we run out of disk space (ENOSPC). - if (method == .create and kind == .file) { - log.info("allocating {}...", .{std.fmt.fmtIntSizeBin(size)}); - fs_allocate(fd, size) catch |err| switch (err) { - error.OperationNotSupported => { - log.warn("file system does not support fallocate(), an ENOSPC will panic", .{}); - log.info("allocating by writing to the last sector " ++ - "of the file instead...", .{}); - - const sector_size = constants.sector_size; - const sector: [sector_size]u8 align(sector_size) = [_]u8{0} ** sector_size; - - // Handle partial writes where the physical sector is - // less than a logical sector: - const write_offset = size - sector.len; - var written: usize = 0; - while (written < sector.len) { - written += try posix.pwrite(fd, sector[written..], write_offset + written); - } - }, - else => |e| return e, - }; - } - - // The best fsync strategy is always to fsync before reading because this prevents us from - // making decisions on data that was never durably written by a previously crashed process. - // We therefore always fsync when we open the path, also to wait for any pending O_DSYNC. - // Thanks to Alex Miller from FoundationDB for diving into our source and pointing this out. - try posix.fsync(fd); - - // We fsync the parent directory to ensure that the file inode is durably written. - // The caller is responsible for the parent directory inode stored under the grandparent. - // We always do this when opening because we don't know if this was done before crashing. - try posix.fsync(dir_fd); - - switch (kind) { - .file => { - if ((try posix.fstat(fd)).size < size) { - @panic("data file inode size was truncated or corrupted"); - } - }, - .block_device => { - const BLKGETSIZE64 = os.linux.IOCTL.IOR(0x12, 114, usize); - var block_device_size: usize = 0; - - switch (os.linux.E.init(os.linux.ioctl( - fd, - BLKGETSIZE64, - @intFromPtr(&block_device_size), - ))) { - .SUCCESS => {}, - - // These are the only errors that are supposed to be possible from ioctl(2). - .BADF => return error.InvalidFileDescriptor, - .NOTTY => return error.BadRequest, - .FAULT => return error.InvalidAddress, - else => |err| return stdx.unexpected_errno("open_file:ioctl", err), - } - - if (block_device_size < size) { - std.debug.panic( - "The block device used is too small ({} available/{} needed).", - .{ - std.fmt.fmtIntSizeBin(block_device_size), - std.fmt.fmtIntSizeBin(size), - }, - ); - } - - if (method == .create or method == .create_or_open) { - // Check that the first superblock_zone_size bytes are 0. - // - It'll ensure that the block device is not directly TigerBeetle. - // - It'll be very likely to catch any cases where there's an existing - // other filesystem. - // - In the case of there being a partition table (eg, two partitions, - // one starting at 0MiB, one at 1024MiB) and the operator tries to format - // the raw disk (/dev/sda) while a partition later is - // TigerBeetle (/dev/sda2) it'll be blocked by the MBR/GPT existing. - const superblock_zone_size = - @import("../superblock.zig").superblock_zone_size; - var read_buf: [superblock_zone_size]u8 align(constants.sector_size) = undefined; - - // We can do this without worrying about retrying partial reads because on - // linux, read(2) on block devices can not be interrupted by signals. - // See signal(7). - assert(superblock_zone_size == try posix.read(fd, &read_buf)); - if (!std.mem.allEqual(u8, &read_buf, 0)) { - std.debug.panic( - "Superblock on block device not empty. " ++ - "If this is the correct block device to use, " ++ - "please zero the first {} using a tool like dd.", - .{std.fmt.fmtIntSizeBin(superblock_zone_size)}, - ); - } - // Reset position in the block device to compensate for read(2). - try posix.lseek_CUR(fd, -superblock_zone_size); - assert(try posix.lseek_CUR_get(fd) == 0); - } - }, - } - - return fd; - } - /// Detects whether the underlying file system for a given directory fd is tmpfs. This is used /// to relax our Direct I/O check - running on tmpfs for benchmarking is useful. fn fs_is_tmpfs(dir_fd: fd_t) !bool { diff --git a/superblock.zig b/superblock.zig index eb52cc0..2a1a239 100644 --- a/superblock.zig +++ b/superblock.zig @@ -33,9 +33,9 @@ const maybe = stdx.maybe; const mem = std.mem; const meta = std.meta; -const constants = @import("../constants.zig"); -const stdx = @import("../stdx.zig"); -const vsr = @import("../vsr.zig"); +const constants = @import("constants.zig"); +const stdx = @import("stdx.zig"); +const vsr = @import("vsr.zig"); const log = std.log.scoped(.superblock); pub const Quorums = @import("superblock_quorums.zig").QuorumsType(.{ @@ -1572,12 +1572,6 @@ pub fn SuperBlockType(comptime Storage: type) type { @tagName(context.caller), }); - if (Storage == @import("../testing/storage.zig").Storage) { - // We should have finished all pending superblock io before starting any more. - superblock.storage.assert_no_pending_reads(.superblock); - superblock.storage.assert_no_pending_writes(.superblock); - } - if (context.caller == .open) { superblock.read_working(context, .open); } else { @@ -1594,12 +1588,6 @@ pub fn SuperBlockType(comptime Storage: type) type { @tagName(context.caller), }); - if (Storage == @import("../testing/storage.zig").Storage) { - // We should have finished all pending io by now. - superblock.storage.assert_no_pending_reads(.superblock); - superblock.storage.assert_no_pending_writes(.superblock); - } - switch (context.caller) { .format => {}, .open => {