Skip to content
This repository was archived by the owner on Sep 9, 2025. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 0 additions & 102 deletions io/darwin.zig
Original file line number Diff line number Diff line change
Expand Up @@ -827,108 +827,6 @@ pub const IO = struct {
pub const fd_t = posix.fd_t;
pub const INVALID_FILE: fd_t = -1;

/// Opens (and, depending on `method`, creates) a data file at `relative_path` under `dir_fd`.
///
/// The file is opened:
/// - With O_CLOEXEC and O_DSYNC; read-only for `.open_read_only`, read-write otherwise.
/// - With the page cache disabled through F_NOCACHE, unless `direct_io` is
///   `.direct_io_disabled` (darwin has no O_DIRECT).
/// - Under an advisory exclusive flock(); panics if another process already holds the lock.
///
/// For `.create` only, `fs_allocate()` is asked to allocate `size` bytes for the file.
/// Before returning, the file and then `dir_fd` are synced with `fs_sync()`, and the file is
/// checked to be at least `size` bytes long (panics otherwise).
/// The caller is responsible for ensuring that the parent directory inode is durable.
///
/// Asserts that `relative_path` is non-empty and not absolute, and that `size` is a multiple of
/// `constants.sector_size`. If any step after the open fails, the descriptor is closed before
/// the error is returned.
pub fn open_data_file(
    self: *IO,
    dir_fd: fd_t,
    relative_path: []const u8,
    size: u64,
    method: enum { create, create_or_open, open, open_read_only },
    direct_io: DirectIO,
) !fd_t {
    _ = self;

    assert(relative_path.len > 0);
    assert(size % constants.sector_size == 0);

    // TODO Use O_EXCL when opening as a block device to obtain a mandatory exclusive lock.
    // This is much stronger than an advisory exclusive lock, and is required on some platforms.

    // O_DSYNC normally lets the data plane skip fsync(), because every write is synced to disk.
    // Darwin does not honour that:
    // https://x.com/TigerBeetleDB/status/1536628729031581697
    // As a workaround, fs_sync() is explicitly called after writing in do_operation.
    var open_flags: posix.O = .{
        .CLOEXEC = true,
        .ACCMODE = switch (method) {
            .open_read_only => .RDONLY,
            .create, .create_or_open, .open => .RDWR,
        },
        .DSYNC = true,
    };

    // TODO Document this and investigate whether this is in fact correct to set here.
    if (@hasField(posix.O, "LARGEFILE")) open_flags.LARGEFILE = true;

    // Only the creating methods pass O_CREAT and a non-zero permission mode.
    // `.create` additionally passes O_EXCL.
    const may_create = (method == .create or method == .create_or_open);
    if (may_create) open_flags.CREAT = true;
    if (method == .create) open_flags.EXCL = true;
    const create_mode: posix.mode_t = if (may_create) 0o666 else 0;

    switch (method) {
        .create => log.info("creating \"{s}\"...", .{relative_path}),
        .create_or_open => log.info("opening or creating \"{s}\"...", .{relative_path}),
        .open, .open_read_only => log.info("opening \"{s}\"...", .{relative_path}),
    }

    // Guard against O_DSYNC being dropped from the flags by a later edit:
    assert(open_flags.DSYNC);

    // Be careful with openat(2): "If pathname is absolute, then dirfd is ignored." (man page)
    assert(!std.fs.path.isAbsolute(relative_path));
    const data_fd = try posix.openat(dir_fd, relative_path, open_flags, create_mode);
    // TODO Return a proper error message when the path exists or does not exist (init/start).
    errdefer posix.close(data_fd);

    // TODO Check that the file is actually a file.

    // On darwin assume that Direct I/O is always supported.
    // O_DIRECT doesn't exist here, so F_NOCACHE is used to disable the page cache instead.
    if (direct_io != .direct_io_disabled) {
        _ = try posix.fcntl(data_fd, posix.F.NOCACHE, 1);
    }

    // Take an advisory exclusive lock, which only works if all processes actually use flock().
    // LOCK_NB makes the call fail immediately, rather than wait, if another process holds it.
    posix.flock(data_fd, posix.LOCK.EX | posix.LOCK.NB) catch |err| {
        if (err == error.WouldBlock) @panic("another process holds the data file lock");
        return err;
    };

    // Ask the file system to allocate contiguous sectors for the file (if possible):
    // If the file system does not support `fallocate()`, then this could mean more seeks or a
    // panic if we run out of disk space (ENOSPC).
    if (method == .create) {
        try fs_allocate(data_fd, size);
    }

    // The best fsync strategy is always to fsync before reading because this prevents us from
    // making decisions on data that was never durably written by a previously crashed process.
    // We therefore always fsync when we open the path, also to wait for any pending O_DSYNC.
    // Thanks to Alex Miller from FoundationDB for diving into our source and pointing this out.
    try fs_sync(data_fd);

    // We fsync the parent directory to ensure that the file inode is durably written.
    // The caller is responsible for the parent directory inode stored under the grandparent.
    // We always do this when opening because we don't know if this was done before crashing.
    try fs_sync(dir_fd);

    // TODO Document that `size` is now `data_file_size_min` from `main.zig`.
    // `size` is a lower bound here: a larger file is accepted, a smaller one is not.
    const file_stat = try posix.fstat(data_fd);
    if (file_stat.size < size) @panic("data file inode size was truncated or corrupted");

    return data_fd;
}

/// Darwin's fsync() syscall does not flush past the disk cache. We must use F_FULLFSYNC
/// instead.
/// https://twitter.com/TigerBeetleDB/status/1422491736224436225
Expand Down
Loading