/**
 * io_uring system api definitions.
 *
 * See: https://github.com/torvalds/linux/blob/master/include/uapi/linux/io_uring.h
 *
 * Last changes from: f8e85cf255ad57d65eeb9a9d0e59e3dec55bdd9e (20191123)
 */
module during.io_uring;

version (linux):

import core.sys.posix.poll;
import core.sys.posix.signal;

@system nothrow @nogc:

/**
 * IO operation submission data structure (Submission queue entry).
 *
 * C API: `struct io_uring_sqe`
 */
struct SubmissionEntry
{
    Operation opcode;           /// type of operation for this sqe
    SubmissionEntryFlags flags; /// IOSQE_ flags
    ushort ioprio;              /// ioprio for the request
    int fd;                     /// file descriptor to do IO on
    union
    {
        ulong off;   /// offset into file
        ulong addr2; /// from Linux 5.5
    }

    ulong addr; /// pointer to buffer or iovecs
    uint len;   /// buffer size or number of iovecs

    union
    {
        ReadWriteFlags rw_flags;
        FsyncFlags fsync_flags;
        PollEvents poll_events;
        SyncFileRangeFlags sync_range_flags; /// from Linux 5.2
        MsgFlags msg_flags;                  /// from Linux 5.3
        TimeoutFlags timeout_flags;          /// from Linux 5.4
        AcceptFlags accept_flags;            /// from Linux 5.5
        // uint cancel_flags;                /// from Linux 5.5 (TODO: not any yet)
    }

    ulong user_data; /// data to be passed back at completion time

    union
    {
        ushort buf_index; /// index into fixed buffers, if used
        ulong[3] __pad2;
    }

    /// Resets entry fields
    void clear() @safe nothrow @nogc
    {
        this = SubmissionEntry.init;
    }
}

enum ReadWriteFlags : int
{
    NONE = 0,

    /// High priority read/write. Allows block-based filesystems to
    /// use polling of the device, which provides lower latency, but
    /// may use additional resources. (Currently, this feature is
    /// usable only on a file descriptor opened using the
    /// O_DIRECT flag.)
    ///
    /// (since Linux 4.6)
    HIPRI = 0x00000001,

    /// Provide a per-write equivalent of the O_DSYNC open(2) flag.
    /// This flag is meaningful only for pwritev2(), and its effect
    /// applies only to the data range written by the system call.
    ///
    /// (since Linux 4.7)
    DSYNC = 0x00000002,

    /// Provide a per-write equivalent of the O_SYNC open(2) flag.
    /// This flag is meaningful only for pwritev2(), and its effect
    /// applies only to the data range written by the system call.
    ///
    /// (since Linux 4.7)
    SYNC = 0x00000004,

    /// Do not wait for data which is not immediately available. If
    /// this flag is specified, the preadv2() system call will
    /// return instantly if it would have to read data from the
    /// backing storage or wait for a lock. If some data was
    /// successfully read, it will return the number of bytes read.
    /// If no bytes were read, it will return -1 and set errno to
    /// EAGAIN. Currently, this flag is meaningful only for
    /// preadv2().
    ///
    /// (since Linux 4.14)
    NOWAIT = 0x00000008,

    /// Provide a per-write equivalent of the O_APPEND open(2) flag.
    /// This flag is meaningful only for pwritev2(), and its effect
    /// applies only to the data range written by the system call.
    /// The offset argument does not affect the write operation; the
    /// data is always appended to the end of the file. However, if
    /// the offset argument is -1, the current file offset is
    /// updated.
    ///
    /// (since Linux 4.16)
    APPEND = 0x00000010
}

enum FsyncFlags : uint
{
    /// Normal file integrity sync
    NORMAL = 0,

    /**
     * `fdatasync` semantics.
     *
     * See_Also: `fsync(2)` for details
     */
    DATASYNC = (1 << 0)
}

/** Possible poll event flags.
 *  See: poll(2)
 */
enum PollEvents : ushort
{
    /// There is data to read.
    IN = POLLIN,

    /** Writing is now possible, though a write larger than the available
     *  space in a socket or pipe will still block (unless O_NONBLOCK is set).
     */
    OUT = POLLOUT,

    /** There is some exceptional condition on the file descriptor.
     *  Possibilities include:
     *
     *  * There is out-of-band data on a TCP socket (see tcp(7)).
     *  * A pseudoterminal master in packet mode has seen a state
     *    change on the slave (see ioctl_tty(2)).
     *  * A cgroup.events file has been modified (see cgroups(7)).
     */
    PRI = POLLPRI,

    /** Error condition (only returned in revents; ignored in events).
     *  This bit is also set for a file descriptor referring to the
     *  write end of a pipe when the read end has been closed.
     */
    ERR = POLLERR,

    /// Invalid request: fd not open (only returned in revents; ignored in events).
    NVAL = POLLNVAL,

    RDNORM = POLLRDNORM, /// Equivalent to POLLIN.
    RDBAND = POLLRDBAND, /// Priority band data can be read (generally unused on Linux).
    WRNORM = POLLWRNORM, /// Equivalent to POLLOUT.
    WRBAND = POLLWRBAND, /// Priority data may be written.

    /** Hang up (only returned in revents; ignored in events). Note
     *  that when reading from a channel such as a pipe or a stream
     *  socket, this event merely indicates that the peer closed its
     *  end of the channel. Subsequent reads from the channel will
     *  return 0 (end of file) only after all outstanding data in the
     *  channel has been consumed.
     */
    HUP = POLLHUP,
}

/**
 * Flags for `sync_file_range(2)` operation.
 *
 * See_Also: `sync_file_range(2)` for details
 */
enum SyncFileRangeFlags : uint
{
    NOOP = 0, /// no operation

    /// Wait upon write-out of all pages in the specified range that have already been submitted to
    /// the device driver for write-out before performing any write.
    WAIT_BEFORE = 1U << 0,

    /// Initiate write-out of all dirty pages in the specified range which are not presently
    /// submitted for write-out. Note that even this may block if you attempt to write more than
    /// request queue size.
    WRITE = 1U << 1,

    /// Wait upon write-out of all pages in the range after performing any write.
    WAIT_AFTER = 1U << 2,

    /// This is a write-for-data-integrity operation that will ensure that all pages in the
    /// specified range which were dirty when sync_file_range() was called are committed to disk.
    WRITE_AND_WAIT = WAIT_BEFORE | WRITE | WAIT_AFTER
}

/**
 * Flags for `sendmsg(2)` and `recvmsg(2)` operations.
 *
 * See_Also: man pages for the operations.
 */
enum MsgFlags : uint
{
    /// No flags defined
    NONE = 0,

    /// Sends out-of-band data on sockets that support this notion (e.g., of type `SOCK_STREAM`); the
    /// underlying protocol must also support out-of-band data.
    OOB = 0x01,

    /// This flag causes the receive operation to return data from the beginning of the receive
    /// queue without removing that data from the queue. Thus, a subsequent receive call will return
    /// the same data.
    PEEK = 0x02,

    /// Don't use a gateway to send out the packet, send to hosts only on directly connected
    /// networks. This is usually used only by diagnostic or routing programs. This is defined only
    /// for protocol families that route; packet sockets don't.
    DONTROUTE = 0x04,

    /// For raw (`AF_PACKET`), Internet datagram (since Linux 2.4.27/2.6.8), netlink (since Linux
    /// 2.6.22), and UNIX datagram (since Linux 3.4) sockets: return the real length of the packet
    /// or datagram, even when it was longer than the passed buffer.
    ///
    /// For use with Internet stream sockets, see `tcp(7)`.
    TRUNC = 0x20,

    /// Enables nonblocking operation; if the operation would block, EAGAIN or EWOULDBLOCK is
    /// returned. This provides similar behavior to setting the O_NONBLOCK flag (via the `fcntl(2)`
    /// F_SETFL operation), but differs in that `MSG_DONTWAIT` is a per-call option, whereas
    /// `O_NONBLOCK` is a setting on the open file description (see `open(2)`), which will affect
    /// all threads in the calling process and as well as other processes that hold file descriptors
    /// referring to the same open file description.
    DONTWAIT = 0x40,

    /// Terminates a record (when this notion is supported, as for sockets of type `SOCK_SEQPACKET`).
    EOR = 0x80,

    /// This flag requests that the operation block until the full request is satisfied. However,
    /// the call may still return less data than requested if a signal is caught, an error or
    /// disconnect occurs, or the next data to be received is of a different type than that
    /// returned. This flag has no effect for datagram sockets.
    WAITALL = 0x100,

    /// Tell the link layer that forward progress happened: you got a successful reply from the
    /// other side. If the link layer doesn't get this it will regularly reprobe the neighbor (e.g.,
    /// via a unicast ARP). Valid only on SOCK_DGRAM and SOCK_RAW sockets and currently
    /// implemented only for IPv4 and IPv6. See arp(7) for details.
    CONFIRM = 0x800,

    /// This flag specifies that queued errors should be received from the socket error queue. The
    /// error is passed in an ancillary message with a type dependent on the protocol (for IPv4
    /// `IP_RECVERR`). The user should supply a buffer of sufficient size. See `cmsg(3)` and `ip(7)`
    /// for more information. The payload of the original packet that caused the error is passed as
    /// normal data via msg_iovec. The original destination address of the datagram that caused the
    /// error is supplied via `msg_name`.
    ERRQUEUE = 0x2000,

    /// Don't generate a `SIGPIPE` signal if the peer on a stream-oriented socket has closed the
    /// connection. The `EPIPE` error is still returned. This provides similar behavior to using
    /// `sigaction(2)` to ignore `SIGPIPE`, but, whereas `MSG_NOSIGNAL` is a per-call feature,
    /// ignoring `SIGPIPE` sets a process attribute that affects all threads in the process.
    NOSIGNAL = 0x4000,

    /// The caller has more data to send. This flag is used with TCP sockets to obtain the same
    /// effect as the `TCP_CORK` socket option (see `tcp(7)`), with the difference that this flag can be
    /// set on a per-call basis.
    ///
    /// Since Linux 2.6, this flag is also supported for UDP sockets, and informs the kernel to
    /// package all of the data sent in calls with this flag set into a single datagram which is
    /// transmitted only when a call is performed that does not specify this flag.
    ///
    /// See_Also: the `UDP_CORK` socket option described in `udp(7)`
    MORE = 0x8000,

    /// Set the close-on-exec flag for the file descriptor received via a UNIX domain file
    /// descriptor using the `SCM_RIGHTS` operation (described in `unix(7)`). This flag is useful
    /// for the same reasons as the `O_CLOEXEC` flag of `open(2)`. (recvmsg only)
    CMSG_CLOEXEC = 0x40000000
}

/** sqe->timeout_flags
 */
enum TimeoutFlags : uint
{
    REL = 0,
    ABS = 1U << 0 /// `IORING_TIMEOUT_ABS` (from Linux 5.5)
}

/**
 * Flags that can be used with the `accept4(2)` operation.
 */
enum AcceptFlags : uint
{
    /// Same as `accept()`
    NONE = 0,

    /// Set the `O_NONBLOCK` file status flag on the new open file description. Using this flag saves
    /// extra calls to `fcntl(2)` to achieve the same result.
    NONBLOCK = 0x800, // octal 00004000

    /// Set the close-on-exec (`FD_CLOEXEC`) flag on the new file descriptor. See the description of
    /// the `O_CLOEXEC` flag in `open(2)` for reasons why this may be useful.
    CLOEXEC = 0x80000 // octal 02000000
}

/**
 * Describes the operation to be performed
 *
 * See_Also: `io_uring_enter(2)`
 */
enum Operation : ubyte
{
    // available from Linux 5.1
    NOP = 0,                /// IORING_OP_NOP
    READV = 1,              /// IORING_OP_READV
    WRITEV = 2,             /// IORING_OP_WRITEV
    FSYNC = 3,              /// IORING_OP_FSYNC
    READ_FIXED = 4,         /// IORING_OP_READ_FIXED
    WRITE_FIXED = 5,        /// IORING_OP_WRITE_FIXED
    POLL_ADD = 6,           /// IORING_OP_POLL_ADD
    POLL_REMOVE = 7,        /// IORING_OP_POLL_REMOVE

    // available from Linux 5.2
    SYNC_FILE_RANGE = 8,    /// IORING_OP_SYNC_FILE_RANGE

    // available from Linux 5.3
    SENDMSG = 9,            /// IORING_OP_SENDMSG
    RECVMSG = 10,           /// IORING_OP_RECVMSG

    // available from Linux 5.4
    TIMEOUT = 11,           /// IORING_OP_TIMEOUT

    // available from Linux 5.5 (in master now)
    TIMEOUT_REMOVE = 12,    /// IORING_OP_TIMEOUT_REMOVE
    ACCEPT = 13,            /// IORING_OP_ACCEPT
    ASYNC_CANCEL = 14,      /// IORING_OP_ASYNC_CANCEL
    LINK_TIMEOUT = 15,      /// IORING_OP_LINK_TIMEOUT
    CONNECT = 16,           /// IORING_OP_CONNECT
}

/// sqe->flags
enum SubmissionEntryFlags : ubyte
{
    NONE = 0,

    FIXED_FILE = 1U << 0, /// IOSQE_FIXED_FILE: use fixed fileset

    /**
     * `IOSQE_IO_DRAIN`: issue after inflight IO
     *
     * If a request is marked with `IO_DRAIN`, then previous commands must complete before this one
     * is issued. Subsequent requests are not started until the drain has completed.
     *
     * Note: available from Linux 5.2
     */
    IO_DRAIN = 1U << 1,

    /**
     * `IOSQE_IO_LINK`
     *
     * If set, the next SQE in the ring will depend on this SQE. A dependent SQE will not be started
     * until the parent SQE has completed. If the parent SQE fails, then a dependent SQE will be
     * failed without being started. Link chains can be arbitrarily long, the chain spans any new
     * SQE that continues to have the IOSQE_IO_LINK flag set. Once an SQE is encountered that does
     * not have this flag set, that defines the end of the chain. This feature allows forming
     * dependencies between individual SQEs.
     *
     * Note: available from Linux 5.3
     */
    IO_LINK = 1U << 2,
}

/**
 * IO completion data structure (Completion Queue Entry)
 *
 * C API: `struct io_uring_cqe`
 */
struct CompletionEntry
{
    ulong user_data; /* sqe->data submission passed back */
    int res;         /* result code for this event */
    uint flags;
}

/**
 * Passed in for io_uring_setup(2). Copied back with updated info on success.
 *
 * C API: `struct io_uring_params`
 */
struct SetupParameters
{
    // Magic offsets for the application to mmap the data it needs

    /// `IORING_OFF_SQ_RING`: mmap offset for submission queue ring
    enum ulong SUBMISSION_QUEUE_RING_OFFSET = 0UL;
    /// `IORING_OFF_CQ_RING`: mmap offset for completion queue ring
    enum ulong COMPLETION_QUEUE_RING_OFFSET = 0x8000000UL;
    /// `IORING_OFF_SQES`: mmap offset for submission entries
    enum ulong SUBMISSION_QUEUE_ENTRIES_OFFSET = 0x10000000UL;

    /// (output) allocated entries in submission queue
    /// (both ring index `array` and separate entry array at `SUBMISSION_QUEUE_ENTRIES_OFFSET`).
    uint sq_entries;

    /// (output) allocated entries in completion queue
    uint cq_entries;

    SetupFlags flags; /// (input)

    /// (input) used if SQ_AFF and SQPOLL flags are active to pin poll thread to specific cpu.
    /// right now always checked in kernel for "possible cpu".
    uint sq_thread_cpu;

    /// (input) used if SQPOLL flag is active; timeout in milliseconds
    /// until kernel poll thread goes to sleep.
    uint sq_thread_idle;

    SetupFeatures features; /// (from Linux 5.4)
    private uint[4] resv;   // reserved
    SubmissionQueueRingOffsets sq_off; /// (output) submission queue ring data field offsets
    CompletionQueueRingOffsets cq_off; /// (output) completion queue ring data field offsets
}

/// `io_uring_setup()` flags
enum SetupFlags : uint
{
    /// No flags set
    NONE = 0,

    /**
     * `IORING_SETUP_IOPOLL`
     *
     * Perform busy-waiting for an I/O completion, as opposed to getting notifications via an
     * asynchronous IRQ (Interrupt Request). The file system (if any) and block device must
     * support polling in order for this to work. Busy-waiting provides lower latency, but may
     * consume more CPU resources than interrupt driven I/O. Currently, this feature is usable
     * only on a file descriptor opened using the O_DIRECT flag. When a read or write is submitted
     * to a polled context, the application must poll for completions on the CQ ring by calling
     * io_uring_enter(2). It is illegal to mix and match polled and non-polled I/O on an io_uring
     * instance.
     */
    IOPOLL = 1U << 0,

    /**
     * `IORING_SETUP_SQPOLL`
     *
     * When this flag is specified, a kernel thread is created to perform submission queue polling.
     * An io_uring instance configured in this way enables an application to issue I/O without ever
     * context switching into the kernel.
     * By using the submission queue to fill in new submission queue entries and watching for
     * completions on the completion queue, the application can submit and reap I/Os without doing
     * a single system call.
     * If the kernel thread is idle for more than sq_thread_idle milliseconds, it will set the
     * IORING_SQ_NEED_WAKEUP bit in the flags field of the struct io_sq_ring. When this happens,
     * the application must call io_uring_enter(2) to wake the kernel thread. If I/O is kept busy,
     * the kernel thread will never sleep. An application making use of this feature will need to
     * guard the io_uring_enter(2) call with the following code sequence:
     *
     * ```
     * // Ensure that the wakeup flag is read after the tail pointer has been written.
     * smp_mb();
     * if (*sq_ring->flags & IORING_SQ_NEED_WAKEUP)
     *     io_uring_enter(fd, 0, 0, IORING_ENTER_SQ_WAKEUP);
     * ```
     *
     * where sq_ring is a submission queue ring setup using the struct io_sqring_offsets described below.
     *
     * To successfully use this feature, the application must register a set of files to be used for
     * IO through io_uring_register(2) using the IORING_REGISTER_FILES opcode. Failure to do so will
     * result in submitted IO being errored with EBADF.
     */
    SQPOLL = 1U << 1,

    /**
     * `IORING_SETUP_SQ_AFF`
     *
     * If this flag is specified, then the poll thread will be bound to the cpu set in the
     * sq_thread_cpu field of the struct io_uring_params. This flag is only meaningful when
     * IORING_SETUP_SQPOLL is specified.
     */
    SQ_AFF = 1U << 2,

    /**
     * `IORING_SETUP_CQSIZE`
     *
     * Create the completion queue with struct io_uring_params.cq_entries entries. The value must
     * be greater than entries, and may be rounded up to the next power-of-two.
     *
     * Note: Available from Linux 5.5
     */
    CQSIZE = 1U << 3,
}

/// `io_uring_params->features` flags
enum SetupFeatures : uint
{
    NONE = 0,
    SINGLE_MMAP = 1U << 0, /// `IORING_FEAT_SINGLE_MMAP` (from Linux 5.4)
    NODROP = 1U << 1       /// `IORING_FEAT_NODROP` (from Linux 5.5)
}

/**
 * Filled with the offset for mmap(2)
 *
 * C API: `struct io_sqring_offsets`
 */
struct SubmissionQueueRingOffsets
{
    /// Incremented by kernel after entry at `head` was processed.
    /// Pending submissions: [head..tail]
    uint head;

    /// Modified by user space when new entry was queued; points to next
    /// entry user space is going to fill.
    uint tail;

    /// value `value_at(self.ring_entries) - 1`
    /// mask for indices at `head` and `tail` (don't delete masked bits!
    /// `head` and `tail` can point to the same entry, but if they are
    /// not exactly equal it implies the ring is full, and if they are
    /// exactly equal the ring is empty.)
    uint ring_mask;

    /// value same as SetupParameters.sq_entries, power of 2.
    uint ring_entries;

    /// SubmissionQueueFlags
    SubmissionQueueFlags flags;

    /// number of (invalid) entries that were dropped; entries are
    /// invalid if their index (in `array`) is out of bounds.
    uint dropped;

    /// index into array of `SubmissionEntry`s at offset `SUBMISSION_QUEUE_ENTRIES_OFFSET` in mmap()
    uint array;

    private uint[3] resv; // reserved
}

enum SubmissionQueueFlags : uint
{
    NONE = 0,

    /// `IORING_SQ_NEED_WAKEUP`: needs io_uring_enter wakeup
    /// set by kernel poll thread when it goes sleeping, and reset on wakeup
    NEED_WAKEUP = 1U << 0
}

/**
 * Field offsets used to map kernel structure to ours.
 *
 * C API: `struct io_cqring_offsets`
 */
struct CompletionQueueRingOffsets
{
    /// incremented by user space after entry at `head` was processed.
    /// available entries for processing: [head..tail]
    uint head;

    /// modified by kernel when new entry was created; points to next
    /// entry kernel is going to fill.
    uint tail;

    /// value `value_at(ring_entries) - 1`
    /// mask for indices at `head` and `tail` (don't delete masked bits!
    /// `head` and `tail` can point to the same entry, but if they are
    /// not exactly equal it implies the ring is full, and if they are
    /// exactly equal the ring is empty.)
    uint ring_mask;

    /// value same as SetupParameters.cq_entries, power of 2.
    uint ring_entries;

    /// incremented by the kernel every time it failed to queue a
    /// completion event because the ring was full.
    uint overflow;

    /// Offset to array of completion queue entries
    uint cqes;

    private ulong[2] resv; // reserved
}

/// io_uring_register(2) opcodes and arguments
enum RegisterOpCode : uint
{
    /**
     * `arg` points to a struct iovec array of nr_args entries. The buffers associated with the
     * iovecs will be locked in memory and charged against the user's RLIMIT_MEMLOCK resource limit.
     * See getrlimit(2) for more information. Additionally, there is a size limit of 1GiB per
     * buffer. Currently, the buffers must be anonymous, non-file-backed memory, such as that
     * returned by malloc(3) or mmap(2) with the MAP_ANONYMOUS flag set. It is expected that this
     * limitation will be lifted in the future. Huge pages are supported as well. Note that the
     * entire huge page will be pinned in the kernel, even if only a portion of it is used.
     *
     * After a successful call, the supplied buffers are mapped into the kernel and eligible for
     * I/O. To make use of them, the application must specify the IORING_OP_READ_FIXED or
     * IORING_OP_WRITE_FIXED opcodes in the submission queue entry (see the struct io_uring_sqe
     * definition in io_uring_enter(2)), and set the buf_index field to the desired buffer index.
     * The memory range described by the submission queue entry's addr and len fields must fall
     * within the indexed buffer.
     *
     * It is perfectly valid to setup a large buffer and then only use part of it for an I/O, as
     * long as the range is within the originally mapped region.
     *
     * An application can increase or decrease the size or number of registered buffers by first
     * unregistering the existing buffers, and then issuing a new call to io_uring_register() with
     * the new buffers.
     *
     * An application need not unregister buffers explicitly before shutting down the io_uring
     * instance.
     *
     * `IORING_REGISTER_BUFFERS`
     */
    REGISTER_BUFFERS = 0,

    /**
     * This operation takes no argument, and `arg` must be passed as NULL. All previously registered
     * buffers associated with the io_uring instance will be released.
     *
     * `IORING_UNREGISTER_BUFFERS`
     */
    UNREGISTER_BUFFERS = 1,

    /**
     * Register files for I/O. `arg` contains a pointer to an array of `nr_args` file descriptors
     * (signed 32 bit integers).
     *
     * To make use of the registered files, the IOSQE_FIXED_FILE flag must be set in the flags
     * member of the struct io_uring_sqe, and the fd member is set to the index of the file in the
     * file descriptor array.
     *
     * Files are automatically unregistered when the io_uring instance is torn down. An application
     * need only unregister if it wishes to register a new set of fds.
     *
     * `IORING_REGISTER_FILES`
     */
    REGISTER_FILES = 2,

    /**
     * This operation requires no argument, and `arg` must be passed as NULL. All previously
     * registered files associated with the io_uring instance will be unregistered.
     *
     * `IORING_UNREGISTER_FILES`
     */
    UNREGISTER_FILES = 3,

    /**
     * `IORING_REGISTER_EVENTFD`
     *
     * Registers eventfd that would be used to notify about completions on io_uring itself.
     *
     * Note: available from Linux 5.2
     */
    REGISTER_EVENTFD = 4,

    /**
     * `IORING_UNREGISTER_EVENTFD`
     *
     * Unregisters previously registered eventfd.
     *
     * Note: available from Linux 5.2
     */
    UNREGISTER_EVENTFD = 5,

    /// `IORING_REGISTER_FILES_UPDATE` (from Linux 5.5)
    REGISTER_FILES_UPDATE = 6,
}

/// io_uring_enter(2) flags
enum EnterFlags : uint
{
    NONE = 0,
    GETEVENTS = (1 << 0), /// `IORING_ENTER_GETEVENTS`
    SQ_WAKEUP = (1 << 1), /// `IORING_ENTER_SQ_WAKEUP`
}

/// Time specification as defined in kernel headers (used by TIMEOUT operations)
struct KernelTimespec
{
    long tv_sec;  /// seconds
    long tv_nsec; /// nanoseconds
}

static assert(CompletionEntry.sizeof == 16);
static assert(CompletionQueueRingOffsets.sizeof == 40);
static assert(SetupParameters.sizeof == 120);
static assert(SubmissionEntry.sizeof == 64);
static assert(SubmissionQueueRingOffsets.sizeof == 40);

/**
 * Setup a context for performing asynchronous I/O.
 *
 * The `io_uring_setup()` system call sets up a submission queue (SQ) and completion queue (CQ) with
 * at least entries entries, and returns a file descriptor which can be used to perform subsequent
 * operations on the io_uring instance. The submission and completion queues are shared between
 * userspace and the kernel, which eliminates the need to copy data when initiating and completing
 * I/O.
 *
 * See_Also: `io_uring_setup(2)`
 *
 * Params:
 *   entries = Defines how many entries the submission queue can hold.
 *   p = `SetupParameters`
 *
 * Returns:
 *     `io_uring_setup(2)` returns a new file descriptor on success. The application may then provide
 *     the file descriptor in a subsequent `mmap(2)` call to map the submission and completion queues,
 *     or to the `io_uring_register(2)` or `io_uring_enter(2)` system calls.
 *
 *     On error, -1 is returned and `errno` is set appropriately.
 */
int io_uring_setup(uint entries, scope ref SetupParameters p) @trusted
{
    pragma(inline);
    return syscall(SYS_io_uring_setup, entries, &p);
}

/**
 * Initiate and/or complete asynchronous I/O
 *
 * `io_uring_enter()` is used to initiate and complete I/O using the shared submission and
 * completion queues setup by a call to `io_uring_setup(2)`. A single call can both submit new I/O
 * and wait for completions of I/O initiated by this call or previous calls to `io_uring_enter()`.
 *
 * When the system call returns that a certain amount of SQEs have been consumed and submitted, it's
 * safe to reuse SQE entries in the ring. This is true even if the actual IO submission had to be
 * punted to async context, which means that the SQE may in fact not have been submitted yet. If the
 * kernel requires later use of a particular SQE entry, it will have made a private copy of it.
 *
 * Note: For interrupt driven I/O (where `IORING_SETUP_IOPOLL` was not specified in the call to
 * `io_uring_setup(2)`), an application may check the completion queue for event completions without
 * entering the kernel at all.
 *
 * See_Also: `io_uring_enter(2)`
 *
 * Params:
 *   fd = the file descriptor returned by io_uring_setup(2).
 *   to_submit = specifies the number of I/Os to submit from the submission queue.
 *   min_complete = If the `IORING_ENTER_GETEVENTS` bit is set in flags, then the system call will attempt
 *     to wait for `min_complete` event completions before returning. If the io_uring instance was configured
 *     for polling, by specifying IORING_SETUP_IOPOLL in the call to io_uring_setup(2), then
 *     min_complete has a slightly different meaning. Passing a value of 0 instructs the kernel to
 *     return any events which are already complete, without blocking. If min_complete is a non-zero
 *     value, the kernel will still return immediately if any completion events are available. If
 *     no event completions are available, then the call will poll either until one or more
 *     completions become available, or until the process has exceeded its scheduler time slice.
 *   flags = Behavior modification flags - `EnterFlags`
 *   sig = a pointer to a signal mask (see `sigprocmask(2)`); if sig is not `null`, `io_uring_enter()`
 *     first replaces the current signal mask by the one pointed to by sig, then waits for events to
 *     become available in the completion queue, and then restores the original signal mask. The
 *     following `io_uring_enter()` call:
 *
 *     ```
 *     ret = io_uring_enter(fd, 0, 1, IORING_ENTER_GETEVENTS, &sig);
 *     ```
 *
 *     is equivalent to atomically executing the following calls:
 *
 *     ```
 *     pthread_sigmask(SIG_SETMASK, &sig, &orig);
 *     ret = io_uring_enter(fd, 0, 1, IORING_ENTER_GETEVENTS, NULL);
 *     pthread_sigmask(SIG_SETMASK, &orig, NULL);
 *     ```
 *
 *     See the description of `pselect(2)` for an explanation of why the sig parameter is necessary.
 *
 * Returns:
 *     On success, the number of I/Os successfully consumed from the submission queue.
 *     On error, -1 is returned and `errno` is set appropriately (see `io_uring_enter(2)` for the
 *     possible error values).
 */
int io_uring_enter(int fd, uint to_submit, uint min_complete, EnterFlags flags, const sigset_t* sig = null)
{
    pragma(inline);
    return syscall(SYS_io_uring_enter, fd, to_submit, min_complete, flags, sig, sigset_t.sizeof);
}

/**
 * Register files or user buffers for asynchronous I/O.
 *
 * The `io_uring_register()` system call registers user buffers or files for use in an `io_uring(7)`
 * instance referenced by fd. Registering files or user buffers allows the kernel to take long term
 * references to internal data structures or create long term mappings of application memory,
 * greatly reducing per-I/O overhead.
 *
 * See_Also: `io_uring_register(2)`
 *
 * Params:
 *   fd = the file descriptor returned by a call to io_uring_setup(2)
 *   opcode = code of operation to execute on args
 *   arg = Args used by specified operation. See `RegisterOpCode` for usage details.
 *   nr_args = number of provided arguments
 *
 * Returns: On success, io_uring_register() returns 0. On error, -1 is returned, and errno is set accordingly.
 */
int io_uring_register(int fd, RegisterOpCode opcode, const(void)* arg, uint nr_args)
{
    pragma(inline);
    return syscall(SYS_io_uring_register, fd, opcode, arg, nr_args);
}

private:

// Syscall numbers (stable across architectures since these were added in the
// unified syscall numbering era; see arch/*/include/uapi/asm/unistd.h)
enum
{
    SYS_io_uring_setup    = 425,
    SYS_io_uring_enter    = 426,
    SYS_io_uring_register = 427
}

extern (C):

/// Invoke `system call' number `sysno`, passing it the remaining arguments.
int syscall(int sysno, ...);