/**
 * io_uring system api definitions.
 *
 * See: https://github.com/torvalds/linux/blob/master/include/uapi/linux/io_uring.h
 *
 * Last changes from: dd47c104533dedb90434a3f142e94a671ac623a6 (20210913)
 */
module during.io_uring;

version (linux):

import core.sys.posix.poll;
import core.sys.posix.signal;

@system nothrow @nogc:

/**
 * IO operation submission data structure (Submission queue entry).
 *
 * C API: `struct io_uring_sqe`
 */
struct SubmissionEntry
{
    Operation opcode;               /// type of operation for this sqe
    SubmissionEntryFlags flags;     /// IOSQE_ flags
    ushort ioprio;                  /// ioprio for the request
    int fd;                         /// file descriptor to do IO on
    union
    {
        ulong off;                  /// offset into file
        ulong addr2;                /// from Linux 5.5
    }

    union
    {
        ulong addr;                 /// pointer to buffer or iovecs
        ulong splice_off_in;
    }
    uint len;                       /// buffer size or number of iovecs

    union
    {
        ReadWriteFlags rw_flags;
        FsyncFlags fsync_flags;
        ushort poll_events;                     /// Unused from 5.9, kept for compatibility reasons - see https://github.com/torvalds/linux/commit/5769a351b89cd4d82016f18fa5f6c4077403564d
        PollEvents poll_events32;               /// from Linux 5.9 - word-reversed for BE
        SyncFileRangeFlags sync_range_flags;    /// from Linux 5.2
        MsgFlags msg_flags;                     /// from Linux 5.3
        TimeoutFlags timeout_flags;             /// from Linux 5.4
        AcceptFlags accept_flags;               /// from Linux 5.5
        uint cancel_flags;                      /// from Linux 5.5
        uint open_flags;                        /// from Linux 5.6
        uint statx_flags;                       /// from Linux 5.6
        uint fadvise_advice;                    /// from Linux 5.6
        uint splice_flags;                      /// from Linux 5.7
        uint rename_flags;                      /// from Linux 5.11
        uint unlink_flags;                      /// from Linux 5.11
        uint hardlink_flags;                    /// from Linux 5.15
    }

    ulong user_data;                /// data to be passed back at completion time

    union
    {
        align (1):
        ushort buf_index;           /// index into fixed buffers, if used
        ushort buf_group;           /// for grouped buffer selection
    }

    ushort personality;             /// personality to use, if used
    union
    {
        int splice_fd_in;
        uint file_index;
    }
    ulong[2] __pad2;

    /// Resets entry fields
    void clear() @safe nothrow @nogc
    {
        this = SubmissionEntry.init;
    }
}
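
// A minimal illustrative sketch (not part of the upstream sources): how an entry could
// be prepared for a plain `READ` operation (Linux 5.6+). The file descriptor, buffer
// and user_data values are assumptions made up for the example.
@system unittest
{
    ubyte[64] buffer;
    SubmissionEntry e;
    e.clear();
    e.opcode = Operation.READ;          // read(2)-like operation into a plain buffer
    e.fd = 0;                           // any open file descriptor would do here
    e.addr = cast(ulong)buffer.ptr;     // destination buffer
    e.len = cast(uint)buffer.length;    // buffer size in bytes
    e.off = 0;                          // absolute file offset to read from
    e.user_data = 42;                   // echoed back in CompletionEntry.user_data
    assert(e.opcode == Operation.READ && e.len == 64);
}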

enum ReadWriteFlags : int
{
    NONE = 0,

    /// High priority read/write. Allows block-based filesystems to
    /// use polling of the device, which provides lower latency, but
    /// may use additional resources. (Currently, this feature is
    /// usable only on a file descriptor opened using the
    /// O_DIRECT flag.)
    ///
    /// (since Linux 4.6)
    HIPRI = 0x00000001,

    /// Provide a per-write equivalent of the O_DSYNC open(2) flag.
    /// This flag is meaningful only for pwritev2(), and its effect
    /// applies only to the data range written by the system call.
    ///
    /// (since Linux 4.7)
    DSYNC = 0x00000002,

    /// Provide a per-write equivalent of the O_SYNC open(2) flag.
    /// This flag is meaningful only for pwritev2(), and its effect
    /// applies only to the data range written by the system call.
    ///
    /// (since Linux 4.7)
    SYNC = 0x00000004,

    /// Do not wait for data which is not immediately available. If
    /// this flag is specified, the preadv2() system call will
    /// return instantly if it would have to read data from the
    /// backing storage or wait for a lock. If some data was
    /// successfully read, it will return the number of bytes read.
    /// If no bytes were read, it will return -1 and set errno to
    /// EAGAIN. Currently, this flag is meaningful only for
    /// preadv2().
    ///
    /// (since Linux 4.14)
    NOWAIT = 0x00000008,

    /// Provide a per-write equivalent of the O_APPEND open(2) flag.
    /// This flag is meaningful only for pwritev2(), and its effect
    /// applies only to the data range written by the system call.
    /// The offset argument does not affect the write operation; the
    /// data is always appended to the end of the file. However, if
    /// the offset argument is -1, the current file offset is
    /// updated.
    ///
    /// (since Linux 4.16)
    APPEND = 0x00000010
}

enum FsyncFlags : uint
{
    /// Normal file integrity sync
    NORMAL = 0,

    /**
     * `fdatasync` semantics.
     *
     * See_Also: `fsync(2)` for details
     */
    DATASYNC = (1 << 0)
}

/** Possible poll event flags.
 *  See: poll(2)
 */
enum PollEvents : uint
{
    NONE = 0,

    /// There is data to read.
    IN = POLLIN,

    /** Writing is now possible, though a write larger than the available
     *  space in a socket or pipe will still block (unless O_NONBLOCK is set).
     */
    OUT = POLLOUT,

    /** There is some exceptional condition on the file descriptor.
     *  Possibilities include:
     *
     *  * There is out-of-band data on a TCP socket (see tcp(7)).
     *  * A pseudoterminal master in packet mode has seen a state
     *    change on the slave (see ioctl_tty(2)).
     *  * A cgroup.events file has been modified (see cgroups(7)).
     */
    PRI = POLLPRI,

    /** Error condition (only returned in revents; ignored in events).
     *  This bit is also set for a file descriptor referring to the
     *  write end of a pipe when the read end has been closed.
     */
    ERR = POLLERR,

    /// Invalid request: fd not open (only returned in revents; ignored in events).
    NVAL = POLLNVAL,

    RDNORM = POLLRDNORM,    /// Equivalent to POLLIN.
    RDBAND = POLLRDBAND,    /// Priority band data can be read (generally unused on Linux).
    WRNORM = POLLWRNORM,    /// Equivalent to POLLOUT.
    WRBAND = POLLWRBAND,    /// Priority data may be written.

    /** Hang up (only returned in revents; ignored in events). Note
     *  that when reading from a channel such as a pipe or a stream
     *  socket, this event merely indicates that the peer closed its
     *  end of the channel. Subsequent reads from the channel will
     *  return 0 (end of file) only after all outstanding data in the
     *  channel has been consumed.
     */
    HUP = POLLHUP,

    /** (since Linux 2.6.17)
     *  Stream socket peer closed connection, or shut down writing half of connection.
     */
    RDHUP = 0x2000,

    /** (since Linux 4.5)
     * Sets an exclusive wakeup mode for the epoll file descriptor that is being attached to the
     * target file descriptor, fd. When a wakeup event occurs and multiple epoll file descriptors
     * are attached to the same target file using EPOLLEXCLUSIVE, one or more of the epoll file
     * descriptors will receive an event with epoll_wait(2). The default in this scenario (when
     * EPOLLEXCLUSIVE is not set) is for all epoll file descriptors to receive an event.
     * EPOLLEXCLUSIVE is thus useful for avoiding thundering herd problems in certain scenarios.
     */
    EXCLUSIVE = 0x10000000,
}

/**
 * Flags for `sync_file_range(2)` operation.
 *
 * See_Also: `sync_file_range(2)` for details
 */
enum SyncFileRangeFlags : uint
{
    NOOP = 0, /// no operation

    /// Wait upon write-out of all pages in the specified range that have already been submitted to
    /// the device driver for write-out before performing any write.
    WAIT_BEFORE = 1U << 0,

    /// Initiate write-out of all dirty pages in the specified range which are not presently
    /// submitted for write-out. Note that even this may block if you attempt to write more than
    /// request queue size.
    WRITE = 1U << 1,

    /// Wait upon write-out of all pages in the range after performing any write.
    WAIT_AFTER = 1U << 2,

    /// This is a write-for-data-integrity operation that will ensure that all pages in the
    /// specified range which were dirty when sync_file_range() was called are committed to disk.
    WRITE_AND_WAIT = WAIT_BEFORE | WRITE | WAIT_AFTER
}

/**
 * Flags for `sendmsg(2)` and `recvmsg(2)` operations.
 *
 * See_Also: man pages for the operations.
 */
enum MsgFlags : uint
{
    /// No flags defined
    NONE = 0,

    /// Sends out-of-band data on sockets that support this notion (e.g., of type `SOCK_STREAM`); the
    /// underlying protocol must also support out-of-band data.
    OOB = 0x01,

    /// This flag causes the receive operation to return data from the beginning of the receive
    /// queue without removing that data from the queue. Thus, a subsequent receive call will return
    /// the same data.
    PEEK = 0x02,

    /// Don't use a gateway to send out the packet, send to hosts only on directly connected
    /// networks. This is usually used only by diagnostic or routing programs. This is defined only
    /// for protocol families that route; packet sockets don't.
    DONTROUTE = 0x04,

    /// For raw (`AF_PACKET`), Internet datagram (since Linux 2.4.27/2.6.8), netlink (since Linux
    /// 2.6.22), and UNIX datagram (since Linux 3.4) sockets: return the real length of the packet
    /// or datagram, even when it was longer than the passed buffer.
    ///
    /// For use with Internet stream sockets, see `tcp(7)`.
    TRUNC = 0x20,

    /// Enables nonblocking operation; if the operation would block, EAGAIN or EWOULDBLOCK is
    /// returned. This provides similar behavior to setting the O_NONBLOCK flag (via the `fcntl(2)`
    /// F_SETFL operation), but differs in that `MSG_DONTWAIT` is a per-call option, whereas
    /// `O_NONBLOCK` is a setting on the open file description (see `open(2)`), which will affect
    /// all threads in the calling process as well as other processes that hold file descriptors
    /// referring to the same open file description.
    DONTWAIT = 0x40,

    /// Terminates a record (when this notion is supported, as for sockets of type `SOCK_SEQPACKET`).
    EOR = 0x80,

    /// This flag requests that the operation block until the full request is satisfied. However,
    /// the call may still return less data than requested if a signal is caught, an error or
    /// disconnect occurs, or the next data to be received is of a different type than that
    /// returned. This flag has no effect for datagram sockets.
    WAITALL = 0x100,

    /// Tell the link layer that forward progress happened: you got a successful reply from the
    /// other side. If the link layer doesn't get this it will regularly reprobe the neighbor (e.g.,
    /// via a unicast ARP). Valid only on SOCK_DGRAM and SOCK_RAW sockets and currently
    /// implemented only for IPv4 and IPv6. See arp(7) for details.
    CONFIRM = 0x800,

    /// This flag specifies that queued errors should be received from the socket error queue. The
    /// error is passed in an ancillary message with a type dependent on the protocol (for IPv4
    /// `IP_RECVERR`). The user should supply a buffer of sufficient size. See `cmsg(3)` and `ip(7)`
    /// for more information. The payload of the original packet that caused the error is passed as
    /// normal data via msg_iovec. The original destination address of the datagram that caused the
    /// error is supplied via `msg_name`.
    ERRQUEUE = 0x2000,

    /// Don't generate a `SIGPIPE` signal if the peer on a stream-oriented socket has closed the
    /// connection. The `EPIPE` error is still returned. This provides similar behavior to using
    /// `sigaction(2)` to ignore `SIGPIPE`, but, whereas `MSG_NOSIGNAL` is a per-call feature,
    /// ignoring `SIGPIPE` sets a process attribute that affects all threads in the process.
    NOSIGNAL = 0x4000,

    /// The caller has more data to send. This flag is used with TCP sockets to obtain the same
    /// effect as the `TCP_CORK` socket option (see `tcp(7)`), with the difference that this flag can be
    /// set on a per-call basis.
    ///
    /// Since Linux 2.6, this flag is also supported for UDP sockets, and informs the kernel to
    /// package all of the data sent in calls with this flag set into a single datagram which is
    /// transmitted only when a call is performed that does not specify this flag.
    ///
    /// See_Also: the `UDP_CORK` socket option described in `udp(7)`
    MORE = 0x8000,

    /// Set the close-on-exec flag for the file descriptor received via a UNIX domain file
    /// descriptor using the `SCM_RIGHTS` operation (described in `unix(7)`). This flag is useful
    /// for the same reasons as the `O_CLOEXEC` flag of `open(2)`. (recvmsg only)
    CMSG_CLOEXEC = 0x40000000
}

/** sqe->timeout_flags
 */
enum TimeoutFlags : uint
{
    REL = 0,        /// Relative time is the default
    ABS = 1U << 0,  /// Absolute time - `IORING_TIMEOUT_ABS` (from Linux 5.5)

    /**
     * `IORING_TIMEOUT_UPDATE` (from Linux 5.11)
     *
     * Support timeout updates through `IORING_OP_TIMEOUT_REMOVE` with passed in `IORING_TIMEOUT_UPDATE`.
     */
    UPDATE = 1U << 1,

    /**
     * `IORING_TIMEOUT_BOOTTIME` (from Linux 5.15)
     */
    BOOTTIME = 1U << 2,

    /**
     * `IORING_TIMEOUT_REALTIME` (from Linux 5.15)
     */
    REALTIME = 1U << 3,

    /**
     * `IORING_LINK_TIMEOUT_UPDATE` (from Linux 5.15)
     */
    LINK_TIMEOUT_UPDATE = 1U << 4,

    /**
     * `IORING_TIMEOUT_CLOCK_MASK` (from Linux 5.15)
     */
    CLOCK_MASK = BOOTTIME | REALTIME,

    /**
     * `IORING_TIMEOUT_UPDATE_MASK` (from Linux 5.15)
     */
    UPDATE_MASK = UPDATE | LINK_TIMEOUT_UPDATE,
}
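
// A hedged, illustrative sketch (not from the upstream sources): preparing a relative
// one-second TIMEOUT request. The layout follows the field descriptions above: `addr`
// points to a single KernelTimespec and `len` is 1; `off` would hold a completion-count
// trigger and is left at 0 here. The pointer is only valid while `ts` is alive.
@system unittest
{
    auto ts = KernelTimespec(1, 0);         // 1 second, 0 nanoseconds
    SubmissionEntry e;
    e.opcode = Operation.TIMEOUT;           // available from Linux 5.4
    e.addr = cast(ulong)&ts;                // pointer to exactly one timespec
    e.len = 1;
    e.timeout_flags = TimeoutFlags.REL;     // relative timeout (the default)
    assert(e.timeout_flags == TimeoutFlags.REL);
}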

/**
 * sqe->splice_flags
 * extends splice(2) flags
 */
enum SPLICE_F_FD_IN_FIXED = 1U << 31; /* the last bit of __u32 */

/**
 * POLL_ADD flags
 *
 * Note that since sqe->poll_events is the flag space, the command flags for POLL_ADD are stored in
 * sqe->len.
 */
enum PollFlags : uint
{
    NONE = 0,

    /**
     * `IORING_POLL_ADD_MULTI` - Multishot poll. Sets `IORING_CQE_F_MORE` if the poll handler will
     * continue to report CQEs on behalf of the same SQE.
     *
     * The default io_uring poll mode is one-shot, where once the event triggers, the poll command
     * is completed and won't trigger any further events. If we're doing repeated polling on the
     * same file or socket, then it can be more efficient to do multishot, where we keep triggering
     * whenever the event becomes true.
     *
     * This deviates from the usual norm of having one CQE per SQE submitted. Add a CQE flag,
     * IORING_CQE_F_MORE, which tells the application to expect further completion events from the
     * submitted SQE. Right now the only user of this is POLL_ADD in multishot mode.
     *
     * An application should expect more CQEs for the specified SQE if the CQE is flagged with
     * IORING_CQE_F_MORE. In multishot mode, only cancellation or an error will terminate the poll
     * request, in which case the flag will be cleared.
     *
     * Note: available from Linux 5.13
     */
    ADD_MULTI = 1U << 0,

    /**
     * `IORING_POLL_UPDATE_EVENTS`
     *
     * Update existing poll request, matching sqe->addr as the old user_data field.
     *
     * Note: available from Linux 5.13
     */
    UPDATE_EVENTS = 1U << 1,

    /**
     * `IORING_POLL_UPDATE_USER_DATA`
     *
     * Update existing poll request, matching sqe->addr as the old user_data field.
     *
     * Note: available from Linux 5.13
     */
    UPDATE_USER_DATA = 1U << 2,
}
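
// An illustrative sketch (not part of the upstream sources): preparing a multishot
// POLL_ADD request. As noted above, the POLL_ADD command flags go into sqe->len, while
// the poll mask itself goes into the 32-bit poll_events32 field (Linux 5.9+). The fd
// and user_data values are assumptions made up for the example.
@system unittest
{
    SubmissionEntry e;
    e.opcode = Operation.POLL_ADD;
    e.fd = 0;                           // descriptor to watch
    e.poll_events32 = PollEvents.IN;    // wake up when readable
    e.len = PollFlags.ADD_MULTI;        // keep reporting CQEs (flagged with CQEFlags.MORE)
    e.user_data = 7;                    // identifies this poll in completions
    assert(e.len == PollFlags.ADD_MULTI);
}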

/**
 * Flags that can be used with the `accept4(2)` operation.
 */
enum AcceptFlags : uint
{
    /// Same as `accept()`
    NONE = 0,

    /// Set the `O_NONBLOCK` file status flag on the new open file description. Using this flag saves
    /// extra calls to `fcntl(2)` to achieve the same result.
    NONBLOCK = 0x800,   // octal 00004000

    /// Set the close-on-exec (`FD_CLOEXEC`) flag on the new file descriptor. See the description of
    /// the `O_CLOEXEC` flag in `open(2)` for reasons why this may be useful.
    CLOEXEC = 0x80000   // octal 02000000
}

/**
 * Describes the operation to be performed
 *
 * See_Also: `io_uring_enter(2)`
 */
enum Operation : ubyte
{
    // available from Linux 5.1
    NOP = 0,                /// IORING_OP_NOP
    READV = 1,              /// IORING_OP_READV
    WRITEV = 2,             /// IORING_OP_WRITEV
    FSYNC = 3,              /// IORING_OP_FSYNC
    READ_FIXED = 4,         /// IORING_OP_READ_FIXED
    WRITE_FIXED = 5,        /// IORING_OP_WRITE_FIXED
    POLL_ADD = 6,           /// IORING_OP_POLL_ADD
    POLL_REMOVE = 7,        /// IORING_OP_POLL_REMOVE

    // available from Linux 5.2
    SYNC_FILE_RANGE = 8,    /// IORING_OP_SYNC_FILE_RANGE

    // available from Linux 5.3
    SENDMSG = 9,            /// IORING_OP_SENDMSG
    RECVMSG = 10,           /// IORING_OP_RECVMSG

    // available from Linux 5.4
    TIMEOUT = 11,           /// IORING_OP_TIMEOUT

    // available from Linux 5.5
    TIMEOUT_REMOVE = 12,    /// IORING_OP_TIMEOUT_REMOVE
    ACCEPT = 13,            /// IORING_OP_ACCEPT
    ASYNC_CANCEL = 14,      /// IORING_OP_ASYNC_CANCEL
    LINK_TIMEOUT = 15,      /// IORING_OP_LINK_TIMEOUT
    CONNECT = 16,           /// IORING_OP_CONNECT

    // available from Linux 5.6
    FALLOCATE = 17,         /// IORING_OP_FALLOCATE
    OPENAT = 18,            /// IORING_OP_OPENAT
    CLOSE = 19,             /// IORING_OP_CLOSE
    FILES_UPDATE = 20,      /// IORING_OP_FILES_UPDATE
    STATX = 21,             /// IORING_OP_STATX
    READ = 22,              /// IORING_OP_READ
    WRITE = 23,             /// IORING_OP_WRITE
    FADVISE = 24,           /// IORING_OP_FADVISE
    MADVISE = 25,           /// IORING_OP_MADVISE
    SEND = 26,              /// IORING_OP_SEND
    RECV = 27,              /// IORING_OP_RECV
    OPENAT2 = 28,           /// IORING_OP_OPENAT2
    EPOLL_CTL = 29,         /// IORING_OP_EPOLL_CTL

    // available from Linux 5.7
    SPLICE = 30,            /// IORING_OP_SPLICE
    PROVIDE_BUFFERS = 31,   /// IORING_OP_PROVIDE_BUFFERS
    REMOVE_BUFFERS = 32,    /// IORING_OP_REMOVE_BUFFERS

    // available from Linux 5.8
    TEE = 33,               /// IORING_OP_TEE

    // available from Linux 5.11
    SHUTDOWN = 34,          /// IORING_OP_SHUTDOWN
    RENAMEAT = 35,          /// IORING_OP_RENAMEAT - see renameat2()
    UNLINKAT = 36,          /// IORING_OP_UNLINKAT - see unlinkat(2)

    // available from Linux 5.15
    MKDIRAT = 37,           /// IORING_OP_MKDIRAT - see mkdirat(2)
    SYMLINKAT = 38,         /// IORING_OP_SYMLINKAT - see symlinkat(2)
    LINKAT = 39,            /// IORING_OP_LINKAT - see linkat(2)
}

/// sqe->flags
enum SubmissionEntryFlags : ubyte
{
    NONE = 0,

    /// Use fixed fileset (`IOSQE_FIXED_FILE`)
    ///
    /// When this flag is specified, fd is an index into the files array registered with the
    /// io_uring instance (see the `IORING_REGISTER_FILES` section of the io_uring_register(2) man
    /// page).
    FIXED_FILE = 1U << 0,

    /**
     * `IOSQE_IO_DRAIN`: issue after inflight IO
     *
     * If a request is marked with `IO_DRAIN`, then previous commands must complete before this one
     * is issued. Subsequent requests are not started until the drain has completed.
     *
     * Note: available from Linux 5.2
     */
    IO_DRAIN = 1U << 1,

    /**
     * `IOSQE_IO_LINK`
     *
     * If set, the next SQE in the ring will depend on this SQE. A dependent SQE will not be started
     * until the parent SQE has completed. If the parent SQE fails, then a dependent SQE will be
     * failed without being started. Link chains can be arbitrarily long, the chain spans any new
     * SQE that continues to have the IOSQE_IO_LINK flag set. Once an SQE is encountered that does
     * not have this flag set, that defines the end of the chain. This feature allows forming
     * dependencies between individual SQEs. (A small linking sketch follows this enum.)
     *
     * Note: available from Linux 5.3
     */
    IO_LINK = 1U << 2,

    /**
     * `IOSQE_IO_HARDLINK` - like LINK, but stronger
     *
     * Some commands will invariably end in a failure in the sense that the
     * completion result will be less than zero. One such example is timeouts
     * that don't have a completion count set, they will always complete with
     * `-ETIME` unless cancelled.
     *
     * For linked commands, we sever links and fail the rest of the chain if
     * the result is less than zero. Since we have commands where we know that
     * will happen, add IOSQE_IO_HARDLINK as a stronger link that doesn't sever
     * regardless of the completion result. Note that the link will still sever
     * if we fail submitting the parent request, hard links are only resilient
     * in the presence of completion results for requests that did submit
     * correctly.
     *
     * Note: available from Linux 5.5
     */
    IO_HARDLINK = 1U << 3,

    /**
     * `IOSQE_ASYNC`
     *
     * io_uring defaults to always doing inline submissions, if at all possible. But for larger
     * copies, even if the data is fully cached, that can take a long time. Add an IOSQE_ASYNC flag
     * that the application can set on the SQE - if set, it'll ensure that we always go async for
     * those kinds of requests.
     *
     * Note: available from Linux 5.6
     */
    ASYNC = 1U << 4,            /* always go async */

    /**
     * `IOSQE_BUFFER_SELECT`
     * If a server process has tons of pending socket connections, generally it uses epoll to wait
     * for activity. When the socket is ready for reading (or writing), the task can select a buffer
     * and issue a recv/send on the given fd.
     *
     * Now that we have fast (non-async thread) support, a task can have tons of pending reads or
     * writes pending. But that means they need buffers to back that data, and if the number of
     * connections is high enough, having them preallocated for all possible connections is
     * unfeasible.
     *
     * With IORING_OP_PROVIDE_BUFFERS, an application can register buffers to use for any request.
     * The request then sets IOSQE_BUFFER_SELECT in the sqe, and a given group ID in sqe->buf_group.
     * When the fd becomes ready, a free buffer from the specified group is selected. If none are
     * available, the request is terminated with -ENOBUFS. If successful, the CQE on completion will
     * contain the buffer ID chosen in the cqe->flags member, encoded as:
     *
     * `(buffer_id << IORING_CQE_BUFFER_SHIFT) | IORING_CQE_F_BUFFER;`
     *
     * Once a buffer has been consumed by a request, it is no longer available and must be
     * registered again with IORING_OP_PROVIDE_BUFFERS.
     *
     * Requests need to support this feature. For now, IORING_OP_READ and IORING_OP_RECV support it.
     * This is checked on SQE submission, a CQE with res == -EOPNOTSUPP will be posted if attempted
     * on unsupported requests.
     *
     * Note: available from Linux 5.7
     */
    BUFFER_SELECT = 1U << 5,    /* select buffer from sqe->buf_group */
}
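
// An illustrative sketch (not part of the upstream sources): linking a write to a
// subsequent fsync so the fsync only starts once the write has completed. Descriptor
// and iovec details are omitted; only the link-flag handling is shown.
@system unittest
{
    SubmissionEntry[2] chain;
    chain[0].opcode = Operation.WRITEV;
    chain[0].flags = SubmissionEntryFlags.IO_LINK;  // next SQE depends on this one
    chain[1].opcode = Operation.FSYNC;
    chain[1].fsync_flags = FsyncFlags.NORMAL;       // last SQE in the chain carries no link flag
    assert(chain[0].flags & SubmissionEntryFlags.IO_LINK);
    assert(chain[1].flags == SubmissionEntryFlags.NONE);
}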

/**
 * IO completion data structure (Completion Queue Entry)
 *
 * C API: `struct io_uring_cqe`
 */
struct CompletionEntry
{
    ulong user_data;    /** sqe->data submission passed back */
    int res;            /** result code for this event */
    CQEFlags flags;
}

/// Flags used with `CompletionEntry`
enum CQEFlags : uint
{
    NONE = 0,   /// No flags set

    /// `IORING_CQE_F_BUFFER` (from Linux 5.7)
    /// If set, the upper 16 bits are the buffer ID
    BUFFER = 1U << 0,

    /// `IORING_CQE_F_MORE` (from Linux 5.13)
    /// If set, parent SQE will generate more CQE entries
    MORE = 1U << 1,
}

enum
{
    CQE_BUFFER_SHIFT = 16, /// Note: available from Linux 5.7
}
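
// An illustrative sketch (not part of the upstream sources): decoding the selected
// buffer ID from a completion when BUFFER_SELECT was used. The flag value below is
// fabricated the same way the kernel encodes it, just to exercise the decoding path.
@system unittest
{
    CompletionEntry cqe;
    cqe.flags = cast(CQEFlags)((42 << CQE_BUFFER_SHIFT) | CQEFlags.BUFFER);
    if (cqe.flags & CQEFlags.BUFFER)
    {
        immutable bufferId = cast(ushort)(cqe.flags >> CQE_BUFFER_SHIFT);
        assert(bufferId == 42);
    }
}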

/**
 * Passed in for io_uring_setup(2). Copied back with updated info on success.
 *
 * C API: `struct io_uring_params`
 */
struct SetupParameters
{
    // Magic offsets for the application to mmap the data it needs

    /// `IORING_OFF_SQ_RING`: mmap offset for submission queue ring
    enum ulong SUBMISSION_QUEUE_RING_OFFSET = 0UL;
    /// `IORING_OFF_CQ_RING`: mmap offset for completion queue ring
    enum ulong COMPLETION_QUEUE_RING_OFFSET = 0x8000000UL;
    /// `IORING_OFF_SQES`: mmap offset for submission entries
    enum ulong SUBMISSION_QUEUE_ENTRIES_OFFSET = 0x10000000UL;

    /// (output) allocated entries in submission queue
    /// (both ring index `array` and separate entry array at `SUBMISSION_QUEUE_ENTRIES_OFFSET`).
    uint sq_entries;

    /// (output) allocated entries in completion queue
    uint cq_entries;

    SetupFlags flags;   /// (input)

    /// (input) used if SQ_AFF and SQPOLL flags are active to pin poll thread to specific cpu.
    /// right now always checked in kernel for "possible cpu".
    uint sq_thread_cpu;

    /// (input) used if SQPOLL flag is active; timeout in milliseconds
    /// until kernel poll thread goes to sleep.
    uint sq_thread_idle;

    SetupFeatures features;             /// (from Linux 5.4)
    uint wq_fd;                         /// (from Linux 5.6)
    private uint[3] resv;               // reserved
    SubmissionQueueRingOffsets sq_off;  /// (output) submission queue ring data field offsets
    CompletionQueueRingOffsets cq_off;  /// (output) completion queue ring data field offsets
}

/// `io_uring_setup()` flags
enum SetupFlags : uint
{
    /// No flags set
    NONE = 0,

    /**
     * `IORING_SETUP_IOPOLL`
     *
     * Perform busy-waiting for an I/O completion, as opposed to getting notifications via an
     * asynchronous IRQ (Interrupt Request). The file system (if any) and block device must
     * support polling in order for this to work. Busy-waiting provides lower latency, but may
     * consume more CPU resources than interrupt driven I/O. Currently, this feature is usable
     * only on a file descriptor opened using the O_DIRECT flag. When a read or write is submitted
     * to a polled context, the application must poll for completions on the CQ ring by calling
     * io_uring_enter(2). It is illegal to mix and match polled and non-polled I/O on an io_uring
     * instance.
     */
    IOPOLL = 1U << 0,

    /**
     * `IORING_SETUP_SQPOLL`
     *
     * When this flag is specified, a kernel thread is created to perform submission queue polling.
     * An io_uring instance configured in this way enables an application to issue I/O without ever
     * context switching into the kernel.
     * By using the submission queue to fill in new submission queue entries and watching for
     * completions on the completion queue, the application can submit and reap I/Os without doing
     * a single system call.
     * If the kernel thread is idle for more than sq_thread_idle milliseconds, it will set the
     * IORING_SQ_NEED_WAKEUP bit in the flags field of the struct io_sq_ring. When this happens,
     * the application must call io_uring_enter(2) to wake the kernel thread. If I/O is kept busy,
     * the kernel thread will never sleep. An application making use of this feature will need to
     * guard the io_uring_enter(2) call with the following code sequence:
     *
     * ```
     * // Ensure that the wakeup flag is read after the tail pointer has been written.
     * smp_mb();
     * if (*sq_ring->flags & IORING_SQ_NEED_WAKEUP)
     *     io_uring_enter(fd, 0, 0, IORING_ENTER_SQ_WAKEUP);
     * ```
     *
     * where sq_ring is a submission queue ring setup using the struct io_sqring_offsets described below.
     *
     * To successfully use this feature, the application must register a set of files to be used for
     * IO through io_uring_register(2) using the IORING_REGISTER_FILES opcode. Failure to do so will
     * result in submitted IO being errored with EBADF.
     */
    SQPOLL = 1U << 1,

    /**
     * `IORING_SETUP_SQ_AFF`
     *
     * If this flag is specified, then the poll thread will be bound to the cpu set in the
     * sq_thread_cpu field of the struct io_uring_params. This flag is only meaningful when
     * IORING_SETUP_SQPOLL is specified.
     */
    SQ_AFF = 1U << 2,

    /**
     * `IORING_SETUP_CQSIZE`
     *
     * Create the completion queue with struct io_uring_params.cq_entries entries. The value must
     * be greater than entries, and may be rounded up to the next power-of-two.
     *
     * Note: Available from Linux 5.5
     */
    CQSIZE = 1U << 3,

    /**
     * `IORING_SETUP_CLAMP`
     *
     * Some applications like to start small in terms of ring size, and then ramp up as needed. This
     * is a bit tricky to do currently, since we don't advertise the max ring size.
     *
     * This adds IORING_SETUP_CLAMP. If set, and the values for SQ or CQ ring size exceed what we
     * support, then clamp them at the max values instead of returning -EINVAL. Since we return the
     * chosen ring sizes after setup, no further changes are needed on the application side.
     * io_uring already changes the ring sizes if the application doesn't ask for power-of-two
     * sizes, for example.
     *
     * Note: Available from Linux 5.6
     */
    CLAMP = 1U << 4,        /* clamp SQ/CQ ring sizes */

    /**
     * `IORING_SETUP_ATTACH_WQ`
     *
     * If IORING_SETUP_ATTACH_WQ is set, it expects wq_fd in io_uring_params to be a valid io_uring
     * fd io-wq of which will be shared with the newly created io_uring instance. If the flag is set
     * but it can't share io-wq, it fails.
     *
     * This allows creation of "sibling" io_urings, where we prefer to keep the SQ/CQ private, but
     * want to share the async backend to minimize the amount of overhead associated with having
     * multiple rings that belong to the same backend.
     *
     * Note: Available from Linux 5.6
     */
    ATTACH_WQ = 1U << 5,    /* attach to existing wq */

    /**
     * `IORING_SETUP_R_DISABLED` flag to start the rings disabled, allowing the user to register
     * restrictions, buffers, files, before starting to process SQEs.
     *
     * When `IORING_SETUP_R_DISABLED` is set, SQEs are not processed and the SQPOLL kthread is not started.
     *
     * Restriction registration is only allowed while the rings are disabled, to prevent
     * concurrency issues while processing SQEs.
     *
     * The rings can be enabled using `IORING_REGISTER_ENABLE_RINGS` opcode with io_uring_register(2).
     *
     * Note: Available from Linux 5.10
     */
    R_DISABLED = 1U << 6,   /* start with ring disabled */
}

/// `io_uring_params->features` flags
enum SetupFeatures : uint
{
    NONE = 0,

    /**
     * `IORING_FEAT_SINGLE_MMAP` (from Linux 5.4)
     *
     * Indicates that we can use single mmap feature to map both sq and cq rings and so to avoid the
     * second mmap.
     */
    SINGLE_MMAP = 1U << 0,

    /**
     * `IORING_FEAT_NODROP` (from Linux 5.5)
     *
     * Currently we drop completion events, if the CQ ring is full. That's fine
     * for requests with bounded completion times, but it may make it harder or
     * impossible to use io_uring with networked IO where request completion
     * times are generally unbounded. Or with POLL, for example, which is also
     * unbounded.
     *
     * After this patch, we never overflow the ring, we simply store requests
     * in a backlog for later flushing. This flushing is done automatically by
     * the kernel. To prevent the backlog from growing indefinitely, if the
     * backlog is non-empty, we apply back pressure on IO submissions. Any
     * attempt to submit new IO with a non-empty backlog will get an -EBUSY
     * return from the kernel. This is a signal to the application that it has
     * backlogged CQ events, and that it must reap those before being allowed
     * to submit more IO.
     *
     * Note that if we do return -EBUSY, we will have filled whatever
     * backlogged events into the CQ ring first, if there's room. This means
     * the application can safely reap events WITHOUT entering the kernel and
     * waiting for them, they are already available in the CQ ring.
     */
    NODROP = 1U << 1,

    /**
     * `IORING_FEAT_SUBMIT_STABLE` (from Linux 5.5)
     *
     * If this flag is set, applications can be certain that any data for async offload has been
     * consumed when the kernel has consumed the SQE.
     */
    SUBMIT_STABLE = 1U << 2,

    /**
     * `IORING_FEAT_RW_CUR_POS` (from Linux 5.6)
     *
     * If this flag is set, applications can know if setting `-1` as file offsets (meaning to work
     * with current file position) is supported.
     */
    RW_CUR_POS = 1U << 3,

    /**
     * `IORING_FEAT_CUR_PERSONALITY` (from Linux 5.6)
     *
     * We currently setup the io_wq with a static set of mm and creds. Even for a single-use io-wq
     * per io_uring, this is suboptimal as we may have multiple enters of the ring. For
     * sharing the io-wq backend, it doesn't work at all.
     *
     * Switch to passing in the creds and mm when the work item is setup. This means that async
     * work is no longer deferred to the io_uring mm and creds, it is done with the current mm and
     * creds.
     *
     * Flag this behavior with IORING_FEAT_CUR_PERSONALITY, so applications know they can rely on
     * the current personality (mm and creds) being the same for direct issue and async issue.
     */
    CUR_PERSONALITY = 1U << 4,

    /**
     * `IORING_FEAT_FAST_POLL` (from Linux 5.7)
     *
     * Currently io_uring tries any request in a non-blocking manner, if it can, and then retries
     * from a worker thread if we get -EAGAIN. Now that we have a new and fancy poll based retry
     * backend, use that to retry requests if the file supports it.
     *
     * This means that, for example, an IORING_OP_RECVMSG on a socket no longer requires an async
     * thread to complete the IO. If we get -EAGAIN reading from the socket in a non-blocking
     * manner, we arm a poll handler for notification on when the socket becomes readable. When it
     * does, the pending read is executed directly by the task again, through the io_uring task
     * work handlers. Not only is this faster and more efficient, it also means we're not
     * generating potentially tons of async threads that just sit and block, waiting for the IO to
     * complete.
     *
     * The feature is marked with IORING_FEAT_FAST_POLL, meaning that async pollable IO is fast,
     * and that poll<link>other_op is fast as well.
     */
    FAST_POLL = 1U << 5,

    /**
     * `IORING_FEAT_POLL_32BITS` (from Linux 5.9)
     *
     * Poll events should be 32-bits to cover EPOLLEXCLUSIVE.
     * Explicit word-swap the poll32_events for big endian to make sure the ABI is not changed. We
     * call this feature IORING_FEAT_POLL_32BITS, applications who want to use EPOLLEXCLUSIVE should
     * check the feature bit first.
     */
    POLL_32BITS = 1U << 6,

    /**
     * `IORING_FEAT_SQPOLL_NONFIXED` (from Linux 5.11)
     *
     * The restriction of needing fixed files for SQPOLL is problematic, and prevents/inhibits
     * several valid use cases. With the referenced files_struct that we have now, it's trivially
     * supportable.
     *
     * Treat ->files like we do the mm for the SQPOLL thread - grab a reference to it (and assign
     * it), and drop it when we're done.
     *
     * This feature is exposed as IORING_FEAT_SQPOLL_NONFIXED.
     */
    SQPOLL_NONFIXED = 1U << 7,

    /**
     * `IORING_FEAT_EXT_ARG` (from Linux 5.11)
     *
     * Supports adding a timeout to the existing `io_uring_enter()` call.
     */
    EXT_ARG = 1U << 8,

    /// `IORING_FEAT_NATIVE_WORKERS (1U << 9)` (from Linux 5.12)
    NATIVE_WORKERS = 1U << 9,

    /// `IORING_FEAT_RSRC_TAGS (1U << 10)` (from Linux 5.13)
    RSRC_TAGS = 1U << 10,
}
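
// An illustrative sketch (not part of the upstream sources): how the feature bits
// reported back in SetupParameters.features would typically be tested. The bit values
// assigned here are made-up stand-ins for what io_uring_setup() fills in.
@system unittest
{
    SetupParameters params;
    params.features = cast(SetupFeatures)(SetupFeatures.SINGLE_MMAP | SetupFeatures.NODROP);
    assert(params.features & SetupFeatures.NODROP);         // may rely on the no-drop behaviour
    assert(!(params.features & SetupFeatures.FAST_POLL));   // this pretend kernel lacks fast poll
}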

/**
 * Filled with the offset for mmap(2)
 *
 * C API: `struct io_sqring_offsets`
 */
struct SubmissionQueueRingOffsets
{
    /// Incremented by kernel after entry at `head` was processed.
    /// Pending submissions: [head..tail]
    uint head;

    /// Modified by user space when new entry was queued; points to next
    /// entry user space is going to fill.
    uint tail;

    /// value `value_at(self.ring_entries) - 1`
    /// mask for indices at `head` and `tail` (don't delete masked bits!
    /// `head` and `tail` can point to the same entry, but if they are
    /// not exactly equal it implies the ring is full, and if they are
    /// exactly equal the ring is empty.)
    uint ring_mask;

    /// value same as SetupParameters.sq_entries, power of 2.
    uint ring_entries;

    /// SubmissionQueueFlags
    SubmissionQueueFlags flags;

    /// number of (invalid) entries that were dropped; entries are
    /// invalid if their index (in `array`) is out of bounds.
    uint dropped;

    /// index into array of `SubmissionEntry`s at offset `SUBMISSION_QUEUE_ENTRIES_OFFSET` in mmap()
    uint array;

    private uint[3] resv; // reserved
}

enum SubmissionQueueFlags : uint
{
    NONE = 0,

    /// `IORING_SQ_NEED_WAKEUP`: needs io_uring_enter wakeup
    /// set by kernel poll thread when it goes sleeping, and reset on wakeup
    NEED_WAKEUP = 1U << 0,

    /// `IORING_SQ_CQ_OVERFLOW`: CQ ring is overflown
    /// Since Kernel 5.8
    /// For those applications which are not willing to use io_uring_enter() to reap and handle
    /// cqes, they may completely rely on liburing's io_uring_peek_cqe(), but if cq ring has
    /// overflowed, currently because io_uring_peek_cqe() is not aware of this overflow, it won't
    /// enter kernel to flush cqes.
    /// To fix this issue, export cq overflow status to userspace by adding new
    /// IORING_SQ_CQ_OVERFLOW flag, then helper functions in liburing, such as io_uring_peek_cqe,
    /// can be aware of this cq overflow and do flush accordingly.
    CQ_OVERFLOW = 1U << 1
}

/**
 * Field offsets used to map kernel structure to ours.
 *
 * C API: `struct io_cqring_offsets`
 */
struct CompletionQueueRingOffsets
{
    /// incremented by user space after entry at `head` was processed.
    /// available entries for processing: [head..tail]
    uint head;

    /// modified by kernel when new entry was created; points to next
    /// entry kernel is going to fill.
    uint tail;

    /// value `value_at(ring_entries) - 1`
    /// mask for indices at `head` and `tail` (don't delete masked bits!
    /// `head` and `tail` can point to the same entry, but if they are
    /// not exactly equal it implies the ring is full, and if they are
    /// exactly equal the ring is empty.)
    uint ring_mask;

    /// value same as SetupParameters.cq_entries, power of 2.
    uint ring_entries;

    /// incremented by the kernel every time it failed to queue a
    /// completion event because the ring was full.
    uint overflow;

    /// Offset to array of completion queue entries
    uint cqes;

    CQRingFlags flags;      /// (available from Linux 5.8)
    private uint _resv1;
    private ulong _resv2;
}

/// CompletionQueue ring flags
enum CQRingFlags : uint
{
    NONE = 0, /// No flags set

    /// `IORING_CQ_EVENTFD_DISABLED` disable eventfd notifications (available from Linux 5.8)
    /// This new flag should be set/clear from the application to disable/enable eventfd notifications when a request is completed and queued to the CQ ring.
    ///
    /// Before this patch, notifications were always sent if an eventfd is registered, so IORING_CQ_EVENTFD_DISABLED is not set during the initialization.
    /// It will be up to the application to set the flag after initialization if no notifications are required at the beginning.
    EVENTFD_DISABLED = 1U << 0,
}
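
// An illustrative sketch (not part of the upstream sources) of the head/tail/mask
// arithmetic described above, run on a plain local array instead of the real mmap'ed
// ring. The index values are made up; only the masking rule is being demonstrated.
@system unittest
{
    CompletionEntry[4] cqes;                // stands in for the mmap'ed CQE array
    uint head = 6, tail = 8, mask = 3;      // ring_mask is always ring_entries - 1
    assert(tail - head == 2);               // two completions are ready to be reaped
    // Keep head/tail unmasked; apply the mask only when indexing into the array.
    CompletionEntry* next = &cqes[head & mask];
    assert(next is &cqes[2]);
    head++;                                 // consuming an entry advances the unmasked head
}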

/// io_uring_register(2) opcodes and arguments
enum RegisterOpCode : uint
{
    /**
     * `arg` points to a struct iovec array of nr_args entries. The buffers associated with the
     * iovecs will be locked in memory and charged against the user's RLIMIT_MEMLOCK resource limit.
     * See getrlimit(2) for more information. Additionally, there is a size limit of 1GiB per
     * buffer. Currently, the buffers must be anonymous, non-file-backed memory, such as that
     * returned by malloc(3) or mmap(2) with the MAP_ANONYMOUS flag set. It is expected that this
     * limitation will be lifted in the future. Huge pages are supported as well. Note that the
     * entire huge page will be pinned in the kernel, even if only a portion of it is used.
     *
     * After a successful call, the supplied buffers are mapped into the kernel and eligible for
     * I/O. To make use of them, the application must specify the IORING_OP_READ_FIXED or
     * IORING_OP_WRITE_FIXED opcodes in the submission queue entry (see the struct io_uring_sqe
     * definition in io_uring_enter(2)), and set the buf_index field to the desired buffer index.
     * The memory range described by the submission queue entry's addr and len fields must fall
     * within the indexed buffer.
     *
     * It is perfectly valid to setup a large buffer and then only use part of it for an I/O, as
     * long as the range is within the originally mapped region.
     *
     * An application can increase or decrease the size or number of registered buffers by first
     * unregistering the existing buffers, and then issuing a new call to io_uring_register() with
     * the new buffers.
     *
     * An application need not unregister buffers explicitly before shutting down the io_uring
     * instance. (A usage sketch follows this enum.)
     *
     * `IORING_REGISTER_BUFFERS`
     */
    REGISTER_BUFFERS = 0,

    /**
     * This operation takes no argument, and `arg` must be passed as NULL. All previously registered
     * buffers associated with the io_uring instance will be released.
     *
     * `IORING_UNREGISTER_BUFFERS`
     */
    UNREGISTER_BUFFERS = 1,

    /**
     * Register files for I/O. `arg` contains a pointer to an array of `nr_args` file descriptors
     * (signed 32 bit integers).
     *
     * To make use of the registered files, the IOSQE_FIXED_FILE flag must be set in the flags
     * member of the struct io_uring_sqe, and the fd member is set to the index of the file in the
     * file descriptor array.
     *
     * Files are automatically unregistered when the io_uring instance is torn down. An application
     * need only unregister if it wishes to register a new set of fds.
     *
     * `IORING_REGISTER_FILES`
     */
    REGISTER_FILES = 2,

    /**
     * This operation requires no argument, and `arg` must be passed as NULL. All previously
     * registered files associated with the io_uring instance will be unregistered.
     *
     * `IORING_UNREGISTER_FILES`
     */
    UNREGISTER_FILES = 3,

    /**
     * `IORING_REGISTER_EVENTFD`
     *
     * Registers eventfd that would be used to notify about completions on io_uring itself.
     *
     * Note: available from Linux 5.2
     */
    REGISTER_EVENTFD = 4,

    /**
     * `IORING_UNREGISTER_EVENTFD`
     *
     * Unregisters previously registered eventfd.
     *
     * Note: available from Linux 5.2
     */
    UNREGISTER_EVENTFD = 5,

    /// `IORING_REGISTER_FILES_UPDATE` (from Linux 5.5)
    REGISTER_FILES_UPDATE = 6,

    /**
     * `IORING_REGISTER_EVENTFD_ASYNC` (from Linux 5.6)
     *
     * If an application is using eventfd notifications with poll to know when new SQEs can be
     * issued, it's expecting the following read/writes to complete inline. And with that, it knows
     * that there are events available, and doesn't want spurious wakeups on the eventfd for those
     * requests.
     *
     * This adds IORING_REGISTER_EVENTFD_ASYNC, which works just like IORING_REGISTER_EVENTFD,
     * except it only triggers notifications for events that happen from async completions (IRQ, or
     * io-wq worker completions). Any completions inline from the submission itself will not
     * trigger notifications.
     */
    REGISTER_EVENTFD_ASYNC = 7,

    /**
     * `IORING_REGISTER_PROBE` (from Linux 5.6)
     *
     * The application currently has no way of knowing if a given opcode is supported or not
     * without having to try and issue one and see if we get -EINVAL or not. And even this approach
     * is fraught with peril, as maybe we're getting -EINVAL due to some fields being missing, or
     * maybe it's just not that easy to issue that particular command without doing some other leg
     * work in terms of setup first.
     *
     * This adds IORING_REGISTER_PROBE, which fills in a structure with info on what is supported
     * and what is not. This will work even with sparse opcode fields, which may happen in the
     * future or even today if someone backports specific features to older kernels.
     */
    REGISTER_PROBE = 8,

    /**
     * `IORING_REGISTER_PERSONALITY` (from Linux 5.6)
     *
     * If an application wants to use a ring with different kinds of credentials, it can register
     * them upfront. We don't lookup credentials, the credentials of the task calling
     * IORING_REGISTER_PERSONALITY is used.
     *
     * An 'id' is returned for the application to use in subsequent personality support.
     */
    REGISTER_PERSONALITY = 9,

    /// `IORING_UNREGISTER_PERSONALITY` (from Linux 5.6)
    UNREGISTER_PERSONALITY = 10,

    /**
     * `IORING_REGISTER_RESTRICTIONS` (from Linux 5.10)
     *
     * Permanently installs a feature allowlist on an io_ring_ctx. The io_ring_ctx can then be
     * passed to untrusted code with the knowledge that only operations present in the allowlist can
     * be executed.
     *
     * The allowlist approach ensures that new features added to io_uring do not accidentally become
     * available when an existing application is launched on a newer kernel version.
     *
     * Currently it's possible to restrict sqe opcodes, sqe flags, and register opcodes.
     *
     * `IORING_REGISTER_RESTRICTIONS` can only be made once. Afterwards it is not possible to
     * change restrictions anymore. This prevents untrusted code from removing restrictions.
     */
    REGISTER_RESTRICTIONS = 11,

    /**
     * `IORING_REGISTER_ENABLE_RINGS` (from Linux 5.10)
     *
     * This operation is to be used when rings are disabled on start with `IORING_SETUP_R_DISABLED`.
     */
    ENABLE_RINGS = 12,

    /**
     * `IORING_REGISTER_FILES2` (from Linux 5.13)
     */
    REGISTER_FILES2 = 13,

    /**
     * `IORING_REGISTER_FILES_UPDATE2` (from Linux 5.13)
     */
    REGISTER_FILES_UPDATE2 = 14,

    /**
     * `IORING_REGISTER_BUFFERS2` (from Linux 5.13)
     */
    REGISTER_BUFFERS2 = 15,

    /**
     * `IORING_REGISTER_BUFFERS_UPDATE` (from Linux 5.13)
     */
    REGISTER_BUFFERS_UPDATE = 16,

    /* set/clear io-wq thread affinities */
    /// `IORING_REGISTER_IOWQ_AFF` (from Linux 5.14)
    REGISTER_IOWQ_AFF = 17,

    /// `IORING_UNREGISTER_IOWQ_AFF` (from Linux 5.14)
    UNREGISTER_IOWQ_AFF = 18,

    /// `IORING_REGISTER_IOWQ_MAX_WORKERS` (from Linux 5.15)
    /// set/get max number of io-wq workers
    REGISTER_IOWQ_MAX_WORKERS = 19,
}
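
// A guarded, illustrative sketch (not from the upstream sources): registering a single
// fixed buffer as described for RegisterOpCode.REGISTER_BUFFERS, then releasing it
// again. Skipped when the kernel lacks io_uring; the buffer itself is an arbitrary
// stack array chosen just for the example.
@system unittest
{
    import core.sys.posix.sys.uio : iovec;
    import core.sys.posix.unistd : close;

    SetupParameters params;
    immutable fd = io_uring_setup(2, params);
    if (fd < 0) return;     // io_uring not available on this kernel; skip
    scope (exit) close(fd);

    ubyte[4096] storage;
    iovec[1] iov;
    iov[0].iov_base = storage.ptr;
    iov[0].iov_len = storage.length;

    // Registration may fail (e.g. RLIMIT_MEMLOCK); only unregister if it succeeded.
    if (io_uring_register(fd, RegisterOpCode.REGISTER_BUFFERS, iov.ptr, 1) == 0)
        io_uring_register(fd, RegisterOpCode.UNREGISTER_BUFFERS, null, 0);
}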

/* io-wq worker categories */
enum IOWQCategory
{
    BOUND,      /// `IO_WQ_BOUND`
    UNBOUND,    /// `IO_WQ_UNBOUND`
}

/// io_uring_enter(2) flags
enum EnterFlags : uint
{
    NONE = 0,
    GETEVENTS = 1U << 0,    /// `IORING_ENTER_GETEVENTS`
    SQ_WAKEUP = 1U << 1,    /// `IORING_ENTER_SQ_WAKEUP`

    /**
     * `IORING_ENTER_SQ_WAIT` (from Linux 5.10)
     *
     * When using SQPOLL, applications can run into the issue of running out of SQ ring entries
     * because the thread hasn't consumed them yet. The only option for dealing with that is
     * checking later, or busy checking for the condition.
     */
    SQ_WAIT = 1U << 2,

    /**
     * `IORING_ENTER_EXT_ARG` (from Linux 5.11)
     *
     * Adds support for timeout to existing io_uring_enter() function.
     */
    EXT_ARG = 1U << 3,
}

/// Time specification as defined in kernel headers (used by TIMEOUT operations)
struct KernelTimespec
{
    long tv_sec;    /// seconds
    long tv_nsec;   /// nanoseconds
}

static assert(CompletionEntry.sizeof == 16);
static assert(CompletionQueueRingOffsets.sizeof == 40);
static assert(SetupParameters.sizeof == 120);
static assert(SubmissionEntry.sizeof == 64);
static assert(SubmissionQueueRingOffsets.sizeof == 40);

/// Indicating that OP is supported by the kernel
enum IO_URING_OP_SUPPORTED = 1U << 0;

/**
 * Skip updating fd indexes set to this value in the fd table
 *
 * Support for skipping a file descriptor when using `IORING_REGISTER_FILES_UPDATE`.
 * `__io_sqe_files_update` will skip fds set to `IORING_REGISTER_FILES_SKIP`
 *
 * Note: Available from Linux 5.12
 */
enum IORING_REGISTER_FILES_SKIP = -2;

struct io_uring_probe_op
{
    ubyte op;
    ubyte resv;
    ushort flags;   /* IO_URING_OP_* flags */
    uint resv2;
}

struct io_uring_probe
{
    ubyte last_op;  /* last opcode supported */
    ubyte ops_len;  /* length of ops[] array below */
    ushort resv;
    uint[3] resv2;
    io_uring_probe_op[0] ops;
}

struct io_uring_restriction
{
    RestrictionOp opcode;
    union
    {
        ubyte register_op;  /// IORING_RESTRICTION_REGISTER_OP
        ubyte sqe_op;       /// IORING_RESTRICTION_SQE_OP
        ubyte sqe_flags;    /// IORING_RESTRICTION_SQE_FLAGS_*
    }
    ubyte resv;
    uint[3] resv2;
}

/**
 * io_uring_restriction->opcode values
 */
enum RestrictionOp : ushort
{
    /// Allow an io_uring_register(2) opcode
    IORING_RESTRICTION_REGISTER_OP = 0,

    /// Allow an sqe opcode
    IORING_RESTRICTION_SQE_OP = 1,

    /// Allow sqe flags
    IORING_RESTRICTION_SQE_FLAGS_ALLOWED = 2,

    /// Require sqe flags (these flags must be set on each submission)
    IORING_RESTRICTION_SQE_FLAGS_REQUIRED = 3,
}

struct io_uring_getevents_arg
{
    ulong sigmask;
    uint sigmask_sz;
    uint pad;
    ulong ts;
}

/**
 * Setup a context for performing asynchronous I/O.
 *
 * The `io_uring_setup()` system call sets up a submission queue (SQ) and completion queue (CQ) with
 * at least `entries` entries, and returns a file descriptor which can be used to perform subsequent
 * operations on the io_uring instance. The submission and completion queues are shared between
 * userspace and the kernel, which eliminates the need to copy data when initiating and completing
 * I/O.
 *
 * See_Also: `io_uring_setup(2)`
 *
 * Params:
 *   entries = Defines how many entries the submission queue can hold.
 *   p = `SetupParameters`
 *
 * Returns:
 *   `io_uring_setup(2)` returns a new file descriptor on success. The application may then provide
 *   the file descriptor in a subsequent `mmap(2)` call to map the submission and completion queues,
 *   or to the `io_uring_register(2)` or `io_uring_enter(2)` system calls.
 *
 *   On error, -1 is returned and `errno` is set appropriately.
 */
int io_uring_setup(uint entries, scope ref SetupParameters p) @trusted
{
    pragma(inline);
    return syscall(SYS_io_uring_setup, entries, &p);
}
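
// A guarded, illustrative sketch (not from the upstream sources): creating a tiny ring
// and inspecting what the kernel copied back into the parameters. The check is skipped
// when the running kernel has no io_uring support, so the example stays side-effect free.
@system unittest
{
    import core.sys.posix.unistd : close;

    SetupParameters params;
    immutable fd = io_uring_setup(2, params);
    if (fd < 0) return;     // io_uring not available on this kernel; nothing to verify
    scope (exit) close(fd);

    // The kernel reports the actual ring geometry (possibly rounded up) and feature bits.
    assert(params.sq_entries >= 2);
    assert(params.cq_entries >= params.sq_entries);
}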

/**
 * Initiate and/or complete asynchronous I/O
 *
 * `io_uring_enter()` is used to initiate and complete I/O using the shared submission and
 * completion queues setup by a call to `io_uring_setup(2)`. A single call can both submit new I/O
 * and wait for completions of I/O initiated by this call or previous calls to `io_uring_enter()`.
 *
 * When the system call returns that a certain number of SQEs have been consumed and submitted, it's
 * safe to reuse SQE entries in the ring. This is true even if the actual IO submission had to be
 * punted to async context, which means that the SQE may in fact not have been submitted yet. If the
 * kernel requires later use of a particular SQE entry, it will have made a private copy of it.
 *
 * Note: For interrupt driven I/O (where `IORING_SETUP_IOPOLL` was not specified in the call to
 * `io_uring_setup(2)`), an application may check the completion queue for event completions without
 * entering the kernel at all.
 *
 * See_Also: `io_uring_enter(2)`
 *
 * Params:
 *   fd = the file descriptor returned by io_uring_setup(2).
 *   to_submit = specifies the number of I/Os to submit from the submission queue.
 *   min_complete = If the `IORING_ENTER_GETEVENTS` bit is set in flags, then the system call will attempt
 *        to wait for `min_complete` event completions before returning. If the io_uring instance was configured
 *        for polling, by specifying IORING_SETUP_IOPOLL in the call to io_uring_setup(2), then
 *        min_complete has a slightly different meaning. Passing a value of 0 instructs the kernel to
 *        return any events which are already complete, without blocking. If min_complete is a non-zero
 *        value, the kernel will still return immediately if any completion events are available. If
 *        no event completions are available, then the call will poll either until one or more
 *        completions become available, or until the process has exceeded its scheduler time slice.
 *   flags = Behavior modification flags - `EnterFlags`
 *   sig = a pointer to a signal mask (see `sigprocmask(2)`); if sig is not `null`, `io_uring_enter()`
 *        first replaces the current signal mask by the one pointed to by sig, then waits for events to
 *        become available in the completion queue, and then restores the original signal mask. The
 *        following `io_uring_enter()` call:
 *
 *        ```
 *        ret = io_uring_enter(fd, 0, 1, IORING_ENTER_GETEVENTS, &sig);
 *        ```
 *
 *        is equivalent to atomically executing the following calls:
 *
 *        ```
 *        pthread_sigmask(SIG_SETMASK, &sig, &orig);
 *        ret = io_uring_enter(fd, 0, 1, IORING_ENTER_GETEVENTS, NULL);
 *        pthread_sigmask(SIG_SETMASK, &orig, NULL);
 *        ```
 *
 *        See the description of `pselect(2)` for an explanation of why the sig parameter is necessary.
 *
 * Returns: The number of I/Os successfully consumed. On error, -1 is returned and `errno` is set
 *      appropriately.
 */
int io_uring_enter(int fd, uint to_submit, uint min_complete, EnterFlags flags, const sigset_t* sig = null)
{
    pragma(inline);
    return syscall(SYS_io_uring_enter, fd, to_submit, min_complete, flags, sig, sigset_t.sizeof);
}

/// ditto
int io_uring_enter(int fd, uint to_submit, uint min_complete, EnterFlags flags, const io_uring_getevents_arg* args)
{
    pragma(inline);
    return syscall(SYS_io_uring_enter, fd, to_submit, min_complete, flags, args, io_uring_getevents_arg.sizeof);
}
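
// A guarded, illustrative sketch (not from the upstream sources): entering a freshly
// created ring without having queued anything. With to_submit = 0, min_complete = 0 and
// no flags, the call is expected to return immediately, reporting zero consumed SQEs.
// Skipped when io_uring is unavailable.
@system unittest
{
    import core.sys.posix.unistd : close;

    SetupParameters params;
    immutable fd = io_uring_setup(2, params);
    if (fd < 0) return;     // io_uring not available; skip
    scope (exit) close(fd);

    immutable consumed = io_uring_enter(fd, 0, 0, EnterFlags.NONE);
    assert(consumed == 0);
}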

/**
 * Register files or user buffers for asynchronous I/O.
 *
 * The `io_uring_register()` system call registers user buffers or files for use in an `io_uring(7)`
 * instance referenced by fd. Registering files or user buffers allows the kernel to take long term
 * references to internal data structures or create long term mappings of application memory,
 * greatly reducing per-I/O overhead.
 *
 * See_Also: `io_uring_register(2)`
 *
 * Params:
 *   fd = the file descriptor returned by a call to io_uring_setup(2)
 *   opcode = code of operation to execute on args
 *   arg = Args used by specified operation. See `RegisterOpCode` for usage details.
 *   nr_args = number of provided arguments
 *
 * Returns: On success, io_uring_register() returns 0. On error, -1 is returned, and errno is set accordingly.
 */
int io_uring_register(int fd, RegisterOpCode opcode, const(void)* arg, uint nr_args)
{
    pragma(inline);
    return syscall(SYS_io_uring_register, fd, opcode, arg, nr_args);
}

private:

// Syscalls
enum
{
    SYS_io_uring_setup = 425,
    SYS_io_uring_enter = 426,
    SYS_io_uring_register = 427
}

extern (C):

/// Invoke system call number `sysno`, passing it the remaining arguments.
int syscall(int sysno, ...);