1 /**
2  * io_uring system api definitions.
3  *
4  * See: https://github.com/torvalds/linux/blob/master/include/uapi/linux/io_uring.h
5  *
6  * Last changes from: dd47c104533dedb90434a3f142e94a671ac623a6 (20210913)
7  */
8 module during.io_uring;
9 
10 version (linux):
11 
12 import core.sys.posix.poll;
13 import core.sys.posix.signal;
14 
15 @system nothrow @nogc:
16 
17 /**
18  * IO operation submission data structure (Submission queue entry).
19  *
20  * C API: `struct io_uring_sqe`
21  */
22 struct SubmissionEntry
23 {
24     Operation               opcode;         /// type of operation for this sqe
25     SubmissionEntryFlags    flags;          /// IOSQE_ flags
26     ushort                  ioprio;         /// ioprio for the request
27     int                     fd;             /// file descriptor to do IO on
28     union
29     {
30         ulong off;                          /// offset into file
31         ulong addr2;                        /// from Linux 5.5
32     }
33 
34     union
35     {
36         ulong addr;                         /// pointer to buffer or iovecs
37         ulong splice_off_in;
38     }
39     uint len;                               /// buffer size or number of iovecs
40 
41     union
42     {
43         ReadWriteFlags      rw_flags;
44         FsyncFlags          fsync_flags;
45         ushort              poll_events;        /// Unused from 5.9, kept for compatibility reasons - see https://github.com/torvalds/linux/commit/5769a351b89cd4d82016f18fa5f6c4077403564d
46         PollEvents          poll_events32;      /// from Linux 5.9 - word-reversed for BE
47         SyncFileRangeFlags  sync_range_flags;   /// from Linux 5.2
48         MsgFlags            msg_flags;          /// from Linux 5.3
49         TimeoutFlags        timeout_flags;      /// from Linux 5.4
50         AcceptFlags         accept_flags;       /// from Linux 5.5
51         uint                cancel_flags;       /// from Linux 5.5
52         uint                open_flags;         /// from Linux 5.6
53         uint                statx_flags;        /// from Linux 5.6
54         uint                fadvise_advice;     /// from Linux 5.6
55         uint                splice_flags;       /// from Linux 5.7
56         uint                rename_flags;       /// from Linux 5.11
57         uint                unlink_flags;       /// from Linux 5.11
58         uint                hardlink_flags;     /// from Linux 5.15
59     }
60 
61     ulong user_data;                        /// data to be passed back at completion time
62 
63     union
64     {
65         align (1):
66         ushort buf_index;   /// index into fixed buffers, if used
67         ushort buf_group;   /// for grouped buffer selection
68     }
69 
70     ushort personality;     /// personality to use, if used
71     union
72     {
73         int splice_fd_in;
74         uint file_index;
75     }
76     ulong[2] __pad2;
77 
78     /// Resets entry fields
79     void clear() @safe nothrow @nogc
80     {
81         this = SubmissionEntry.init;
82     }
83 }
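
/*
 * Example (illustrative sketch, not part of the C API): filling a `SubmissionEntry` for a
 * vectored read. The file descriptor, iovec array and user data value are placeholders made
 * up for this example.
 *
 *     import core.sys.posix.sys.uio : iovec;
 *
 *     void prepReadv(ref SubmissionEntry e, int fd, iovec[] iov, ulong udata)
 *     {
 *         e.clear();                      // start from SubmissionEntry.init
 *         e.opcode = Operation.READV;     // IORING_OP_READV
 *         e.fd = fd;                      // plain fd (no IOSQE_FIXED_FILE)
 *         e.addr = cast(ulong) iov.ptr;   // pointer to the iovec array
 *         e.len = cast(uint) iov.length;  // number of iovecs
 *         e.off = 0;                      // file offset to read from
 *         e.user_data = udata;            // echoed back in CompletionEntry.user_data
 *     }
 */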
84 
85 enum ReadWriteFlags : int
86 {
87     NONE = 0,
88 
89     /// High priority read/write.  Allows block-based filesystems to
90     /// use polling of the device, which provides lower latency, but
91     /// may use additional resources.  (Currently, this feature is
92     /// usable only  on  a  file  descriptor opened using the
93     /// O_DIRECT flag.)
94     ///
95     /// (since Linux 4.6)
96     HIPRI = 0x00000001,
97 
98     /// Provide a per-write equivalent of the O_DSYNC open(2) flag.
99     /// This flag is meaningful only for pwritev2(), and its effect
100     /// applies only to the data range written by the system call.
101     ///
102     /// (since Linux 4.7)
103     DSYNC = 0x00000002,
104 
105     /// Provide a per-write equivalent of the O_SYNC open(2) flag.
106     /// This flag is meaningful only for pwritev2(), and its effect
107     /// applies only to the data range written by the system call.
108     ///
109     /// (since Linux 4.7)
110     SYNC = 0x00000004,
111 
112     /// Do not wait for data which is not immediately available.  If
113     /// this flag is specified, the preadv2() system call will
114     /// return instantly if it would have to read data from the
115     /// backing storage or wait for a lock.  If some data was
116     /// successfully read, it will return the number of bytes read.
117     /// If no bytes were read, it will return -1 and set errno to
118     /// EAGAIN.  Currently, this flag is meaningful only for
119     /// preadv2().
120     ///
121     /// (since Linux 4.14)
122     NOWAIT = 0x00000008,
123 
124     /// Provide a per-write equivalent of the O_APPEND open(2) flag.
125     /// This flag is meaningful only for pwritev2(), and its effect
126     /// applies only to the data range written by the system call.
127     /// The offset argument does not affect the write operation; the
128     /// data is always appended to the end of the file.  However, if
129     /// the offset argument is -1, the current file offset is
130     /// updated.
131     ///
132     /// (since Linux 4.16)
133     APPEND = 0x00000010
134 }
135 
136 enum FsyncFlags : uint
137 {
138     /// Normal file integrity sync
139     NORMAL      = 0,
140 
141     /**
142      * `fdatasync` semantics.
143      *
144      * See_Also: `fsync(2)` for details
145      */
146     DATASYNC    = (1 << 0)
147 }
148 
149 /** Possible poll event flags.
150  *  See: poll(2)
151  */
152 enum PollEvents : uint
153 {
154     NONE    = 0,
155 
156     /// There is data to read.
157     IN      = POLLIN,
158 
    /** Writing is now possible, though a write larger than the available
160      *  space in a socket or pipe will still block (unless O_NONBLOCK is set).
161      */
162     OUT     = POLLOUT,
163 
164     /** There is some exceptional condition on the file descriptor.
165      *  Possibilities include:
166      *
167      *  *  There is out-of-band data on a TCP socket (see tcp(7)).
168      *  *  A pseudoterminal master in packet mode has seen a state
169      *      change on the slave (see ioctl_tty(2)).
170      *  *  A cgroup.events file has been modified (see cgroups(7)).
171      */
172     PRI     = POLLPRI,
173 
174     /** Error condition (only returned in revents; ignored in events).
175       * This bit is also set for a file descriptor referring to the
176       * write end of a pipe when the read end has been closed.
177      */
178     ERR     = POLLERR,
179 
180     /// Invalid request: fd not open (only returned in revents; ignored in events).
181     NVAL    = POLLNVAL,
182 
183     RDNORM  = POLLRDNORM, /// Equivalent to POLLIN.
184     RDBAND  = POLLRDBAND, /// Priority band data can be read (generally unused on Linux).
185     WRNORM  = POLLWRNORM, /// Equivalent to POLLOUT.
186     WRBAND  = POLLWRBAND, /// Priority data may be written.
187 
188     /** Hang up (only returned in revents; ignored in events).  Note
189      *  that when reading from a channel such as a pipe or a stream
190      *  socket, this event merely indicates that the peer closed its
191      *  end of the channel.  Subsequent reads from the channel will
192      *  return 0 (end of file) only after all outstanding data in the
193      *  channel has been consumed.
194      */
195     HUP     = POLLHUP,
196 
197     /** (since Linux 2.6.17)
198      * Stream socket peer closed connection, or shut down writing half of connection.
199      */
200     RDHUP   = 0x2000,
201 
202     /** (since Linux 4.5)
203      * Sets an exclusive wakeup mode for the epoll file descriptor that is being attached to the
204      * target file descriptor, fd. When a wakeup event occurs and multiple epoll file descriptors
205      * are attached to the same target file using EPOLLEXCLUSIVE, one or more of the epoll file
206      * descriptors will receive an event with epoll_wait(2).  The default in this scenario (when
207      * EPOLLEXCLUSIVE is not set) is for all epoll file descriptors to receive an event.
208      * EPOLLEXCLUSIVE is thus useful for avoiding thundering herd problems in certain scenarios.
209      */
210     EXCLUSIVE = 0x10000000,
211 }
212 
213 /**
214  * Flags for `sync_file_range(2)` operation.
215  *
216  * See_Also: `sync_file_range(2)` for details
217  */
218 enum SyncFileRangeFlags : uint
219 {
220     NOOP            = 0, /// no operation
221     /// Wait upon write-out of all pages in the specified range that have already been submitted to
222     /// the device driver for write-out before performing any write.
223     WAIT_BEFORE     = 1U << 0,
224 
225     /// Initiate write-out of all dirty pages in the specified range which are not presently
    /// submitted for write-out. Note that even this may block if you attempt to write more than
    /// the request queue size.
228     WRITE           = 1U << 1,
229 
230     /// Wait upon write-out of all pages in the range after performing any write.
231     WAIT_AFTER      = 1U << 2,
232 
233     /// This is a write-for-data-integrity operation that will ensure that all pages in the
234     /// specified range which were dirty when sync_file_range() was called are committed to disk.
235     WRITE_AND_WAIT  = WAIT_BEFORE | WRITE | WAIT_AFTER
236 }
237 
238 /**
239  * Flags for `sendmsg(2)` and `recvmsg(2)` operations.
240  *
241  * See_Also: man pages for the operations.
242  */
243 enum MsgFlags : uint
244 {
245     /// No flags defined
246     NONE = 0,
247 
248     /// Sends out-of-band data on sockets that support this notion (e.g., of type `SOCK_STREAM`); the
249     /// underlying protocol must also support out-of-band data.
250     OOB = 0x01,
251 
252     /// This flag causes the receive operation to return data from the beginning of the receive
253     /// queue without removing that data from the queue. Thus, a subsequent receive call will return
254     /// the same data.
255     PEEK = 0x02,
256 
257     /// Don't use a gateway to send out the packet, send to hosts only on directly connected
258     /// networks. This is usually used only by diagnostic or routing programs. This is defined only
259     /// for protocol families that route; packet sockets don't.
260     DONTROUTE = 0x04,
261 
262     /// For raw (`AF_PACKET`), Internet datagram (since Linux 2.4.27/2.6.8), netlink (since Linux
263     /// 2.6.22), and UNIX datagram (since Linux 3.4) sockets: return the real length of the packet
264     /// or datagram, even when it was longer than the passed buffer.
265     ///
266     /// For use with Internet stream sockets, see `tcp(7)`.
267     TRUNC = 0x20,
268 
269     /// Enables nonblocking operation; if the operation would block, EAGAIN or EWOULDBLOCK is
270     /// returned. This provides similar behavior to setting the O_NONBLOCK flag (via the `fcntl(2)`
271     /// F_SETFL operation), but differs in that `MSG_DONTWAIT` is a per-call option, whereas
272     /// `O_NONBLOCK` is a setting on the open file description (see `open(2)`), which will affect
    /// all threads in the calling process as well as other processes that hold file descriptors
274     /// referring to the same open file description.
275     DONTWAIT = 0x40,
276 
277     /// Terminates a record (when this notion is supported, as for sockets of type `SOCK_SEQPACKET`).
278     EOR = 0x80,
279 
280     /// This flag requests that the operation block until the full request is satisfied. However,
281     /// the call may still return less data than requested if a signal is caught, an error or
282     /// disconnect occurs, or the next data to be received is of a different type than that
283     /// returned. This flag has no effect for datagram sockets.
284     WAITALL = 0x100,
285 
286     /// Tell the link layer that forward progress happened: you got a successful reply from the
287     /// other side. If the link layer doesn't get this it will regularly reprobe the neighbor (e.g.,
288     /// via a unicast ARP). Valid  only  on SOCK_DGRAM and SOCK_RAW sockets and currently
289     /// implemented only for IPv4 and IPv6. See arp(7) for details.
290     CONFIRM = 0x800,
291 
292     /// This flag specifies that queued errors should be received from the socket error queue. The
293     /// error is passed in an ancillary message with a type dependent on the protocol (for IPv4
294     /// `IP_RECVERR`). The user should supply a buffer of sufficient size. See `cmsg(3)` and `ip(7)`
295     /// for more information. The payload of the original packet that caused the error is passed as
296     /// normal data via msg_iovec. The original destination address of the datagram that caused the
297     /// error is supplied via `msg_name`.
298     ERRQUEUE = 0x2000,
299 
300     /// Don't generate a `SIGPIPE` signal if the peer on a stream-oriented socket has closed the
301     /// connection. The `EPIPE` error is still returned. This provides similar behavior to using
302     /// `sigaction(2)` to ignore `SIGPIPE`, but, whereas `MSG_NOSIGNAL` is a per-call feature,
303     /// ignoring `SIGPIPE` sets a process attribute that affects all threads in the process.
304     NOSIGNAL = 0x4000,
305 
306     /// The caller has more data to send. This flag is used with TCP sockets to obtain the same
307     /// effect as the `TCP_CORK` socket option (see `tcp(7)`), with the difference that this flag can be
308     /// set on a per-call basis.
309     ///
310     /// Since Linux 2.6, this flag is also supported for UDP sockets, and informs the kernel to
311     /// package all of the data sent in calls with this flag set into a single datagram which is
312     /// transmitted only when a call is performed that does not specify this flag.
313     ///
314     /// See_Also: the `UDP_CORK` socket option described in `udp(7)`
315     MORE = 0x8000,
316 
317     /// Set the close-on-exec flag for the file descriptor received via a UNIX domain file
318     /// descriptor using the `SCM_RIGHTS` operation (described in `unix(7)`). This flag is useful
319     /// for the same reasons as the `O_CLOEXEC` flag of `open(2)`. (recvmsg only)
320     CMSG_CLOEXEC = 0x40000000
321 }
322 
323 /** sqe->timeout_flags
324  */
325 enum TimeoutFlags : uint
326 {
327     REL = 0,            /// Relative time is the default
328     ABS = 1U << 0,      /// Absolute time - `IORING_TIMEOUT_ABS` (from Linux 5.5)
329 
330     /**
331      * `IORING_TIMEOUT_UPDATE` (from Linux 5.11)
332      *
333      * Support timeout updates through `IORING_OP_TIMEOUT_REMOVE` with passed in `IORING_TIMEOUT_UPDATE`.
334      */
335     UPDATE = 1U << 1,
336 
337     /**
338      * `IORING_TIMEOUT_BOOTTIME` (from Linux 5.15)
339      */
340     BOOTTIME = 1U << 2,
341 
342     /**
343      * `IORING_TIMEOUT_REALTIME` (from Linux 5.15)
344      */
345     REALTIME = 1U << 3,
346 
347     /**
348      * `IORING_LINK_TIMEOUT_UPDATE` (from Linux 5.15)
349      */
350     LINK_TIMEOUT_UPDATE = 1U << 4,
351 
352     /**
353      * `IORING_TIMEOUT_CLOCK_MASK` (from Linux 5.15)
354      */
355     CLOCK_MASK = BOOTTIME | REALTIME,
356 
357     /**
358      * `IORING_TIMEOUT_UPDATE_MASK` (from Linux 5.15)
359      */
360     UPDATE_MASK = UPDATE | LINK_TIMEOUT_UPDATE,
361 }
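
/*
 * Example (sketch): a TIMEOUT SQE that fires after 500 ms unless `count` completions arrive
 * first. It uses the `KernelTimespec` struct defined later in this module; the timespec must
 * stay alive until the request completes.
 *
 *     void prepTimeout(ref SubmissionEntry e, ref KernelTimespec ts, ulong count)
 *     {
 *         ts.tv_sec = 0;
 *         ts.tv_nsec = 500 * 1_000_000;       // 500 ms
 *         e.clear();
 *         e.opcode = Operation.TIMEOUT;
 *         e.fd = -1;
 *         e.addr = cast(ulong) &ts;           // pointer to a single timespec
 *         e.len = 1;                          // exactly one timespec
 *         e.off = count;                      // completion count that also satisfies the timeout
 *         e.timeout_flags = TimeoutFlags.REL; // relative timeout (the default)
 *     }
 */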
362 
363 /**
364  * sqe->splice_flags
365  * extends splice(2) flags
366  */
367 enum SPLICE_F_FD_IN_FIXED = 1U << 31; /* the last bit of __u32 */
368 
369 /**
370  * POLL_ADD flags
371  *
372  * Note that since sqe->poll_events is the flag space, the command flags for POLL_ADD are stored in
373  * sqe->len.
374  */
375 enum PollFlags : uint
376 {
377     NONE = 0,
378 
379     /**
380      * `IORING_POLL_ADD_MULTI` - Multishot poll. Sets `IORING_CQE_F_MORE` if the poll handler will
381      * continue to report CQEs on behalf of the same SQE.
382      *
383      * The default io_uring poll mode is one-shot, where once the event triggers, the poll command
384      * is completed and won't trigger any further events. If we're doing repeated polling on the
385      * same file or socket, then it can be more efficient to do multishot, where we keep triggering
386      * whenever the event becomes true.
387      *
388      * This deviates from the usual norm of having one CQE per SQE submitted. Add a CQE flag,
389      * IORING_CQE_F_MORE, which tells the application to expect further completion events from the
390      * submitted SQE. Right now the only user of this is POLL_ADD in multishot mode.
391      *
     * An application should expect more CQEs for the specified SQE if the CQE is flagged with
393      * IORING_CQE_F_MORE. In multishot mode, only cancelation or an error will terminate the poll
394      * request, in which case the flag will be cleared.
395      *
396      * Note: available from Linux 5.13
397      */
398     ADD_MULTI = 1U << 0,
399 
400     /**
401      * `IORING_POLL_UPDATE_EVENTS`
402      *
403      * Update existing poll request, matching sqe->addr as the old user_data field.
404      *
405      * Note: available from Linux 5.13
406      */
407     UPDATE_EVENTS = 1U << 1,
408 
409     /**
410      * `IORING_POLL_UPDATE_USER_DATA`
411      *
412      * Update existing poll request, matching sqe->addr as the old user_data field.
413      *
414      * Note: available from Linux 5.13
415      */
416     UPDATE_USER_DATA = 1U << 2,
417 }
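
/*
 * Example (sketch): arming a multishot POLL_ADD on a socket. As noted above, the POLL_ADD
 * command flags go into sqe.len, while the poll mask itself goes into the 32-bit
 * `poll_events32` field (Linux 5.9+). The socket descriptor is a placeholder.
 *
 *     void prepPollMultishot(ref SubmissionEntry e, int sock)
 *     {
 *         e.clear();
 *         e.opcode = Operation.POLL_ADD;
 *         e.fd = sock;
 *         e.poll_events32 = PollEvents.IN;    // wait for readability
 *         e.len = PollFlags.ADD_MULTI;        // keep reporting CQEs (Linux 5.13+)
 *     }
 */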
418 
419 /**
420  * Flags that can be used with the `accept4(2)` operation.
421  */
422 enum AcceptFlags : uint
423 {
424     /// Same as `accept()`
425     NONE = 0,
426 
427     /// Set the `O_NONBLOCK` file status flag on the new open file description. Using this flag saves
428     /// extra calls to `fcntl(2)` to achieve the same result.
429     NONBLOCK = 0x800, // octal 00004000
430 
431     /// Set the close-on-exec (`FD_CLOEXEC`) flag on the new file descriptor. See the description of
432     /// the `O_CLOEXEC` flag in `open(2)` for reasons why this may be useful.
433     CLOEXEC = 0x80000 // octal 02000000
434 }
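
/*
 * Example (sketch): an ACCEPT SQE (Linux 5.5+) that behaves like accept4() with SOCK_CLOEXEC.
 * `addr`/`addr2` may point to a sockaddr buffer and its length; leaving them zero corresponds
 * to passing NULL to accept4(). The listening socket is a placeholder.
 *
 *     void prepAccept(ref SubmissionEntry e, int listenFd)
 *     {
 *         e.clear();
 *         e.opcode = Operation.ACCEPT;
 *         e.fd = listenFd;
 *         e.accept_flags = AcceptFlags.CLOEXEC;   // FD_CLOEXEC on the accepted descriptor
 *     }
 */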
435 
436 /**
437  * Describes the operation to be performed
438  *
439  * See_Also: `io_uring_enter(2)`
440  */
441 enum Operation : ubyte
442 {
443     // available from Linux 5.1
444     NOP = 0,                /// IORING_OP_NOP
445     READV = 1,              /// IORING_OP_READV
446     WRITEV = 2,             /// IORING_OP_WRITEV
447     FSYNC = 3,              /// IORING_OP_FSYNC
448     READ_FIXED = 4,         /// IORING_OP_READ_FIXED
449     WRITE_FIXED = 5,        /// IORING_OP_WRITE_FIXED
450     POLL_ADD = 6,           /// IORING_OP_POLL_ADD
451     POLL_REMOVE = 7,        /// IORING_OP_POLL_REMOVE
452 
453     // available from Linux 5.2
454     SYNC_FILE_RANGE = 8,    /// IORING_OP_SYNC_FILE_RANGE
455 
456     // available from Linux 5.3
457     SENDMSG = 9,            /// IORING_OP_SENDMSG
458     RECVMSG = 10,           /// IORING_OP_RECVMSG
459 
460     // available from Linux 5.4
461     TIMEOUT = 11,           /// IORING_OP_TIMEOUT
462 
463     // available from Linux 5.5
464     TIMEOUT_REMOVE = 12,    /// IORING_OP_TIMEOUT_REMOVE
465     ACCEPT = 13,            /// IORING_OP_ACCEPT
466     ASYNC_CANCEL = 14,      /// IORING_OP_ASYNC_CANCEL
467     LINK_TIMEOUT = 15,      /// IORING_OP_LINK_TIMEOUT
468     CONNECT = 16,           /// IORING_OP_CONNECT
469 
470     // available from Linux 5.6
471     FALLOCATE = 17,         /// IORING_OP_FALLOCATE
472     OPENAT = 18,            /// IORING_OP_OPENAT
473     CLOSE = 19,             /// IORING_OP_CLOSE
474     FILES_UPDATE = 20,      /// IORING_OP_FILES_UPDATE
475     STATX = 21,             /// IORING_OP_STATX
476     READ = 22,              /// IORING_OP_READ
477     WRITE = 23,             /// IORING_OP_WRITE
478     FADVISE = 24,           /// IORING_OP_FADVISE
479     MADVISE = 25,           /// IORING_OP_MADVISE
480     SEND = 26,              /// IORING_OP_SEND
481     RECV = 27,              /// IORING_OP_RECV
482     OPENAT2 = 28,           /// IORING_OP_OPENAT2
483     EPOLL_CTL = 29,         /// IORING_OP_EPOLL_CTL
484 
485     // available from Linux 5.7
486     SPLICE = 30,            /// IORING_OP_SPLICE
487     PROVIDE_BUFFERS = 31,   /// IORING_OP_PROVIDE_BUFFERS
488     REMOVE_BUFFERS = 32,    /// IORING_OP_REMOVE_BUFFERS
489 
490     // available from Linux 5.8
491     TEE = 33,               /// IORING_OP_TEE
492 
493     // available from Linux 5.11
494     SHUTDOWN = 34,          /// IORING_OP_SHUTDOWN
495     RENAMEAT = 35,          /// IORING_OP_RENAMEAT - see renameat2()
496     UNLINKAT = 36,          /// IORING_OP_UNLINKAT - see unlinkat(2)
497 
498     // available from Linux 5.15
499     MKDIRAT = 37,           /// IORING_OP_MKDIRAT - see mkdirat(2)
500     SYMLINKAT = 38,         /// IORING_OP_SYMLINKAT - see symlinkat(2)
501     LINKAT = 39,            /// IORING_OP_LINKAT - see linkat(2)
502 }
503 
504 /// sqe->flags
505 enum SubmissionEntryFlags : ubyte
506 {
507     NONE        = 0,
508 
509     /// Use fixed fileset (`IOSQE_FIXED_FILE`)
510     ///
511     /// When this flag is specified, fd is an index into the files array registered with the
512     /// io_uring instance (see the `IORING_REGISTER_FILES` section of the io_uring_register(2) man
513     /// page).
514     FIXED_FILE  = 1U << 0,
515 
516     /**
517      * `IOSQE_IO_DRAIN`: issue after inflight IO
518      *
519      * If a request is marked with `IO_DRAIN`, then previous commands must complete before this one
520      * is issued. Subsequent requests are not started until the drain has completed.
521      *
522      * Note: available from Linux 5.2
523      */
524     IO_DRAIN    = 1U << 1,
525 
526     /**
527      * `IOSQE_IO_LINK`
528      *
529      * If set, the next SQE in the ring will depend on this SQE. A dependent SQE will not be started
530      * until the parent SQE has completed. If the parent SQE fails, then a dependent SQE will be
531      * failed without being started. Link chains can be arbitrarily long, the chain spans any new
     * SQE that continues to have the IOSQE_IO_LINK flag set. Once an SQE is encountered that does
     * not have this flag set, that defines the end of the chain. This feature allows forming
     * dependencies between individual SQEs.
535      *
536      * Note: available from Linux 5.3
537      */
538     IO_LINK     = 1U << 2,
539 
540     /**
541      * `IOSQE_IO_HARDLINK` - like LINK, but stronger
542      *
543      * Some commands will invariably end in a failure in the sense that the
544      * completion result will be less than zero. One such example is timeouts
545      * that don't have a completion count set, they will always complete with
546      * `-ETIME` unless cancelled.
547      *
548      * For linked commands, we sever links and fail the rest of the chain if
549      * the result is less than zero. Since we have commands where we know that
550      * will happen, add IOSQE_IO_HARDLINK as a stronger link that doesn't sever
551      * regardless of the completion result. Note that the link will still sever
552      * if we fail submitting the parent request, hard links are only resilient
553      * in the presence of completion results for requests that did submit
554      * correctly.
555      *
556      * Note: available from Linux 5.5
557      */
558     IO_HARDLINK = 1U << 3,
559 
560     /**
561      * `IOSQE_ASYNC`
562      *
563      * io_uring defaults to always doing inline submissions, if at all possible. But for larger
564      * copies, even if the data is fully cached, that can take a long time. Add an IOSQE_ASYNC flag
565      * that the application can set on the SQE - if set, it'll ensure that we always go async for
566      * those kinds of requests.
567      *
568      * Note: available from Linux 5.6
569      */
570     ASYNC       = 1U << 4,    /* always go async */
571 
572     /**
573      * `IOSQE_BUFFER_SELECT`
574      * If a server process has tons of pending socket connections, generally it uses epoll to wait
575      * for activity. When the socket is ready for reading (or writing), the task can select a buffer
576      * and issue a recv/send on the given fd.
577      *
     * Now that we have fast (non-async thread) support, a task can have tons of reads or writes
     * pending. But that means they need buffers to back that data, and if the number of
580      * connections is high enough, having them preallocated for all possible connections is
581      * unfeasible.
582      *
583      * With IORING_OP_PROVIDE_BUFFERS, an application can register buffers to use for any request.
584      * The request then sets IOSQE_BUFFER_SELECT in the sqe, and a given group ID in sqe->buf_group.
585      * When the fd becomes ready, a free buffer from the specified group is selected. If none are
586      * available, the request is terminated with -ENOBUFS. If successful, the CQE on completion will
587      * contain the buffer ID chosen in the cqe->flags member, encoded as:
588      *
589      * `(buffer_id << IORING_CQE_BUFFER_SHIFT) | IORING_CQE_F_BUFFER;`
590      *
591      * Once a buffer has been consumed by a request, it is no longer available and must be
592      * registered again with IORING_OP_PROVIDE_BUFFERS.
593      *
594      * Requests need to support this feature. For now, IORING_OP_READ and IORING_OP_RECV support it.
595      * This is checked on SQE submission, a CQE with res == -EOPNOTSUPP will be posted if attempted
596      * on unsupported requests.
597      *
598      * Note: available from Linux 5.7
599      */
600     BUFFER_SELECT = 1U << 5, /* select buffer from sqe->buf_group */
601 }
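
/*
 * Example (sketch): linking a write to a subsequent fsync with IOSQE_IO_LINK so the fsync only
 * starts after the write completed successfully. The two entries are assumed to be consecutive
 * slots obtained from the SQ ring; the fd and buffer are placeholders.
 *
 *     void prepWriteThenFsync(ref SubmissionEntry w, ref SubmissionEntry s, int fd, const(void)[] buf)
 *     {
 *         w.clear();
 *         w.opcode = Operation.WRITE;             // plain write (Linux 5.6+)
 *         w.fd = fd;
 *         w.addr = cast(ulong) buf.ptr;
 *         w.len = cast(uint) buf.length;
 *         w.off = 0;                              // write at offset 0
 *         w.flags = SubmissionEntryFlags.IO_LINK; // the next SQE depends on this one
 *
 *         s.clear();
 *         s.opcode = Operation.FSYNC;
 *         s.fd = fd;
 *         s.fsync_flags = FsyncFlags.NORMAL;      // full file integrity sync
 *     }
 */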
602 
603 /**
604  * IO completion data structure (Completion Queue Entry)
605  *
606  * C API: `struct io_uring_cqe`
607  */
608 struct CompletionEntry
609 {
610     ulong       user_data;  /** sqe->data submission passed back */
611     int         res;        /** result code for this event */
612     CQEFlags    flags;
613 }
614 
615 /// Flags used with `CompletionEntry`
616 enum CQEFlags : uint
617 {
618     NONE = 0, /// No flags set
619 
620     /// `IORING_CQE_F_BUFFER` (from Linux 5.7)
621     /// If set, the upper 16 bits are the buffer ID
622     BUFFER = 1U << 0,
623 
624     /// `IORING_CQE_F_MORE` (from Linux 5.13)
625     /// If set, parent SQE will generate more CQE entries
626     MORE = 1U << 1,
627 }
628 
629 enum {
630     CQE_BUFFER_SHIFT = 16, /// Note: available from Linux 5.7
631 }
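
/*
 * Example (sketch): decoding the selected buffer ID from a completion when IOSQE_BUFFER_SELECT
 * was used. As described above, the buffer ID lives in the upper 16 bits of the CQE flags.
 *
 *     bool selectedBuffer(ref const CompletionEntry cqe, out ushort bufferId)
 *     {
 *         if (!(cqe.flags & CQEFlags.BUFFER)) return false;
 *         bufferId = cast(ushort)(cqe.flags >> CQE_BUFFER_SHIFT);
 *         return true;
 *     }
 */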
632 
633 /**
634  * Passed in for io_uring_setup(2). Copied back with updated info on success.
635  *
636  * C API: `struct io_uring_params`
637  */
638 struct SetupParameters
639 {
640     // Magic offsets for the application to mmap the data it needs
641 
642     /// `IORING_OFF_SQ_RING`: mmap offset for submission queue ring
643     enum ulong SUBMISSION_QUEUE_RING_OFFSET = 0UL;
644     /// `IORING_OFF_CQ_RING`: mmap offset for completion queue ring
645     enum ulong COMPLETION_QUEUE_RING_OFFSET = 0x8000000UL;
646     /// `IORING_OFF_SQES`: mmap offset for submission entries
647     enum ulong SUBMISSION_QUEUE_ENTRIES_OFFSET = 0x10000000UL;
648 
649     /// (output) allocated entries in submission queue
650     /// (both ring index `array` and separate entry array at `SUBMISSION_QUEUE_ENTRIES_OFFSET`).
651     uint                        sq_entries;
652 
653     /// (output) allocated entries in completion queue
654     uint                        cq_entries;
655 
656     SetupFlags                  flags;          /// (input)
657 
658     /// (input) used if SQ_AFF and SQPOLL flags are active to pin poll thread to specific cpu.
659     /// right now always checked in kernel for "possible cpu".
660     uint                        sq_thread_cpu;
661 
662     /// (input) used if SQPOLL flag is active; timeout in milliseconds
663     /// until kernel poll thread goes to sleep.
664     uint                        sq_thread_idle;
665     SetupFeatures               features;       /// (from Linux 5.4)
666     uint                        wq_fd;          /// (from Linux 5.6)
667     private uint[3]             resv;           // reserved
668     SubmissionQueueRingOffsets  sq_off;         /// (output) submission queue ring data field offsets
669     CompletionQueueRingOffsets  cq_off;         /// (output) completion queue ring data field offsets
670 }
671 
672 /// `io_uring_setup()` flags
673 enum SetupFlags : uint
674 {
675     /// No flags set
676     NONE    = 0,
677 
678     /**
679      * `IORING_SETUP_IOPOLL`
680      *
681      * Perform busy-waiting for an I/O completion, as opposed to getting notifications via an
682      * asynchronous IRQ (Interrupt Request).  The file system (if any) and block device must
683      * support polling in order for  this  to  work. Busy-waiting  provides  lower latency, but may
684      * consume more CPU resources than interrupt driven I/O.  Currently, this feature is usable
685      * only on a file descriptor opened using the O_DIRECT flag.  When a read or write is submitted
686      * to a polled context, the application must poll for completions on the CQ ring by calling
687      * io_uring_enter(2).  It is illegal to mix and match polled and non-polled I/O on an io_uring
688      * instance.
689      */
690     IOPOLL  = 1U << 0,
691 
692     /**
693      * `IORING_SETUP_SQPOLL`
694      *
695      * When this flag is specified, a kernel thread is created to perform submission queue polling.
696      * An io_uring instance configured in this way enables an application to issue I/O without ever
697      * context switching into the kernel.
698      * By using the submission queue to fill in new submission queue entries and watching for
699      * completions on the completion queue, the application can submit and reap I/Os without doing
700      * a single system call.
     * If the kernel thread is idle for more than sq_thread_idle milliseconds, it will set the
702      * IORING_SQ_NEED_WAKEUP bit in the flags field of the struct io_sq_ring. When this happens,
703      * the application must call io_uring_enter(2) to wake the kernel thread. If I/O is kept busy,
704      * the kernel thread will never sleep. An application making use of this feature will need to
705      * guard the io_uring_enter(2) call with  the  following  code sequence:
706      *
     *     ```
708      *     // Ensure that the wakeup flag is read after the tail pointer has been written.
709      *     smp_mb();
710      *     if (*sq_ring->flags & IORING_SQ_NEED_WAKEUP)
711      *         io_uring_enter(fd, 0, 0, IORING_ENTER_SQ_WAKEUP);
712      *     ```
713      *
714      * where sq_ring is a submission queue ring setup using the struct io_sqring_offsets described below.
715      *
716      * To  successfully  use this feature, the application must register a set of files to be used for
717      * IO through io_uring_register(2) using the IORING_REGISTER_FILES opcode. Failure to do so will
718      * result in submitted IO being errored with EBADF.
719      */
720     SQPOLL  = 1U << 1,
721 
722     /**
723      * `IORING_SETUP_SQ_AFF`
724      *
725      *  If this flag is specified, then the poll thread will be bound to the cpu set in the
726      *  sq_thread_cpu field of the struct io_uring_params.  This flag is only meaningful when
727      *  IORING_SETUP_SQPOLL is specified.
728      */
729     SQ_AFF  = 1U << 2,
730 
731     /**
732      * `IORING_SETUP_CQSIZE`
733      *
734      * Create the completion queue with struct io_uring_params.cq_entries entries.  The value must
735      * be greater than entries, and may be rounded up to the next power-of-two.
736      *
737      * Note: Available from Linux 5.5
738      */
739     CQSIZE  = 1U << 3,
740 
741     /**
742      * `IORING_SETUP_CLAMP`
743      *
744      * Some applications like to start small in terms of ring size, and then ramp up as needed. This
745      * is a bit tricky to do currently, since we don't advertise the max ring size.
746      *
747      * This adds IORING_SETUP_CLAMP. If set, and the values for SQ or CQ ring size exceed what we
748      * support, then clamp them at the max values instead of returning -EINVAL. Since we return the
749      * chosen ring sizes after setup, no further changes are needed on the application side.
750      * io_uring already changes the ring sizes if the application doesn't ask for power-of-two
751      * sizes, for example.
752      *
753      * Note: Available from Linux 5.6
754      */
755     CLAMP   = 1U << 4, /* clamp SQ/CQ ring sizes */
756 
757     /**
758      * `IORING_SETUP_ATTACH_WQ`
759      *
760      * If IORING_SETUP_ATTACH_WQ is set, it expects wq_fd in io_uring_params to be a valid io_uring
     * fd, the io-wq of which will be shared with the newly created io_uring instance. If the flag is set
762      * but it can't share io-wq, it fails.
763      *
764      * This allows creation of "sibling" io_urings, where we prefer to keep the SQ/CQ private, but
765      * want to share the async backend to minimize the amount of overhead associated with having
766      * multiple rings that belong to the same backend.
767      *
768      * Note: Available from Linux 5.6
769      */
770     ATTACH_WQ = 1U << 5, /* attach to existing wq */
771 
772     /**
773      * `IORING_SETUP_R_DISABLED` flag to start the rings disabled, allowing the user to register
774      * restrictions, buffers, files, before to start processing SQEs.
775      *
776      * When `IORING_SETUP_R_DISABLED` is set, SQE are not processed and SQPOLL kthread is not started.
777      *
     * Restriction registration is allowed only while the rings are disabled, to prevent
     * concurrency issues while processing SQEs.
780      *
781      * The rings can be enabled using `IORING_REGISTER_ENABLE_RINGS` opcode with io_uring_register(2).
782      *
783      * Note: Available from Linux 5.10
784      */
785     R_DISABLED = 1U << 6, /* start with ring disabled */
786 }
787 
788 /// `io_uring_params->features` flags
789 enum SetupFeatures : uint
790 {
791     NONE            = 0,
792 
793     /**
794      * `IORING_FEAT_SINGLE_MMAP` (from Linux 5.4)
795      *
796      * Indicates that we can use single mmap feature to map both sq and cq rings and so to avoid the
797      * second mmap.
798      */
799     SINGLE_MMAP     = 1U << 0,
800 
801     /**
802      * `IORING_FEAT_NODROP` (from Linux 5.5)
803      *
804      * Currently we drop completion events, if the CQ ring is full. That's fine
805      * for requests with bounded completion times, but it may make it harder or
806      * impossible to use io_uring with networked IO where request completion
807      * times are generally unbounded. Or with POLL, for example, which is also
808      * unbounded.
809      *
810      * After this patch, we never overflow the ring, we simply store requests
811      * in a backlog for later flushing. This flushing is done automatically by
812      * the kernel. To prevent the backlog from growing indefinitely, if the
813      * backlog is non-empty, we apply back pressure on IO submissions. Any
814      * attempt to submit new IO with a non-empty backlog will get an -EBUSY
815      * return from the kernel. This is a signal to the application that it has
816      * backlogged CQ events, and that it must reap those before being allowed
817      * to submit more IO.
818      *
819      * Note that if we do return -EBUSY, we will have filled whatever
820      * backlogged events into the CQ ring first, if there's room. This means
821      * the application can safely reap events WITHOUT entering the kernel and
822      * waiting for them, they are already available in the CQ ring.
823      */
824     NODROP          = 1U << 1,
825 
826     /**
827      * `IORING_FEAT_SUBMIT_STABLE` (from Linux 5.5)
828      *
829      * If this flag is set, applications can be certain that any data for async offload has been
830      * consumed when the kernel has consumed the SQE.
831      */
832     SUBMIT_STABLE   = 1U << 2,
833 
834     /**
835      * `IORING_FEAT_RW_CUR_POS` (from Linux 5.6)
836      *
837      * If this flag is set, applications can know if setting `-1` as file offsets (meaning to work
838      * with current file position) is supported.
839      */
840     RW_CUR_POS = 1U << 3,
841 
842     /**
843      * `IORING_FEAT_CUR_PERSONALITY` (from Linux 5.6)
844      *
845      * We currently setup the io_wq with a static set of mm and creds. Even for a single-use io-wq
     * per io_uring, this is suboptimal as we may have multiple enters of the ring. For
847      * sharing the io-wq backend, it doesn't work at all.
848      *
849      * Switch to passing in the creds and mm when the work item is setup. This means that async
850      * work is no longer deferred to the io_uring mm and creds, it is done with the current mm and
851      * creds.
852      *
853      * Flag this behavior with IORING_FEAT_CUR_PERSONALITY, so applications know they can rely on
854      * the current personality (mm and creds) being the same for direct issue and async issue.
855      */
856     CUR_PERSONALITY = 1U << 4,
857 
858     /**
859      * `IORING_FEAT_FAST_POLL` (from Linux 5.7)
860      *
861      * Currently io_uring tries any request in a non-blocking manner, if it can, and then retries
862      * from a worker thread if we get -EAGAIN. Now that we have a new and fancy poll based retry
863      * backend, use that to retry requests if the file supports it.
864      *
865      * This means that, for example, an IORING_OP_RECVMSG on a socket no longer requires an async
866      * thread to complete the IO. If we get -EAGAIN reading from the socket in a non-blocking
867      * manner, we arm a poll handler for notification on when the socket becomes readable. When it
868      * does, the pending read is executed directly by the task again, through the io_uring task
869      * work handlers. Not only is this faster and more efficient, it also means we're not
870      * generating potentially tons of async threads that just sit and block, waiting for the IO to
871      * complete.
872      *
873      * The feature is marked with IORING_FEAT_FAST_POLL, meaning that async pollable IO is fast,
874      * and that poll<link>other_op is fast as well.
875      */
876     FAST_POLL = 1U << 5,
877 
878     /**
879      * `IORING_FEAT_POLL_32BITS` (from Linux 5.9)
880      *
881      * Poll events should be 32-bits to cover EPOLLEXCLUSIVE.
882      * Explicit word-swap the poll32_events for big endian to make sure the ABI is not changed.  We
883      * call this feature IORING_FEAT_POLL_32BITS, applications who want to use EPOLLEXCLUSIVE should
884      * check the feature bit first.
885      */
886     POLL_32BITS = 1U << 6,
887 
888     /**
889      * `IORING_FEAT_SQPOLL_NONFIXED` (from Linux 5.11)
890      *
891      * The restriction of needing fixed files for SQPOLL is problematic, and prevents/inhibits
     * several valid use cases. With the referenced files_struct that we have now, it's trivially
893      * supportable.
894      *
895      * Treat ->files like we do the mm for the SQPOLL thread - grab a reference to it (and assign
896      * it), and drop it when we're done.
897      *
898      * This feature is exposed as IORING_FEAT_SQPOLL_NONFIXED.
899      */
900     SQPOLL_NONFIXED = 1U << 7,
901 
902     /**
903      * `IORING_FEAT_EXT_ARG` (from Linux 5.11)
904      *
     * Supports adding a timeout to the existing `io_uring_enter()` call.
906      */
907     EXT_ARG = 1U << 8,
908 
909     /// `IORING_FEAT_NATIVE_WORKERS	(1U << 9)` (from Linux 5.12)
910     NATIVE_WORKERS = 1U << 9,
911 
    /// `IORING_FEAT_RSRC_TAGS (1U << 10)` (from Linux 5.13)
913     RSRC_TAGS = 1U << 10,
914 }
915 
916 /**
 * Filled with the offsets for mmap(2)
918  *
919  * C API: `struct io_sqring_offsets`
920  */
921 struct SubmissionQueueRingOffsets
922 {
923     /// Incremented by kernel after entry at `head` was processed.
924     /// Pending submissions: [head..tail]
925     uint head;
926 
927     /// Modified by user space when new entry was queued; points to next
928     /// entry user space is going to fill.
929     uint tail;
930 
    /// value `value_at(ring_entries) - 1`
932     /// mask for indices at `head` and `tail` (don't delete masked bits!
933     /// `head` and `tail` can point to the same entry, but if they are
934     /// not exactly equal it implies the ring is full, and if they are
935     /// exactly equal the ring is empty.)
936     uint ring_mask;
937 
938     /// value same as SetupParameters.sq_entries, power of 2.
939     uint ring_entries;
940 
941     /// SubmissionQueueFlags
942     SubmissionQueueFlags flags;
943 
944     /// number of (invalid) entries that were dropped; entries are
945     /// invalid if their index (in `array`) is out of bounds.
946     uint dropped;
947 
948     /// index into array of `SubmissionEntry`s at offset `SUBMISSION_QUEUE_ENTRIES_OFFSET` in mmap()
949     uint array;
950 
951     private uint[3] resv; // reserved
952 }
953 
954 enum SubmissionQueueFlags: uint
955 {
956     NONE        = 0,
957 
958     /// `IORING_SQ_NEED_WAKEUP`: needs io_uring_enter wakeup
959     /// set by kernel poll thread when it goes sleeping, and reset on wakeup
960     NEED_WAKEUP = 1U << 0,
961 
962     /// `IORING_SQ_CQ_OVERFLOW`: CQ ring is overflown
963     /// Since Kernel 5.8
    /// Applications that are not willing to use io_uring_enter() to reap and handle
    /// cqes may completely rely on liburing's io_uring_peek_cqe(), but if the cq ring has
966     /// overflowed, currently because io_uring_peek_cqe() is not aware of this overflow, it won't
967     /// enter kernel to flush cqes.
968     /// To fix this issue, export cq overflow status to userspace by adding new
969     /// IORING_SQ_CQ_OVERFLOW flag, then helper functions() in liburing, such as io_uring_peek_cqe,
970     /// can be aware of this cq overflow and do flush accordingly.
971     CQ_OVERFLOW = 1U << 1
972 }
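
/*
 * Example (sketch): the SQPOLL wakeup check described under `SetupFlags.SQPOLL`, given a
 * pointer to the ring's flags word (the SQ ring mapping plus `SubmissionQueueRingOffsets.flags`).
 *
 *     void wakeupIfNeeded(int ringFd, const(shared(uint))* sqFlags)
 *     {
 *         import core.atomic : atomicLoad;
 *         if (atomicLoad(*sqFlags) & SubmissionQueueFlags.NEED_WAKEUP)
 *             io_uring_enter(ringFd, 0, 0, EnterFlags.SQ_WAKEUP);
 *     }
 */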
973 
974 /**
 * Field offsets used to map the kernel structure to ours.
976  *
977  * C API: `struct io_cqring_offsets`
978  */
979 struct CompletionQueueRingOffsets
980 {
981     /// incremented by user space after entry at `head` was processed.
982     /// available entries for processing: [head..tail]
983     uint head;
984 
985     /// modified by kernel when new entry was created; points to next
986     /// entry kernel is going to fill.
987     uint tail;
988 
989     /// value `value_at(ring_entries) - 1`
990     /// mask for indices at `head` and `tail` (don't delete masked bits!
991     /// `head` and `tail` can point to the same entry, but if they are
992     /// not exactly equal it implies the ring is full, and if they are
993     /// exactly equal the ring is empty.)
994     uint ring_mask;
995 
996     /// value same as SetupParameters.cq_entries, power of 2.
997     uint ring_entries;
998 
999     /// incremented by the kernel every time it failed to queue a
1000     /// completion event because the ring was full.
1001     uint overflow;
1002 
1003     /// Offset to array of completion queue entries
1004     uint cqes;
1005 
1006     CQRingFlags flags;             /// (available from Linux 5.8)
1007     private uint _resv1;
1008     private ulong _resv2;
1009 }
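
/*
 * Example (sketch): draining available completions, given pointers computed from the offsets
 * above (head, tail, ring_mask and cqes added to the CQ ring mapping). Production code should
 * use acquire/release ordering when loading `tail` and publishing the new `head`.
 *
 *     void drainCompletions(shared(uint)* head, const(shared(uint))* tail, uint ringMask,
 *         const(CompletionEntry)* cqes, scope void delegate(ref const CompletionEntry) handle)
 *     {
 *         import core.atomic : atomicLoad, atomicStore;
 *         uint h = atomicLoad(*head);
 *         immutable t = atomicLoad(*tail);
 *         while (h != t)
 *         {
 *             handle(cqes[h & ringMask]);     // entries are indexed through the mask
 *             ++h;
 *         }
 *         atomicStore(*head, h);              // tell the kernel these entries were consumed
 *     }
 */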
1010 
1011 /// CompletionQueue ring flags
1012 enum CQRingFlags : uint
1013 {
1014     NONE = 0, /// No flags set
1015 
1016     /// `IORING_CQ_EVENTFD_DISABLED` disable eventfd notifications (available from Linux 5.8)
1017     /// This new flag should be set/clear from the application to disable/enable eventfd notifications when a request is completed and queued to the CQ ring.
1018     ///
1019     /// Before this patch, notifications were always sent if an eventfd is registered, so IORING_CQ_EVENTFD_DISABLED is not set during the initialization.
1020     /// It will be up to the application to set the flag after initialization if no notifications are required at the beginning.
1021     EVENTFD_DISABLED = 1U << 0,
1022 }
1023 
1024 /// io_uring_register(2) opcodes and arguments
1025 enum RegisterOpCode : uint
1026 {
1027     /**
1028      * `arg` points to a struct iovec array of nr_args entries.  The buffers associated with the
1029      * iovecs will be locked in memory and charged against the user's RLIMIT_MEMLOCK resource limit.
     * See getrlimit(2) for more information. Additionally, there is a size limit of 1GiB per
1031      * buffer.  Currently, the buffers must be anonymous, non-file-backed memory, such as that
1032      * returned by malloc(3) or mmap(2) with the MAP_ANONYMOUS flag set.  It is expected that this
1033      * limitation will be lifted in the future. Huge pages are supported as well. Note that the
1034      * entire huge page will be pinned in the kernel, even if only a portion of it is used.
1035      *
1036      * After a successful call, the supplied buffers are mapped into the kernel and eligible for
1037      * I/O.  To make use of them, the application must specify the IORING_OP_READ_FIXED or
     * IORING_OP_WRITE_FIXED opcodes in the submission queue entry (see the struct io_uring_sqe
1039      * definition in io_uring_enter(2)), and set the buf_index field to the desired buffer index.
1040      * The memory range described by the submission queue entry's addr and len fields must fall
1041      * within the indexed buffer.
1042      *
     * It is perfectly valid to set up a large buffer and then only use part of it for an I/O, as
1044      * long as the range is within the originally mapped region.
1045      *
1046      * An application can increase or decrease the size or number of registered buffers by first
1047      * unregistering the existing buffers, and then issuing a new call to io_uring_register() with
1048      * the new buffers.
1049      *
1050      * An application need not unregister buffers explicitly before shutting down the io_uring
1051      * instance.
1052      *
1053      * `IORING_REGISTER_BUFFERS`
1054      */
1055     REGISTER_BUFFERS        = 0,
1056 
1057     /**
1058      * This operation takes no argument, and `arg` must be passed as NULL. All previously registered
1059      * buffers associated with the io_uring instance will be released.
1060      *
1061      * `IORING_UNREGISTER_BUFFERS`
1062      */
1063     UNREGISTER_BUFFERS      = 1,
1064 
1065     /**
1066      * Register files for I/O. `arg` contains a pointer to an array of `nr_args` file descriptors
1067      * (signed 32 bit integers).
1068      *
1069      * To make use of the registered files, the IOSQE_FIXED_FILE flag must be set in the flags
1070      * member of the struct io_uring_sqe, and the fd member is set to the index of the file in the
1071      * file descriptor array.
1072      *
1073      * Files are automatically unregistered when the io_uring instance is torn down. An application
1074      * need only unregister if it wishes to register a new set of fds.
1075      *
1076      * `IORING_REGISTER_FILES`
1077      */
1078     REGISTER_FILES          = 2,
1079 
1080     /**
1081      * This operation requires no argument, and `arg` must be passed as NULL.  All previously
1082      * registered files associated with the io_uring instance will be unregistered.
1083      *
1084      * `IORING_UNREGISTER_FILES`
1085      */
1086     UNREGISTER_FILES        = 3,
1087 
1088     /**
1089      * `IORING_REGISTER_EVENTFD`
1090      *
1091      * Registers eventfd that would be used to notify about completions on io_uring itself.
1092      *
1093      * Note: available from Linux 5.2
1094      */
1095     REGISTER_EVENTFD        = 4,
1096 
1097     /**
1098      * `IORING_UNREGISTER_EVENTFD`
1099      *
1100      * Unregisters previously registered eventfd.
1101      *
1102      * Note: available from Linux 5.2
1103      */
1104     UNREGISTER_EVENTFD      = 5,
1105 
1106     /// `IORING_REGISTER_FILES_UPDATE` (from Linux 5.5)
1107     REGISTER_FILES_UPDATE   = 6,
1108 
1109     /**
1110      * `IORING_REGISTER_EVENTFD_ASYNC` (from Linux 5.6)
1111      *
1112      * If an application is using eventfd notifications with poll to know when new SQEs can be
1113      * issued, it's expecting the following read/writes to complete inline. And with that, it knows
     * that there are events available, and doesn't want spurious wakeups on the eventfd for those
1115      * requests.
1116      *
1117      * This adds IORING_REGISTER_EVENTFD_ASYNC, which works just like IORING_REGISTER_EVENTFD,
1118      * except it only triggers notifications for events that happen from async completions (IRQ, or
1119      * io-wq worker completions). Any completions inline from the submission itself will not
1120      * trigger notifications.
1121      */
1122     REGISTER_EVENTFD_ASYNC = 7,
1123 
1124     /**
1125      * `IORING_REGISTER_PROBE` (from Linux 5.6)
1126      *
1127      * The application currently has no way of knowing if a given opcode is supported or not
1128      * without having to try and issue one and see if we get -EINVAL or not. And even this approach
1129      * is fraught with peril, as maybe we're getting -EINVAL due to some fields being missing, or
1130      * maybe it's just not that easy to issue that particular command without doing some other leg
1131      * work in terms of setup first.
1132      *
1133      * This adds IORING_REGISTER_PROBE, which fills in a structure with info on what it supported
1134      * or not. This will work even with sparse opcode fields, which may happen in the future or
1135      * even today if someone backports specific features to older kernels.
1136      */
1137     REGISTER_PROBE = 8,
1138 
1139     /**
1140      * `IORING_REGISTER_PERSONALITY` (from Linux 5.6)
1141      *
1142      * If an application wants to use a ring with different kinds of credentials, it can register
     * them upfront. We don't look up credentials; the credentials of the task calling
     * IORING_REGISTER_PERSONALITY are used.
1145      *
1146      * An 'id' is returned for the application to use in subsequent personality support.
1147      */
1148     REGISTER_PERSONALITY = 9,
1149 
1150     /// `IORING_UNREGISTER_PERSONALITY` (from Linux 5.6)
1151     UNREGISTER_PERSONALITY = 10,
1152 
1153     /**
1154      * `IORING_REGISTER_RESTRICTIONS` (from Linux 5.10)
1155      *
1156      * Permanently installs a feature allowlist on an io_ring_ctx. The io_ring_ctx can then be
1157      * passed to untrusted code with the knowledge that only operations present in the allowlist can
1158      * be executed.
1159      *
1160      * The allowlist approach ensures that new features added to io_uring do not accidentally become
1161      * available when an existing application is launched on a newer kernel version.
1162      *
1163      * Currently it's possible to restrict sqe opcodes, sqe flags, and register opcodes.
1164      *
     * `IORING_REGISTER_RESTRICTIONS` can only be made once. Afterwards it is not possible to
1166      * change restrictions anymore. This prevents untrusted code from removing restrictions.
1167      */
1168     REGISTER_RESTRICTIONS = 11,
1169 
1170     /**
1171      *`IORING_REGISTER_ENABLE_RINGS` (from Linux 5.10)
1172      *
1173      * This operation is to be used when rings are disabled on start with `IORING_SETUP_R_DISABLED`.
1174      */
1175     ENABLE_RINGS = 12,
1176 
1177     /**
1178      * `IORING_REGISTER_FILES2` (from Linux 5.13)
1179      */
1180     REGISTER_FILES2 = 13,
1181 
1182     /**
1183      * `IORING_REGISTER_FILES_UPDATE2` (from Linux 5.13)
1184      */
1185     REGISTER_FILES_UPDATE2 = 14,
1186 
1187     /**
1188      * `IORING_REGISTER_BUFFERS2` (from Linux 5.13)
1189      */
1190     REGISTER_BUFFERS2 = 15,
1191 
1192     /**
1193      * `IORING_REGISTER_BUFFERS_UPDATE` (from Linux 5.13)
1194      */
1195     REGISTER_BUFFERS_UPDATE = 16,
1196 
1197     /* set/clear io-wq thread affinities */
1198     /// `IORING_REGISTER_IOWQ_AFF` (from Linux 5.14)
1199     REGISTER_IOWQ_AFF        = 17,
1200 
1201     /// `IORING_UNREGISTER_IOWQ_AFF` (from Linux 5.14)
1202     UNREGISTER_IOWQ_AFF      = 18,
1203 
1204     /// `IORING_REGISTER_IOWQ_MAX_WORKERS` (from Linux 5.15)
1205     /// set/get max number of io-wq workers
1206     REGISTER_IOWQ_MAX_WORKERS = 19,
1207 }
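
/*
 * Example (sketch): registering one fixed buffer for use with READ_FIXED/WRITE_FIXED. The
 * malloc'd region is a placeholder; as documented above it must be anonymous memory and is
 * charged against RLIMIT_MEMLOCK.
 *
 *     import core.stdc.stdlib : malloc;
 *     import core.sys.posix.sys.uio : iovec;
 *
 *     int registerOneBuffer(int ringFd, size_t size)
 *     {
 *         iovec[1] iov;
 *         iov[0].iov_base = malloc(size);
 *         iov[0].iov_len = size;
 *         return io_uring_register(ringFd, RegisterOpCode.REGISTER_BUFFERS, iov.ptr, 1);
 *     }
 */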
1208 
1209 /* io-wq worker categories */
1210 enum IOWQCategory
1211 {
1212     BOUND, /// `IO_WQ_BOUND`
1213     UNBOUND, /// `IO_WQ_UNBOUND`
1214 }
1215 
1216 /// io_uring_enter(2) flags
1217 enum EnterFlags: uint
1218 {
1219     NONE        = 0,
1220     GETEVENTS   = 1U << 0, /// `IORING_ENTER_GETEVENTS`
1221     SQ_WAKEUP   = 1U << 1, /// `IORING_ENTER_SQ_WAKEUP`
1222 
1223     /**
1224      * `IORING_ENTER_SQ_WAIT` (from Linux 5.10)
1225      *
1226      * When using SQPOLL, applications can run into the issue of running out of SQ ring entries
     * because the thread hasn't consumed them yet. The only option for dealing with that is
     * checking later, or busy checking for the condition. Setting this flag makes
     * `io_uring_enter(2)` wait until space becomes available in the SQ ring.
1229      */
1230     SQ_WAIT     = 1U << 2,
1231 
1232     /**
1233      * `IORING_ENTER_EXT_ARG` (from Linux 5.11)
1234      *
1235      * Adds support for timeout to existing io_uring_enter() function.
1236      */
1237     EXT_ARG     = 1U << 3,
1238 }
1239 
1240 /// Time specification as defined in kernel headers (used by TIMEOUT operations)
1241 struct KernelTimespec
1242 {
1243     long tv_sec; /// seconds
1244     long tv_nsec; /// nanoseconds
1245 }
1246 
1247 static assert(CompletionEntry.sizeof == 16);
1248 static assert(CompletionQueueRingOffsets.sizeof == 40);
1249 static assert(SetupParameters.sizeof == 120);
1250 static assert(SubmissionEntry.sizeof == 64);
1251 static assert(SubmissionQueueRingOffsets.sizeof == 40);
1252 
1253 /// Indicating that OP is supported by the kernel
1254 enum IO_URING_OP_SUPPORTED = 1U << 0;
1255 
1256 /**
1257  * Skip updating fd indexes set to this value in the fd table
1258  *
1259  * Support for skipping a file descriptor when using `IORING_REGISTER_FILES_UPDATE`.
1260  * `__io_sqe_files_update` will skip fds set to `IORING_REGISTER_FILES_SKIP`
1261  *
1262  * Note: Available from Linux 5.12
1263  */
1264 enum IORING_REGISTER_FILES_SKIP = -2;
1265 
1266 struct io_uring_probe_op
1267 {
1268     ubyte op;
1269     ubyte resv;
1270     ushort flags; /* IO_URING_OP_* flags */
1271     uint resv2;
1272 }
1273 
1274 struct io_uring_probe
1275 {
1276     ubyte last_op; /* last opcode supported */
1277     ubyte ops_len; /* length of ops[] array below */
1278     ushort resv;
1279     uint[3] resv2;
1280     io_uring_probe_op[0] ops;
1281 }
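
/*
 * Example (sketch): probing which opcodes the running kernel supports via REGISTER_PROBE
 * (Linux 5.6+). The probe header is followed by `nr_args` op entries, so the whole thing is
 * allocated as one flat block.
 *
 *     import core.stdc.stdlib : calloc, free;
 *
 *     bool opSupported(int ringFd, Operation op)
 *     {
 *         enum nr = Operation.max + 1;
 *         auto probe = cast(io_uring_probe*) calloc(1,
 *             io_uring_probe.sizeof + nr * io_uring_probe_op.sizeof);
 *         scope (exit) free(probe);
 *         if (io_uring_register(ringFd, RegisterOpCode.REGISTER_PROBE, probe, nr) < 0)
 *             return false;
 *         auto ops = probe.ops.ptr;           // trailing array right after the header
 *         return op <= probe.last_op && (ops[op].flags & IO_URING_OP_SUPPORTED) != 0;
 *     }
 */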
1282 
1283 struct io_uring_restriction
1284 {
1285     RestrictionOp opcode;
1286     union
1287     {
1288         ubyte register_op; /// IORING_RESTRICTION_REGISTER_OP
1289         ubyte sqe_op;      /// IORING_RESTRICTION_SQE_OP
1290         ubyte sqe_flags;   /// IORING_RESTRICTION_SQE_FLAGS_*
1291     }
1292     ubyte resv;
1293     uint[3] resv2;
1294 }
1295 
1296 /**
1297  * io_uring_restriction->opcode values
1298  */
1299 enum RestrictionOp : ushort
1300 {
1301     /// Allow an io_uring_register(2) opcode
1302     IORING_RESTRICTION_REGISTER_OP          = 0,
1303 
1304     /// Allow an sqe opcode
1305     IORING_RESTRICTION_SQE_OP               = 1,
1306 
1307     /// Allow sqe flags
1308     IORING_RESTRICTION_SQE_FLAGS_ALLOWED    = 2,
1309 
1310     /// Require sqe flags (these flags must be set on each submission)
1311     IORING_RESTRICTION_SQE_FLAGS_REQUIRED   = 3,
1312 }
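
/*
 * Example (sketch): restricting a ring that was created with `SetupFlags.R_DISABLED` to the
 * READ and WRITE opcodes only, then enabling it. Error handling is omitted.
 *
 *     void restrictToReadWrite(int ringFd)
 *     {
 *         io_uring_restriction[2] res;
 *         res[0].opcode = RestrictionOp.IORING_RESTRICTION_SQE_OP;
 *         res[0].sqe_op = Operation.READ;
 *         res[1].opcode = RestrictionOp.IORING_RESTRICTION_SQE_OP;
 *         res[1].sqe_op = Operation.WRITE;
 *
 *         io_uring_register(ringFd, RegisterOpCode.REGISTER_RESTRICTIONS, res.ptr, 2);
 *         io_uring_register(ringFd, RegisterOpCode.ENABLE_RINGS, null, 0);
 *     }
 */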
1313 
1314 struct io_uring_getevents_arg
1315 {
1316     ulong   sigmask;
1317     uint    sigmask_sz;
1318     uint    pad;
1319     ulong   ts;
1320 }
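
/*
 * Example (sketch): waiting for one completion with a timeout using the extended-argument form
 * of `io_uring_enter()` (Linux 5.11+, see `SetupFeatures.EXT_ARG`). No signal mask is passed,
 * so `sigmask`/`sigmask_sz` stay zero.
 *
 *     int waitWithTimeout(int ringFd, ref KernelTimespec ts)
 *     {
 *         io_uring_getevents_arg arg;
 *         arg.ts = cast(ulong) &ts;           // pointer to the timeout, valid for the call
 *         return io_uring_enter(ringFd, 0, 1,
 *             cast(EnterFlags)(EnterFlags.GETEVENTS | EnterFlags.EXT_ARG), &arg);
 *     }
 */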
1321 
1322 /**
1323  * Setup a context for performing asynchronous I/O.
1324  *
1325  * The `io_uring_setup()` system call sets up a submission queue (SQ) and completion queue (CQ) with
1326  * at least entries entries, and returns a file descriptor which can be used to perform subsequent
1327  * operations on the io_uring instance. The submission and completion queues are shared between
1328  * userspace and the kernel, which eliminates the need to copy data when initiating and completing
1329  * I/O.
1330  *
1331  * See_Also: `io_uring_setup(2)`
1332  *
1333  * Params:
 *   entries = Defines how many entries the submission queue can hold.
1335  *   p = `SetupParameters`
1336  *
1337  * Returns:
1338  *     `io_uring_setup(2)` returns a new file descriptor on success. The application may then provide
1339  *     the file descriptor in a subsequent `mmap(2)` call to map the submission and completion queues,
1340  *     or to the `io_uring_register(2)` or `io_uring_enter(2)` system calls.
1341  *
1342  *     On error, -1 is returned and `errno` is set appropriately.
1343  */
1344 int io_uring_setup(uint entries, scope ref SetupParameters p) @trusted
1345 {
1346     pragma(inline);
1347     return syscall(SYS_io_uring_setup, entries, &p);
1348 }
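
/*
 * Example (sketch): creating a ring and mapping the SQ ring area. Error handling is reduced to
 * asserts; a complete implementation also maps the CQ ring (or reuses this mapping when
 * `SetupFeatures.SINGLE_MMAP` is reported) and the SQE array at `SUBMISSION_QUEUE_ENTRIES_OFFSET`.
 *
 *     import core.sys.posix.sys.mman : mmap, MAP_FAILED, MAP_SHARED, PROT_READ, PROT_WRITE;
 *
 *     void setupExample()
 *     {
 *         SetupParameters params;
 *         immutable fd = io_uring_setup(64, params);      // room for 64 SQEs
 *         assert(fd >= 0);
 *
 *         // The SQ ring mapping covers the header fields plus the `array` of SQE indices.
 *         immutable sqLen = params.sq_off.array + params.sq_entries * uint.sizeof;
 *         auto mem = mmap(null, sqLen, PROT_READ | PROT_WRITE, MAP_SHARED, fd,
 *             cast(long) SetupParameters.SUBMISSION_QUEUE_RING_OFFSET);
 *         assert(mem !is MAP_FAILED);
 *
 *         // Individual ring fields are located by adding the reported offsets.
 *         auto ring = cast(ubyte*) mem;
 *         auto sqHead = cast(shared uint*)(ring + params.sq_off.head);
 *         auto sqTail = cast(shared uint*)(ring + params.sq_off.tail);
 *     }
 */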
1349 
1350 /**
1351  * Initiate and/or complete asynchronous I/O
1352  *
1353  * `io_uring_enter()` is used to initiate and complete I/O using the shared submission and
1354  * completion queues setup by a call to `io_uring_setup(2)`. A single call can both submit new I/O
 * and wait for completions of I/O initiated by this call or previous calls to `io_uring_enter()`.
1356  *
 * When the system call returns that a certain number of SQEs have been consumed and submitted, it's
1358  * safe to reuse SQE entries in the ring. This is true even if the actual IO submission had to be
1359  * punted to async context, which means that the SQE may in fact not have been submitted yet. If the
1360  * kernel requires later use of a particular SQE entry, it will have made a private copy of it.
1361  *
1362  * Note: For interrupt driven I/O (where `IORING_SETUP_IOPOLL` was not specified in the call to
1363  *     `io_uring_setup(2)`), an application may check the completion queue for event completions without
1364  *     entering the kernel at all.
1365  *
1366  * See_Also: `io_uring_enter(2)`
1367  *
1368  * Params:
1369  *   fd = the file descriptor returned by io_uring_setup(2).
1370  *   to_submit = specifies the number of I/Os to submit from the submission queue.
1371  *   min_complete = If the `IORING_ENTER_GETEVENTS` bit is set in flags, then the system call will attempt
1372  *        to wait for `min_complete` event completions before returning. If the io_uring instance was configured
1373  *        for polling, by specifying IORING_SETUP_IOPOLL in the call to io_uring_setup(2), then
1374  *        min_complete has a slightly different meaning.  Passing a value of 0 instructs the kernel to
1375  *        return any events which are already complete, without blocking. If min_complete is a non-zero
1376  *        value, the kernel will still return immediately if  any completion  events are available.  If
1377  *        no event completions are available, then the call will poll either until one or more
1378  *        completions become available, or until the process has exceeded its scheduler time slice.
1379  *   flags = Behavior modification flags - `EnterFlags`
1380  *   sig = a pointer to a signal mask (see `sigprocmask(2)`); if sig is not `null`, `io_uring_enter()`
1381  *         first replaces the current signal mask by the one pointed to by sig, then waits for events to
1382  *         become available in the completion queue, and then restores the original signal mask. The
1383  *         following `io_uring_enter()` call:
1384  *
1385  *         ```
1386  *         ret = io_uring_enter(fd, 0, 1, IORING_ENTER_GETEVENTS, &sig);
1387  *         ```
1388  *
1389  *         is equivalent to atomically executing the following calls:
1390  *
1391  *         ```
1392  *         pthread_sigmask(SIG_SETMASK, &sig, &orig);
1393  *         ret = io_uring_enter(fd, 0, 1, IORING_ENTER_GETEVENTS, NULL);
1394  *         pthread_sigmask(SIG_SETMASK, &orig, NULL);
1395  *         ```
1396  *
1397  *         See the description of `pselect(2)` for an explanation of why the sig parameter is necessary.
1398  *
 * Returns: The number of I/Os successfully consumed. On error, -1 is returned and `errno` is set
 *     appropriately.
1400  */
1401 int io_uring_enter(int fd, uint to_submit, uint min_complete, EnterFlags flags, const sigset_t* sig = null)
1402 {
1403     pragma(inline);
1404     return syscall(SYS_io_uring_enter, fd, to_submit, min_complete, flags, sig, sigset_t.sizeof);
1405 }
1406 
1407 /// ditto
1408 int io_uring_enter(int fd, uint to_submit, uint min_complete, EnterFlags flags, const io_uring_getevents_arg* args)
1409 {
1410     pragma(inline);
1411     return syscall(SYS_io_uring_enter, fd, to_submit, min_complete, flags, args, io_uring_getevents_arg.sizeof);
1412 }
1413 
1414 /**
1415  * Register files or user buffers for asynchronous I/O.
1416  *
1417  * The `io_uring_register()` system call registers user buffers or files for use in an `io_uring(7)`
1418  * instance referenced by fd.  Registering files or user buffers allows the kernel to take long term
1419  * references to internal data structures or create long term mappings of application memory,
1420  * greatly reducing per-I/O overhead.
1421  *
 * See_Also: `io_uring_register(2)`
1423  *
1424  * Params:
1425  *   fd = the file descriptor returned by a call to io_uring_setup(2)
1426  *   opcode = code of operation to execute on args
1427  *   arg = Args used by specified operation. See `RegisterOpCode` for usage details.
1428  *   nr_args = number of provided arguments
1429  *
1430  * Returns: On success, io_uring_register() returns 0.  On error, -1 is returned, and errno is set accordingly.
1431  */
1432 int io_uring_register(int fd, RegisterOpCode opcode, const(void)* arg, uint nr_args)
1433 {
1434     pragma(inline);
1435     return syscall(SYS_io_uring_register, fd, opcode, arg, nr_args);
1436 }
1437 
1438 private:
1439 
1440 // Syscalls
1441 enum
1442 {
1443     SYS_io_uring_setup       = 425,
1444     SYS_io_uring_enter       = 426,
1445     SYS_io_uring_register    = 427
1446 }
1447 
1448 extern (C):
1449 
/// Invoke system call number `sysno`, passing it the remaining arguments.
1451 int syscall(int sysno, ...);