1 /**
2  * io_uring system api definitions.
3  *
4  * See: https://github.com/torvalds/linux/blob/master/include/uapi/linux/io_uring.h
5  *
6  * Last changes from: f8e85cf255ad57d65eeb9a9d0e59e3dec55bdd9e (20191123)
7 */
8 module during.io_uring;
9 
10 version (linux):
11 
12 import core.sys.posix.poll;
13 import core.sys.posix.signal;
14 
15 @system nothrow @nogc:
16 
17 /**
18  * IO operation submission data structure (Submission queue entry).
19  *
20  * C API: `struct io_uring_sqe`
21  */
22 struct SubmissionEntry
23 {
24     Operation               opcode;         /// type of operation for this sqe
25     SubmissionEntryFlags    flags;          /// IOSQE_ flags
26     ushort                  ioprio;         /// ioprio for the request
27     int                     fd;             /// file descriptor to do IO on
28     union
29     {
30         ulong off;                          /// offset into file
31         ulong addr2;                        /// from Linux 5.5
32     }
33 
34     ulong addr;                             /// pointer to buffer or iovecs
35     uint len;                               /// buffer size or number of iovecs
36 
37     union
38     {
39         ReadWriteFlags      rw_flags;
40         FsyncFlags          fsync_flags;
41         PollEvents          poll_events;
42         SyncFileRangeFlags  sync_range_flags;   /// from Linux 5.2
43         MsgFlags            msg_flags;          /// from Linux 5.3
44         TimeoutFlags        timeout_flags;      /// from Linux 5.4
45         AcceptFlags         accept_flags;       /// from Linux 5.5
46         // uint                cancel_flags;       /// from Linux 5.5 (TODO: not any yet)
47     }
48 
49     ulong user_data;                        /// data to be passed back at completion time
50 
51     union
52     {
53         ushort buf_index;                   /// index into fixed buffers, if used
54         ulong[3] __pad2;
55     }
56 
57     /// Resets entry fields
58     void clear() @safe nothrow @nogc
59     {
60         this = SubmissionEntry.init;
61     }
62 }
63 
/**
 * Per-request read/write flags (`RWF_*`), used with `READV`/`WRITEV` style operations.
 *
 * See_Also: `preadv2(2)`, `pwritev2(2)`
 */
enum ReadWriteFlags : int
{
    NONE = 0,

    /// High priority read/write.  Allows block-based filesystems to
    /// use polling of the device, which provides lower latency, but
    /// may use additional resources.  (Currently, this feature is
    /// usable only  on  a  file  descriptor opened using the
    /// O_DIRECT flag.)
    ///
    /// (since Linux 4.6)
    HIPRI = 0x00000001,

    /// Provide a per-write equivalent of the O_DSYNC open(2) flag.
    /// This flag is meaningful only for pwritev2(), and its effect
    /// applies only to the data range written by the system call.
    ///
    /// (since Linux 4.7)
    DSYNC = 0x00000002,

    /// Provide a per-write equivalent of the O_SYNC open(2) flag.
    /// This flag is meaningful only for pwritev2(), and its effect
    /// applies only to the data range written by the system call.
    ///
    /// (since Linux 4.7)
    SYNC = 0x00000004,

    /// Do not wait for data which is not immediately available.  If
    /// this flag is specified, the preadv2() system call will
    /// return instantly if it would have to read data from the
    /// backing storage or wait for a lock.  If some data was
    /// successfully read, it will return the number of bytes read.
    /// If no bytes were read, it will return -1 and set errno to
    /// EAGAIN.  Currently, this flag is meaningful only for
    /// preadv2().
    ///
    /// (since Linux 4.14)
    NOWAIT = 0x00000008,

    /// Provide a per-write equivalent of the O_APPEND open(2) flag.
    /// This flag is meaningful only for pwritev2(), and its effect
    /// applies only to the data range written by the system call.
    /// The offset argument does not affect the write operation; the
    /// data is always appended to the end of the file.  However, if
    /// the offset argument is -1, the current file offset is
    /// updated.
    ///
    /// (since Linux 4.16)
    APPEND = 0x00000010
}
114 
/**
 * Flags for the `FSYNC` operation (`IORING_FSYNC_*`).
 */
enum FsyncFlags : uint
{
    /// Normal file integrity sync
    NORMAL      = 0,

    /**
     * `fdatasync` semantics: only flush data (and metadata required to read it),
     * not all file metadata.
     *
     * See_Also: `fsync(2)` for details
     */
    DATASYNC    = (1 << 0)
}
127 
/** Possible poll event flags (values come from the platform's `poll.h`).
 *  See: poll(2)
 */
enum PollEvents : ushort
{
    /// There is data to read.
    IN      = POLLIN,

    /** Writing is now possible, though a write larger than the available
     *  space in a socket or pipe will still block (unless O_NONBLOCK is set).
     */
    OUT     = POLLOUT,

    /** There is some exceptional condition on the file descriptor.
     *  Possibilities include:
     *
     *  *  There is out-of-band data on a TCP socket (see tcp(7)).
     *  *  A pseudoterminal master in packet mode has seen a state
     *      change on the slave (see ioctl_tty(2)).
     *  *  A cgroup.events file has been modified (see cgroups(7)).
     */
    PRI     = POLLPRI,

    /** Error condition (only returned in revents; ignored in events).
     *  This bit is also set for a file descriptor referring to the
     *  write end of a pipe when the read end has been closed.
     */
    ERR     = POLLERR,

    /// Invalid request: fd not open (only returned in revents; ignored in events).
    NVAL    = POLLNVAL,

    RDNORM  = POLLRDNORM, /// Equivalent to POLLIN.
    RDBAND  = POLLRDBAND, /// Priority band data can be read (generally unused on Linux).
    WRNORM  = POLLWRNORM, /// Equivalent to POLLOUT.
    WRBAND  = POLLWRBAND, /// Priority data may be written.

    /** Hang up (only returned in revents; ignored in events).  Note
     *  that when reading from a channel such as a pipe or a stream
     *  socket, this event merely indicates that the peer closed its
     *  end of the channel.  Subsequent reads from the channel will
     *  return 0 (end of file) only after all outstanding data in the
     *  channel has been consumed.
     */
    HUP     = POLLHUP,
}
174 
175 /**
176  * Flags for `sync_file_range(2)` operation.
177  *
178  * See_Also: `sync_file_range(2)` for details
179  */
180 enum SyncFileRangeFlags : uint
181 {
182     NOOP            = 0, /// no operation
183     /// Wait upon write-out of all pages in the specified range that have already been submitted to
184     /// the device driver for write-out before performing any write.
185     WAIT_BEFORE     = 1U << 0,
186 
187     /// Initiate write-out of all dirty pages in the specified range which are not presently
188     /// submitted write-out.  Note that even this may block if you attempt to write more than
189     /// request queue size.
190     WRITE           = 1U << 1,
191 
192     /// Wait upon write-out of all pages in the range after performing any write.
193     WAIT_AFTER      = 1U << 2,
194 
195     /// This is a write-for-data-integrity operation that will ensure that all pages in the
196     /// specified range which were dirty when sync_file_range() was called are committed to disk.
197     WRITE_AND_WAIT  = WAIT_BEFORE | WRITE | WAIT_AFTER
198 }
199 
200 /**
201  * Flags for `sendmsg(2)` and `recvmsg(2)` operations.
202  *
203  * See_Also: man pages for the operations.
204  */
205 enum MsgFlags : uint
206 {
207     /// No flags defined
208     NONE = 0,
209 
210     /// Sends out-of-band data on sockets that support this notion (e.g., of type `SOCK_STREAM`); the
211     /// underlying protocol must also support out-of-band data.
212     OOB = 0x01,
213 
214     /// This flag causes the receive operation to return data from the beginning of the receive
215     /// queue without removing that data from the queue. Thus, a subsequent receive call will return
216     /// the same data.
217     PEEK = 0x02,
218 
219     /// Don't use a gateway to send out the packet, send to hosts only on directly connected
220     /// networks. This is usually used only by diagnostic or routing programs. This is defined only
221     /// for protocol families that route; packet sockets don't.
222     DONTROUTE = 0x04,
223 
224     /// For raw (`AF_PACKET`), Internet datagram (since Linux 2.4.27/2.6.8), netlink (since Linux
225     /// 2.6.22), and UNIX datagram (since Linux 3.4) sockets: return the real length of the packet
226     /// or datagram, even when it was longer than the passed buffer.
227     ///
228     /// For use with Internet stream sockets, see `tcp(7)`.
229     TRUNC = 0x20,
230 
231     /// Enables nonblocking operation; if the operation would block, EAGAIN or EWOULDBLOCK is
232     /// returned. This provides similar behavior to setting the O_NONBLOCK flag (via the `fcntl(2)`
233     /// F_SETFL operation), but differs in that `MSG_DONTWAIT` is a per-call option, whereas
234     /// `O_NONBLOCK` is a setting on the open file description (see `open(2)`), which will affect
235     /// all threads in the calling process and as well as other processes that hold file descriptors
236     /// referring to the same open file description.
237     DONTWAIT = 0x40,
238 
239     /// Terminates a record (when this notion is supported, as for sockets of type `SOCK_SEQPACKET`).
240     EOR = 0x80,
241 
242     /// This flag requests that the operation block until the full request is satisfied. However,
243     /// the call may still return less data than requested if a signal is caught, an error or
244     /// disconnect occurs, or the next data to be received is of a different type than that
245     /// returned. This flag has no effect for datagram sockets.
246     WAITALL = 0x100,
247 
248     /// Tell the link layer that forward progress happened: you got a successful reply from the
249     /// other side. If the link layer doesn't get this it will regularly reprobe the neighbor (e.g.,
250     /// via a unicast ARP). Valid  only  on SOCK_DGRAM and SOCK_RAW sockets and currently
251     /// implemented only for IPv4 and IPv6. See arp(7) for details.
252     CONFIRM = 0x800,
253 
254     /// This flag specifies that queued errors should be received from the socket error queue. The
255     /// error is passed in an ancillary message with a type dependent on the protocol (for IPv4
256     /// `IP_RECVERR`). The user should supply a buffer of sufficient size. See `cmsg(3)` and `ip(7)`
257     /// for more information. The payload of the original packet that caused the error is passed as
258     /// normal data via msg_iovec. The original destination address of the datagram that caused the
259     /// error is supplied via `msg_name`.
260     ERRQUEUE = 0x2000,
261 
262     /// Don't generate a `SIGPIPE` signal if the peer on a stream-oriented socket has closed the
263     /// connection. The `EPIPE` error is still returned. This provides similar behavior to using
264     /// `sigaction(2)` to ignore `SIGPIPE`, but, whereas `MSG_NOSIGNAL` is a per-call feature,
265     /// ignoring `SIGPIPE` sets a process attribute that affects all threads in the process.
266     NOSIGNAL = 0x4000,
267 
268     /// The caller has more data to send. This flag is used with TCP sockets to obtain the same
269     /// effect as the `TCP_CORK` socket option (see `tcp(7)`), with the difference that this flag can be
270     /// set on a per-call basis.
271     ///
272     /// Since Linux 2.6, this flag is also supported for UDP sockets, and informs the kernel to
273     /// package all of the data sent in calls with this flag set into a single datagram which is
274     /// transmitted only when a call is performed that does not specify this flag.
275     ///
276     /// See_Also: the `UDP_CORK` socket option described in `udp(7)`
277     MORE = 0x8000,
278 
279     /// Set the close-on-exec flag for the file descriptor received via a UNIX domain file
280     /// descriptor using the `SCM_RIGHTS` operation (described in `unix(7)`). This flag is useful
281     /// for the same reasons as the `O_CLOEXEC` flag of `open(2)`. (recvmsg only)
282     CMSG_CLOEXEC = 0x40000000
283 }
284 
/** Flags for `TIMEOUT` operations (`sqe.timeout_flags`).
 */
enum TimeoutFlags : uint
{
    REL = 0,        /// timeout is relative to now (default)
    ABS = 1U << 0   /// `IORING_TIMEOUT_ABS`: timeout is an absolute time value (from Linux 5.5)
}
292 
293 /**
294  * Flags that can be used with the `accept4(2)` operation.
295  */
296 enum AcceptFlags : uint
297 {
298     /// Same as `accept()`
299     NONE = 0,
300 
301     /// Set the `O_NONBLOCK` file status flag on the new open file description. Using this flag saves
302     /// extra calls to `fcntl(2)` to achieve the same result.
303     NONBLOCK = 0x800, // octal 00004000
304 
305     /// Set the close-on-exec (`FD_CLOEXEC`) flag on the new file descriptor. See the description of
306     /// the `O_CLOEXEC` flag in `open(2)` for reasons why this may be useful.
307     CLOEXEC = 0x80000 // octal 02000000
308 }
309 
310 /**
311  * Describes the operation to be performed
312  *
313  * See_Also: `io_uring_enter(2)`
314  */
315 enum Operation : ubyte
316 {
317     // available from Linux 5.1
318     NOP = 0,                /// IORING_OP_NOP
319     READV = 1,              /// IORING_OP_READV
320     WRITEV = 2,             /// IORING_OP_WRITEV
321     FSYNC = 3,              /// IORING_OP_FSYNC
322     READ_FIXED = 4,         /// IORING_OP_READ_FIXED
323     WRITE_FIXED = 5,        /// IORING_OP_WRITE_FIXED
324     POLL_ADD = 6,           /// IORING_OP_POLL_ADD
325     POLL_REMOVE = 7,        /// IORING_OP_POLL_REMOVE
326 
327     // available from Linux 5.2
328     SYNC_FILE_RANGE = 8,    /// IORING_OP_SYNC_FILE_RANGE
329 
330     // available from Linux 5.3
331     SENDMSG = 9,            /// IORING_OP_SENDMSG
332     RECVMSG = 10,           /// IORING_OP_RECVMSG
333 
334     // available from Linux 5.4
335     TIMEOUT = 11,           /// IORING_OP_TIMEOUT
336 
337     // available from Linux 5.5 (in master now)
338     TIMEOUT_REMOVE = 12,    /// IORING_OP_TIMEOUT_REMOVE
339     ACCEPT = 13,            /// IORING_OP_ACCEPT
340     ASYNC_CANCEL = 14,      /// IORING_OP_ASYNC_CANCEL
341     LINK_TIMEOUT = 15,      /// IORING_OP_LINK_TIMEOUT
342     CONNECT = 16,           /// IORING_OP_CONNECT
343 }
344 
/// Per-entry flags (`sqe.flags`, `IOSQE_*`).
enum SubmissionEntryFlags : ubyte
{
    NONE        = 0,
    FIXED_FILE  = 1U << 0,  /// IOSQE_FIXED_FILE: use fixed fileset

    /**
     * `IOSQE_IO_DRAIN`: issue after inflight IO
     *
     * If a request is marked with `IO_DRAIN`, then previous commands must complete before this one
     * is issued. Subsequent requests are not started until the drain has completed.
     *
     * Note: available from Linux 5.2
     */
    IO_DRAIN    = 1U << 1,

    /**
     * `IOSQE_IO_LINK`
     *
     * If set, the next SQE in the ring will depend on this SQE. A dependent SQE will not be started
     * until the parent SQE has completed. If the parent SQE fails, then a dependent SQE will be
     * failed without being started. Link chains can be arbitrarily long, the chain spans any new
     * SQE that continues to have the IOSQE_IO_LINK flag set. Once an SQE is encountered that does
     * not have this flag set, that defines the end of the chain. This feature allows to form
     * dependencies between individual SQEs.
     *
     * Note: available from Linux 5.3
     */
    IO_LINK     = 1U << 2,
}
375 
376 /**
377  * IO completion data structure (Completion Queue Entry)
378  *
379  * C API: `struct io_uring_cqe`
380  */
381 struct CompletionEntry
382 {
383     ulong   user_data;  /* sqe->data submission passed back */
384     int     res;        /* result code for this event */
385     uint    flags;
386 }
387 
388 /**
389  * Passed in for io_uring_setup(2). Copied back with updated info on success.
390  *
391  * C API: `struct io_uring_params`
392  */
393 struct SetupParameters
394 {
395     // Magic offsets for the application to mmap the data it needs
396 
397     /// `IORING_OFF_SQ_RING`: mmap offset for submission queue ring
398     enum ulong SUBMISSION_QUEUE_RING_OFFSET = 0UL;
399     /// `IORING_OFF_CQ_RING`: mmap offset for completion queue ring
400     enum ulong COMPLETION_QUEUE_RING_OFFSET = 0x8000000UL;
401     /// `IORING_OFF_SQES`: mmap offset for submission entries
402     enum ulong SUBMISSION_QUEUE_ENTRIES_OFFSET = 0x10000000UL;
403 
404     /// (output) allocated entries in submission queue
405     /// (both ring index `array` and separate entry array at `SUBMISSION_QUEUE_ENTRIES_OFFSET`).
406     uint                        sq_entries;
407 
408     /// (output) allocated entries in completion queue
409     uint                        cq_entries;
410 
411     SetupFlags                  flags;          /// (input)
412 
413     /// (input) used if SQ_AFF and SQPOLL flags are active to pin poll thread to specific cpu.
414     /// right now always checked in kernel for "possible cpu".
415     uint                        sq_thread_cpu;
416 
417     /// (input) used if SQPOLL flag is active; timeout in milliseconds
418     /// until kernel poll thread goes to sleep.
419     uint                        sq_thread_idle;
420     SetupFeatures               features;       /// (from Linux 5.4)
421     private uint[4]             resv;           // reserved
422     SubmissionQueueRingOffsets  sq_off;         /// (output) submission queue ring data field offsets
423     CompletionQueueRingOffsets  cq_off;         /// (output) completion queue ring data field offsets
424 }
425 
/// `io_uring_setup()` flags (`IORING_SETUP_*`)
enum SetupFlags : uint
{
    /// No flags set
    NONE    = 0,

    /**
     * `IORING_SETUP_IOPOLL`
     *
     * Perform busy-waiting for an I/O completion, as opposed to getting notifications via an
     * asynchronous IRQ (Interrupt Request).  The file system (if any) and block device must
     * support polling in order for  this  to  work. Busy-waiting  provides  lower latency, but may
     * consume more CPU resources than interrupt driven I/O.  Currently, this feature is usable
     * only on a file descriptor opened using the O_DIRECT flag.  When a read or write is submitted
     * to a polled context, the application must poll for completions on the CQ ring by calling
     * io_uring_enter(2).  It is illegal to mix and match polled and non-polled I/O on an io_uring
     * instance.
     */
    IOPOLL  = 1U << 0,

    /**
     * `IORING_SETUP_SQPOLL`
     *
     * When this flag is specified, a kernel thread is created to perform submission queue polling.
     * An io_uring instance configured in this way enables an application to issue I/O without ever
     * context switching into the kernel.
     * By using the submission queue to fill in new submission queue entries and watching for
     * completions on the completion queue, the application can submit and reap I/Os without doing
     * a single system call.
     * If the kernel thread is idle for more than sq_thread_idle milliseconds, it will set the
     * IORING_SQ_NEED_WAKEUP bit in the flags field of the struct io_sq_ring. When this happens,
     * the application must call io_uring_enter(2) to wake the kernel thread. If I/O is kept busy,
     * the kernel thread will never sleep. An application making use of this feature will need to
     * guard the io_uring_enter(2) call with  the  following  code sequence:
     *
     *     ```
     *     // Ensure that the wakeup flag is read after the tail pointer has been written.
     *     smp_mb();
     *     if (*sq_ring->flags & IORING_SQ_NEED_WAKEUP)
     *         io_uring_enter(fd, 0, 0, IORING_ENTER_SQ_WAKEUP);
     *     ```
     *
     * where sq_ring is a submission queue ring setup using the struct io_sqring_offsets described below.
     *
     * To  successfully  use this feature, the application must register a set of files to be used for
     * IO through io_uring_register(2) using the IORING_REGISTER_FILES opcode. Failure to do so will
     * result in submitted IO being errored with EBADF.
     */
    SQPOLL  = 1U << 1,

    /**
     * `IORING_SETUP_SQ_AFF`
     *
     *  If this flag is specified, then the poll thread will be bound to the cpu set in the
     *  sq_thread_cpu field of the struct io_uring_params.  This flag is only meaningful when
     *  IORING_SETUP_SQPOLL is specified.
     */
    SQ_AFF  = 1U << 2,

    /**
     * `IORING_SETUP_CQSIZE`
     *
     * Create the completion queue with struct io_uring_params.cq_entries entries.  The value must
     * be greater than entries, and may be rounded up to the next power-of-two.
     *
     * Note: Available from Linux 5.5
     */
    CQSIZE  = 1U << 3,
}
495 
/// `io_uring_params.features` flags — kernel-reported capabilities (`IORING_FEAT_*`)
enum SetupFeatures : uint
{
    NONE        = 0,
    SINGLE_MMAP = 1U << 0,  /// `IORING_FEAT_SINGLE_MMAP` (from Linux 5.4): SQ and CQ rings can be mapped with a single mmap
    NODROP      = 1U << 1   /// `IORING_FEAT_NODROP` (from Linux 5.5)
}
503 
504 /**
505  * Filled with the offset for mmap(2)
506  *
507  * C API: `struct io_sqring_offsets`
508  */
509 struct SubmissionQueueRingOffsets
510 {
511     /// Incremented by kernel after entry at `head` was processed.
512     /// Pending submissions: [head..tail]
513     uint head;
514 
515     /// Modified by user space when new entry was queued; points to next
516     /// entry user space is going to fill.
517     uint tail;
518 
519     /// value `value_at(self.ring_entries) - 1`
520     /// mask for indices at `head` and `tail` (don't delete masked bits!
521     /// `head` and `tail` can point to the same entry, but if they are
522     /// not exactly equal it implies the ring is full, and if they are
523     /// exactly equal the ring is empty.)
524     uint ring_mask;
525 
526     /// value same as SetupParameters.sq_entries, power of 2.
527     uint ring_entries;
528 
529     /// SubmissionQueueFlags
530     SubmissionQueueFlags flags;
531 
532     /// number of (invalid) entries that were dropped; entries are
533     /// invalid if their index (in `array`) is out of bounds.
534     uint dropped;
535 
536     /// index into array of `SubmissionEntry`s at offset `SUBMISSION_QUEUE_ENTRIES_OFFSET` in mmap()
537     uint array;
538 
539     private uint[3] resv; // reserved
540 }
541 
/// Flags published by the kernel in the SQ ring's `flags` field (`IORING_SQ_*`).
enum SubmissionQueueFlags: uint
{
    NONE        = 0,

    /// `IORING_SQ_NEED_WAKEUP`: needs io_uring_enter wakeup
    /// set by kernel poll thread when it goes sleeping, and reset on wakeup
    NEED_WAKEUP = 1U << 0
}
550 
551 /**
552  * Field offsets used to map kernel structure to our.
553  *
554  * C API: `struct io_cqring_offsets`
555  */
556 struct CompletionQueueRingOffsets
557 {
558     /// incremented by user space after entry at `head` was processed.
559     /// available entries for processing: [head..tail]
560     uint head;
561 
562     /// modified by kernel when new entry was created; points to next
563     /// entry kernel is going to fill.
564     uint tail;
565 
566     /// value `value_at(ring_entries) - 1`
567     /// mask for indices at `head` and `tail` (don't delete masked bits!
568     /// `head` and `tail` can point to the same entry, but if they are
569     /// not exactly equal it implies the ring is full, and if they are
570     /// exactly equal the ring is empty.)
571     uint ring_mask;
572 
573     /// value same as SetupParameters.cq_entries, power of 2.
574     uint ring_entries;
575 
576     /// incremented by the kernel every time it failed to queue a
577 	/// completion event because the ring was full.
578     uint overflow;
579 
580     /// Offset to array of completion queue entries
581     uint cqes;
582 
583     private ulong[2] resv; // reserved
584 }
585 
/// io_uring_register(2) opcodes and arguments (`IORING_REGISTER_*` / `IORING_UNREGISTER_*`)
enum RegisterOpCode : uint
{
    /**
     * `arg` points to a struct iovec array of nr_args entries.  The buffers associated with the
     * iovecs will be locked in memory and charged against the user's RLIMIT_MEMLOCK resource limit.
     * See getrlimit(2) for more information.  Additionally, there is a size limit of 1GiB per
     * buffer.  Currently, the buffers must be anonymous, non-file-backed memory, such as that
     * returned by malloc(3) or mmap(2) with the MAP_ANONYMOUS flag set.  It is expected that this
     * limitation will be lifted in the future. Huge pages are supported as well. Note that the
     * entire huge page will be pinned in the kernel, even if only a portion of it is used.
     *
     * After a successful call, the supplied buffers are mapped into the kernel and eligible for
     * I/O.  To make use of them, the application must specify the IORING_OP_READ_FIXED or
     * IORING_OP_WRITE_FIXED opcodes in the submission queue entry (see the struct io_uring_sqe
     * definition in io_uring_enter(2)), and set the buf_index field to the desired buffer index.
     * The memory range described by the submission queue entry's addr and len fields must fall
     * within the indexed buffer.
     *
     * It is perfectly valid to setup a large buffer and then only use part of it for an I/O, as
     * long as the range is within the originally mapped region.
     *
     * An application can increase or decrease the size or number of registered buffers by first
     * unregistering the existing buffers, and then issuing a new call to io_uring_register() with
     * the new buffers.
     *
     * An application need not unregister buffers explicitly before shutting down the io_uring
     * instance.
     *
     * `IORING_REGISTER_BUFFERS`
     */
    REGISTER_BUFFERS        = 0,

    /**
     * This operation takes no argument, and `arg` must be passed as NULL. All previously registered
     * buffers associated with the io_uring instance will be released.
     *
     * `IORING_UNREGISTER_BUFFERS`
     */
    UNREGISTER_BUFFERS      = 1,

    /**
     * Register files for I/O. `arg` contains a pointer to an array of `nr_args` file descriptors
     * (signed 32 bit integers).
     *
     * To make use of the registered files, the IOSQE_FIXED_FILE flag must be set in the flags
     * member of the struct io_uring_sqe, and the fd member is set to the index of the file in the
     * file descriptor array.
     *
     * Files are automatically unregistered when the io_uring instance is torn down. An application
     * need only unregister if it wishes to register a new set of fds.
     *
     * `IORING_REGISTER_FILES`
     */
    REGISTER_FILES          = 2,

    /**
     * This operation requires no argument, and `arg` must be passed as NULL.  All previously
     * registered files associated with the io_uring instance will be unregistered.
     *
     * `IORING_UNREGISTER_FILES`
     */
    UNREGISTER_FILES        = 3,

    /**
     * `IORING_REGISTER_EVENTFD`
     *
     * Registers eventfd that would be used to notify about completions on io_uring itself.
     *
     * Note: available from Linux 5.2
     */
    REGISTER_EVENTFD        = 4,

    /**
     * `IORING_UNREGISTER_EVENTFD`
     *
     * Unregisters previously registered eventfd.
     *
     * Note: available from Linux 5.2
     */
    UNREGISTER_EVENTFD      = 5,

    /// `IORING_REGISTER_FILES_UPDATE` (from Linux 5.5)
    REGISTER_FILES_UPDATE   = 6,
}
671 
/// io_uring_enter(2) flags (`IORING_ENTER_*`)
enum EnterFlags: uint
{
    NONE        = 0,
    GETEVENTS   = (1 << 0), /// `IORING_ENTER_GETEVENTS`: wait for `min_complete` completions
    SQ_WAKEUP   = (1 << 1), /// `IORING_ENTER_SQ_WAKEUP`: wake up the SQPOLL kernel thread
}
679 
/// Time specification as defined in kernel headers (used by TIMEOUT operations).
/// C API: `struct __kernel_timespec`
struct KernelTimespec
{
    long tv_sec; /// seconds
    long tv_nsec; /// nanoseconds
}
686 
// Compile-time ABI layout checks: these structures are shared with the kernel
// via mmap()/syscalls, so their sizes must match the C definitions exactly.
static assert(CompletionEntry.sizeof == 16);
static assert(CompletionQueueRingOffsets.sizeof == 40);
static assert(SetupParameters.sizeof == 120);
static assert(SubmissionEntry.sizeof == 64);
static assert(SubmissionQueueRingOffsets.sizeof == 40);
692 
693 /**
694  * Setup a context for performing asynchronous I/O.
695  *
696  * The `io_uring_setup()` system call sets up a submission queue (SQ) and completion queue (CQ) with
697  * at least entries entries, and returns a file descriptor which can be used to perform subsequent
698  * operations on the io_uring instance. The submission and completion queues are shared between
699  * userspace and the kernel, which eliminates the need to copy data when initiating and completing
700  * I/O.
701  *
702  * See_Also: `io_uring_setup(2)`
703  *
704  * Params:
705  *   entries = Defines how many entries can submission queue hold.
706  *   p = `SetupParameters`
707  *
708  * Returns:
709  *     `io_uring_setup(2)` returns a new file descriptor on success. The application may then provide
710  *     the file descriptor in a subsequent `mmap(2)` call to map the submission and completion queues,
711  *     or to the `io_uring_register(2)` or `io_uring_enter(2)` system calls.
712  *
713  *     On error, -1 is returned and `errno` is set appropriately.
714  */
715 int io_uring_setup(uint entries, scope ref SetupParameters p) @trusted
716 {
717     pragma(inline);
718     return syscall(SYS_io_uring_setup, entries, &p);
719 }
720 
721 /**
722  * Initiate and/or complete asynchronous I/O
723  *
724  * `io_uring_enter()` is used to initiate and complete I/O using the shared submission and
725  * completion queues setup by a call to `io_uring_setup(2)`. A single call can both submit new I/O
726  * and wait for completions of I/O initiated by this call or previous calls to `io_uring_enter()``.
727  *
728  * When the system call returns that a certain amount of SQEs have been consumed and submitted, it's
729  * safe to reuse SQE entries in the ring. This is true even if the actual IO submission had to be
730  * punted to async context, which means that the SQE may in fact not have been submitted yet. If the
731  * kernel requires later use of a particular SQE entry, it will have made a private copy of it.
732  *
733  * Note: For interrupt driven I/O (where `IORING_SETUP_IOPOLL` was not specified in the call to
734  *     `io_uring_setup(2)`), an application may check the completion queue for event completions without
735  *     entering the kernel at all.
736  *
737  * See_Also: `io_uring_enter(2)`
738  *
739  * Params:
740  *   fd = the file descriptor returned by io_uring_setup(2).
741  *   to_submit = specifies the number of I/Os to submit from the submission queue.
742  *   min_complete = If the `IORING_ENTER_GETEVENTS` bit is set in flags, then the system call will attempt
743  *        to wait for `min_complete` event completions before returning. If the io_uring instance was configured
744  *        for polling, by specifying IORING_SETUP_IOPOLL in the call to io_uring_setup(2), then
745  *        min_complete has a slightly different meaning.  Passing a value of 0 instructs the kernel to
746  *        return any events which are already complete, without blocking. If min_complete is a non-zero
747  *        value, the kernel will still return immediately if  any completion  events are available.  If
748  *        no event completions are available, then the call will poll either until one or more
749  *        completions become available, or until the process has exceeded its scheduler time slice.
750  *   flags = Behavior modification flags - `EnterFlags`
751  *   sig = a pointer to a signal mask (see `sigprocmask(2)`); if sig is not `null`, `io_uring_enter()`
752  *         first replaces the current signal mask by the one pointed to by sig, then waits for events to
753  *         become available in the completion queue, and then restores the original signal mask. The
754  *         following `io_uring_enter()` call:
755  *
756  *         ```
757  *         ret = io_uring_enter(fd, 0, 1, IORING_ENTER_GETEVENTS, &sig);
758  *         ```
759  *
760  *         is equivalent to atomically executing the following calls:
761  *
762  *         ```
763  *         pthread_sigmask(SIG_SETMASK, &sig, &orig);
764  *         ret = io_uring_enter(fd, 0, 1, IORING_ENTER_GETEVENTS, NULL);
765  *         pthread_sigmask(SIG_SETMASK, &orig, NULL);
766  *         ```
767  *
768  *         See the description of `pselect(2)` for an explanation of why the sig parameter is necessary.
769  *
770  * Returns:
771  */
772 int io_uring_enter(int fd, uint to_submit, uint min_complete, EnterFlags flags, const sigset_t* sig = null)
773 {
774     pragma(inline);
775     return syscall(SYS_io_uring_enter, fd, to_submit, min_complete, flags, sig, sigset_t.sizeof);
776 }
777 
778 /**
779  * Register files or user buffers for asynchronous I/O.
780  *
781  * The `io_uring_register()` system call registers user buffers or files for use in an `io_uring(7)`
782  * instance referenced by fd.  Registering files or user buffers allows the kernel to take long term
783  * references to internal data structures or create long term mappings of application memory,
784  * greatly reducing per-I/O overhead.
785  *
786  * See_Also: `io_uring_register(2)
787  *
788  * Params:
789  *   fd = the file descriptor returned by a call to io_uring_setup(2)
790  *   opcode = code of operation to execute on args
791  *   arg = Args used by specified operation. See `RegisterOpCode` for usage details.
792  *   nr_args = number of provided arguments
793  *
794  * Returns: On success, io_uring_register() returns 0.  On error, -1 is returned, and errno is set accordingly.
795  */
796 int io_uring_register(int fd, RegisterOpCode opcode, const(void)* arg, uint nr_args)
797 {
798     pragma(inline);
799     return syscall(SYS_io_uring_register, fd, opcode, arg, nr_args);
800 }
801 
private:

// Linux syscall numbers for the io_uring family (added in Linux 5.1).
// These use the unified post-asm-generic numbering, so they are the same
// across architectures that gained the syscalls — verify if targeting an
// unusual platform.
enum
{
    SYS_io_uring_setup       = 425,
    SYS_io_uring_enter       = 426,
    SYS_io_uring_register    = 427
}
811 
extern (C):

/// Invoke `system call' number `sysno`, passing it the remaining arguments.
/// Provided by libc; returns -1 and sets `errno` on failure (see `syscall(2)`).
int syscall(int sysno, ...);