diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools index 50a2fa9c555..03b64585c1a 100644 --- a/criu/Makefile.crtools +++ b/criu/Makefile.crtools @@ -31,6 +31,7 @@ obj-y += fsnotify.o obj-y += image-desc.o obj-y += image.o obj-y += img-streamer.o +obj-y += io_uring.o obj-y += ipc_ns.o obj-y += irmap.o obj-y += kcmp-ids.o diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 940f6224625..6e62f97c028 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -85,6 +85,7 @@ #include "pidfd-store.h" #include "apparmor.h" #include "asm/dump.h" +#include "io_uring.h" /* * Architectures can overwrite this function to restore register sets that */ @@ -191,10 +192,11 @@ struct cr_imgset *glob_imgset; static int collect_fds(pid_t pid, struct parasite_drain_fd **dfds) { + char buf[PATH_MAX] = {}; struct dirent *de; - DIR *fd_dir; int size = 0; - int n; + int n, pidfd; + DIR *fd_dir; pr_info("\n"); pr_info("Collecting fds (pid: %d)\n", pid); @@ -204,6 +206,55 @@ static int collect_fds(pid_t pid, struct parasite_drain_fd **dfds) if (!fd_dir) return -1; + /* Before collecting fds, we need to bring every io_uring to a steady + * state, since io_uring can install fds into the task's fdtable; if we + * did this later, during the actual io_uring dump, we would miss + * dumping those files. + */ + if (!kdat.has_pidfd_open) { + pr_err("pidfd_open system call not supported\n"); + closedir(fd_dir); + return -1; + } + if (!kdat.has_pidfd_getfd) { + pr_err("pidfd_getfd system call not supported\n"); + closedir(fd_dir); + return -1; + } + + pidfd = syscall(SYS_pidfd_open, pid, 0); + if (pidfd < 0) { + pr_err("Failed to open pidfd for pid %d\n", pid); + closedir(fd_dir); + return -1; + } + + while ((de = readdir(fd_dir))) { + if (dir_dots(de)) + continue; + + n = dirfd(fd_dir); + if (n == -1) { + close(pidfd); + closedir(fd_dir); + return -1; + } + + /* readlink() does not NUL-terminate, do it ourselves */ + n = readlinkat(n, de->d_name, buf, sizeof(buf) - 1); + if (n == -1) { + close(pidfd); + closedir(fd_dir); + return -1; + } + buf[n] = '\0'; + + if (is_io_uring_link(buf)) { + int tfd; + + tfd = syscall(__NR_pidfd_getfd, pidfd, atoi(de->d_name), 0); + if (tfd < 0) { + pr_err("Failed to get io_uring fd %d for pid %d\n", atoi(de->d_name), pid); + close(pidfd); + closedir(fd_dir); + return -1; + } + if (io_uring_synchronize_fd(tfd)) { + pr_err("Failed to synchronize io_uring fd %d for pid %d\n", atoi(de->d_name), pid); + close(tfd); + close(pidfd); + closedir(fd_dir); + return -1; + } + close(tfd); + } + } + + close(pidfd); + + /* Collect fds now */ + rewinddir(fd_dir); + n = 0; while ((de = readdir(fd_dir))) { if (dir_dots(de)) @@ -489,6 +540,8 @@ static int dump_task_mm(pid_t pid, const struct proc_pid_stat *stat, const struc ret = check_sysvipc_map_dump(pid, vma); else if (vma_entry_is(vma, VMA_AREA_SOCKET)) ret = dump_socket_map(vma_area); + else if (vma_entry_is(vma, VMA_AREA_IO_URING)) + ret = dump_io_uring_map(vma_area); else ret = 0; if (ret) diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 9d2d957f856..b840bd006da 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -29,6 +29,7 @@ #include "servicefd.h" #include "image.h" #include "img-streamer.h" +#include "io_uring.h" #include "util.h" #include "util-pie.h" #include "criu-log.h" @@ -277,7 +278,7 @@ static struct collect_image_info *cinfos_files[] = { &unix_sk_cinfo, &fifo_cinfo, &pipe_cinfo, &nsfile_cinfo, &packet_sk_cinfo, &netlink_sk_cinfo, &eventfd_cinfo, &epoll_cinfo, &epoll_tfd_cinfo, &signalfd_cinfo, &tunfile_cinfo, &timerfd_cinfo, &inotify_cinfo, &inotify_mark_cinfo, &fanotify_cinfo, - &fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo, + &fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo, &io_uring_cinfo, }; /* These images are required to restore namespaces */ diff --git a/criu/files.c b/criu/files.c index 93754fb4400..d1aa98423b0 100644 --- a/criu/files.c +++ b/criu/files.c @@ -49,6 +49,7 @@ #include "kerndat.h" #include "fdstore.h" #include "bpfmap.h"
+#include "io_uring.h" #include "protobuf.h" #include "util.h" @@ -536,6 +537,8 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, else if (is_bpfmap_link(link)) ops = &bpfmap_dump_ops; #endif + else if (is_io_uring_link(link)) + ops = &io_uring_dump_ops; else return dump_unsupp_fd(&p, lfd, "anon", link, e); diff --git a/criu/image-desc.c b/criu/image-desc.c index d65d9c0986e..b72df0d98eb 100644 --- a/criu/image-desc.c +++ b/criu/image-desc.c @@ -107,6 +107,8 @@ struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = { FD_ENTRY_F(BPFMAP_FILE, "bpfmap-file", O_NOBUF), FD_ENTRY_F(BPFMAP_DATA, "bpfmap-data", O_NOBUF), FD_ENTRY(APPARMOR, "apparmor"), + FD_ENTRY_F(IO_URING_FILE, "io_uring-file", O_NOBUF), + FD_ENTRY_F(IO_URING_DATA, "io_uring-data", O_NOBUF), [CR_FD_STATS] = { .fmt = "stats-%s", diff --git a/criu/include/image-desc.h b/criu/include/image-desc.h index 5045baee803..bb006a207c1 100644 --- a/criu/include/image-desc.h +++ b/criu/include/image-desc.h @@ -71,6 +71,8 @@ enum { CR_FD_MEMFD_INODE, CR_FD_BPFMAP_FILE, CR_FD_BPFMAP_DATA, + CR_FD_IO_URING_FILE, + CR_FD_IO_URING_DATA, _CR_FD_GLOB_TO, CR_FD_TMPFS_IMG, diff --git a/criu/include/image.h b/criu/include/image.h index 14659dbd24f..13e0dbcc89e 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -79,11 +79,12 @@ #define VMA_ANON_SHARED (1 << 8) #define VMA_ANON_PRIVATE (1 << 9) -#define VMA_AREA_SYSVIPC (1 << 10) -#define VMA_AREA_SOCKET (1 << 11) -#define VMA_AREA_VVAR (1 << 12) -#define VMA_AREA_AIORING (1 << 13) -#define VMA_AREA_MEMFD (1 << 14) +#define VMA_AREA_SYSVIPC (1 << 10) +#define VMA_AREA_SOCKET (1 << 11) +#define VMA_AREA_VVAR (1 << 12) +#define VMA_AREA_AIORING (1 << 13) +#define VMA_AREA_MEMFD (1 << 14) +#define VMA_AREA_IO_URING (1 << 15) #define VMA_CLOSE (1 << 28) #define VMA_NO_PROT_WRITE (1 << 29) diff --git a/criu/include/io_uring.h b/criu/include/io_uring.h new file mode 100644 index 00000000000..09bfb1c35e8 --- /dev/null +++ b/criu/include/io_uring.h @@ -0,0 +1,108 @@ +#ifndef __CR_IO_URING_H__ +#define __CR_IO_URING_H__ + +#include + +#include "files.h" +#include "io_uring.pb-c.h" + +/* Definitions */ +struct __io_uring_restriction { + __u16 opcode; + union { + __u8 register_op; /* IORING_RESTRICTION_REGISTER_OP */ + __u8 sqe_op; /* IORING_RESTRICTION_SQE_OP */ + __u8 sqe_flags; /* IORING_RESTRICTION_SQE_FLAGS_* */ + }; + __u8 resv; + __u32 resv2[3]; +}; + +#ifndef IORING_SETUP_IOPOLL +#define IORING_SETUP_IOPOLL (1U << 0) /* io_context is polled */ +#endif +#ifndef IORING_SETUP_SQPOLL +#define IORING_SETUP_SQPOLL (1U << 1) /* SQ poll thread */ +#endif +#ifndef IORING_SETUP_SQ_AFF +#define IORING_SETUP_SQ_AFF (1U << 2) /* sq_thread_cpu is valid */ +#endif +#ifndef IORING_SETUP_CQSIZE +#define IORING_SETUP_CQSIZE (1U << 3) /* app defines CQ size */ +#endif +#ifndef IORING_SETUP_ATTACH_WQ +#define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */ +#endif +#ifndef IORING_SETUP_R_DISABLED +#define IORING_SETUP_R_DISABLED (1U << 6) /* start with ring disabled */ +#endif + +#ifndef IORING_OFF_SQ_RING +#define IORING_OFF_SQ_RING 0ULL +#endif +#ifndef IORING_OFF_CQ_RING +#define IORING_OFF_CQ_RING 0x8000000ULL +#endif +#ifndef IORING_OFF_SQES +#define IORING_OFF_SQES 0x10000000ULL +#endif + +#ifndef IOSQE_IO_DRAIN +#define IOSQE_IO_DRAIN (1U << 1) +#endif + +#define __IORING_RESTRICTION_REGISTER_OP 0 +#define __IORING_RESTRICTION_SQE_OP 1 +#define __IORING_RESTRICTION_SQE_FLAGS_ALLOWED 2 +#define __IORING_RESTRICTION_SQE_FLAGS_REQUIRED 3 +#define 
__IORING_REGISTER_PERSONALITY 9 +#define __IORING_REGISTER_RESTRICTIONS 11 +#define __IORING_REGISTER_ENABLE_RINGS 12 + +struct io_uring_file_info { + IoUringFileEntry *iofe; + struct file_desc d; +}; + +struct io_uring_data_info { + IoUringDataEntry *iode; +}; + +struct io_uring_group_desc { + struct list_head list; + gid_t group; + char group_name[32]; +}; + +struct io_uring_personality_desc { + int id; + uid_t uid; + uid_t euid; + uid_t suid; + uid_t fsuid; + gid_t gid; + gid_t egid; + gid_t sgid; + gid_t fsgid; + u32 cap_eff[CR_CAP_SIZE]; + size_t nr_groups; + struct list_head group_list; +}; + +struct io_uring_ctx; + +extern struct collect_image_info io_uring_cinfo; +extern struct collect_image_info io_uring_data_cinfo; +extern const struct fdtype_ops io_uring_dump_ops; + +int is_io_uring_link(char *link); +int io_uring_synchronize_fd(int fd); +int collect_io_uring_map(struct vma_area *vma); +int dump_io_uring_map(struct vma_area *vma); +int add_one_io_uring_mapping(uint64_t offset, ino_t inode); + +int io_uring_push_buf(struct io_uring_ctx *ctx, unsigned int idx, long long unsigned int address, unsigned int len); +int io_uring_push_personality(struct io_uring_ctx *ctx, struct io_uring_personality_desc *desc); +IoUringFileEntry *io_uring_get_iofe(struct io_uring_ctx *ctx); + +#endif /* __CR_IO_URING_H__ */ diff --git a/criu/include/magic.h b/criu/include/magic.h index 22d7218e45f..b968828e722 100644 --- a/criu/include/magic.h +++ b/criu/include/magic.h @@ -100,6 +100,8 @@ #define BPFMAP_FILE_MAGIC 0x57506142 /* Alapayevsk */ #define BPFMAP_DATA_MAGIC 0x64324033 /* Arkhangelsk */ #define APPARMOR_MAGIC 0x59423047 /* Nikolskoye */ +#define IO_URING_FILE_MAGIC 0x55403656 /* Butyn */ +#define IO_URING_DATA_MAGIC 0x54194822 /* Ulyanovsk */ #define IFADDR_MAGIC RAW_IMAGE_MAGIC #define ROUTE_MAGIC RAW_IMAGE_MAGIC diff --git a/criu/include/protobuf-desc.h b/criu/include/protobuf-desc.h index 3824de101f3..dc4634978ec 100644 --- a/criu/include/protobuf-desc.h +++ b/criu/include/protobuf-desc.h @@ -70,6 +70,8 @@ enum { PB_BPFMAP_FILE, PB_BPFMAP_DATA, PB_APPARMOR, + PB_IO_URING_FILE, + PB_IO_URING_DATA, /* PB_AUTOGEN_STOP */ diff --git a/criu/include/vma.h b/criu/include/vma.h index ed9f31ef67f..3a259da8e13 100644 --- a/criu/include/vma.h +++ b/criu/include/vma.h @@ -35,7 +35,10 @@ struct vma_area { union { struct /* for dump */ { - int vm_socket_id; + union { + int vm_socket_id; + int io_uring_id; + }; char *aufs_rpath; /* path from aufs root */ char *aufs_fpath; /* full path from global root */ diff --git a/criu/io_uring.c b/criu/io_uring.c new file mode 100644 index 00000000000..906e8f4ac56 --- /dev/null +++ b/criu/io_uring.c @@ -0,0 +1,1036 @@ +#include <assert.h> +#include <errno.h> +#include <grp.h> +#include <pwd.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/fsuid.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/wait.h> + +#include "util.h" +#include "bitmap.h" +#include "fdinfo.h" +#include "imgset.h" +#include "string.h" +#include "file-ids.h" +#include "io_uring.h" +#include "protobuf.h" +#include "common/list.h" + +#include <linux/io_uring.h> + +#define CTX_F_SEEN_SQE (1UL << 0) /* SQE ring mapped */ +#define CTX_F_SEEN_SQE_ARR (1UL << 1) /* SQE array mapped */ +#define CTX_F_SEEN_CQE (1UL << 2) /* CQE ring mapped */ +#define CTX_F_SEEN_RINGS (CTX_F_SEEN_SQE | CTX_F_SEEN_SQE_ARR | CTX_F_SEEN_CQE) +#define CTX_F_SINGLE_MMAP (1UL << 3) /* SQE/CQE ring are in single mapping */ +#define CTX_F_DONE_FILE (1UL << 4) /* File dump done */ +#define CTX_F_DONE_DATA (1UL << 5) /* Data dump done */ +#define CTX_F_DONE_ALL (CTX_F_DONE_FILE | CTX_F_DONE_DATA) +#define CTX_F_INIT_IOFE (1UL
<< 6) /* Iofe set for ctx */ + +#define atomic_load_relaxed(x) __atomic_load_n((x), __ATOMIC_RELAXED) +#define atomic_load_acquire(x) __atomic_load_n((x), __ATOMIC_ACQUIRE) +#define atomic_store_release(x, val) __atomic_store_n((x), (val), __ATOMIC_RELEASE) + +#define IO_URING_HASH_TABLE_BITS 5 +#define IO_URING_HASH_TABLE_MAX (1UL << IO_URING_HASH_TABLE_BITS) +#define IO_URING_HASH_TABLE_MASK (IO_URING_HASH_TABLE_MAX - 1) + +#ifndef IORING_FEAT_SQPOLL_NONFIXED +#define IORING_FEAT_SQPOLL_NONFIXED (1U << 7) +#endif + +struct io_uring_map { + struct io_uring_sqe *sqe; + struct io_uring_cqe *cqe; + unsigned int *sq_array; + unsigned int *sq_ring_tail; + unsigned int *sq_ring_head; + unsigned int *cqe_ring_head; + unsigned int *cqe_ring_tail; + unsigned int *sq_ring_mask; + unsigned int *cqe_ring_mask; + size_t sq_len; + size_t sqe_len; + size_t cqe_len; +}; + +struct io_uring_buf { + struct list_head list; + unsigned int idx; + long long unsigned int address; + unsigned int len; +}; + +/* We store uid name in image to avoid mismatch on restore which could turn into + * a potential security risk, as user name may not match for the same UID and + * user may end up exposing resources to other users unintentionally. + */ +struct io_uring_personality { + struct list_head list; + struct io_uring_personality_desc desc; + char uid_name[32]; + char euid_name[32]; + char suid_name[32]; + char fsuid_name[32]; + char gid_name[32]; + char egid_name[32]; + char sgid_name[32]; + char fsgid_name[32]; +}; + +struct io_uring_ctx { + struct io_uring_ctx *next; + ino_t inode; + u32 id; + u32 state; + union { + struct { + IoUringFileEntry iofe; + struct io_uring_map map; + + struct list_head buf_list; + struct list_head pers_list; + size_t nr_pers; + } dump; + struct { + void *data; + size_t sqe_bytes; + size_t cqe_bytes; + size_t sq_arr_bytes; + } restore; + }; +}; + +static struct io_uring_ctx *ctx_hash_table[IO_URING_HASH_TABLE_MAX]; + +static struct io_uring_ctx *alloc_ctx(void) +{ + struct io_uring_ctx *ctx; + + ctx = xzalloc(sizeof(*ctx)); + if (!ctx) + return NULL; + + INIT_LIST_HEAD(&ctx->dump.buf_list); + INIT_LIST_HEAD(&ctx->dump.pers_list); + + return ctx; +} + +static struct io_uring_ctx *lookup_ctx(ino_t inode) +{ + struct io_uring_ctx *ctx; + + ctx = ctx_hash_table[inode & IO_URING_HASH_TABLE_MASK]; + for (; ctx; ctx = ctx->next) { + if (ctx->inode == inode) + break; + } + + return ctx; +} + +static void insert_ctx(ino_t inode, struct io_uring_ctx *ctx) +{ + struct io_uring_ctx **slot; + + slot = &ctx_hash_table[inode & IO_URING_HASH_TABLE_MASK]; + ctx->next = *slot; + *slot = ctx; +} + +static uint64_t offset_to_state(uint64_t offset) +{ + switch (offset) { + case IORING_OFF_SQ_RING: + return CTX_F_SEEN_SQE; + case IORING_OFF_CQ_RING: + return CTX_F_SEEN_CQE; + case IORING_OFF_SQES: + return CTX_F_SEEN_SQE_ARR; + default: + return 0; + } +} + +static const char *offset_to_str(uint64_t offset) +{ + switch (offset) { + case IORING_OFF_SQ_RING: + return "IORING_OFF_SQ_RING"; + case IORING_OFF_CQ_RING: + return "IORING_OFF_CQ_RING"; + case IORING_OFF_SQES: + return "IORING_OFF_SQES"; + default: + return "Unknown"; + } +} + +int io_uring_push_buf(struct io_uring_ctx *ctx, unsigned int idx, long long unsigned int address, unsigned int len) +{ + struct io_uring_buf *buf; + + buf = xzalloc(sizeof(*buf)); + if (!buf) + return -ENOMEM; + + buf->idx = idx; + buf->address = address; + buf->len = len; + list_add_tail(&buf->list, &ctx->dump.buf_list); + + return 0; +} + +int io_uring_push_personality(struct 
io_uring_ctx *ctx, struct io_uring_personality_desc *desc) +{ + struct io_uring_personality *p; + struct io_uring_group_desc *g; + struct passwd *pwd; + struct group *grp; + int grps = 0; + + p = xzalloc(sizeof(*p)); + if (!p) + return -ENOMEM; + INIT_LIST_HEAD(&p->list); + + p->desc = *desc; + INIT_LIST_HEAD(&p->desc.group_list); + +#define X(ptr, sub) \ + pwd = getpwuid(desc->sub); \ + if (pwd) \ + strlcpy(ptr->sub##_name, pwd->pw_name, sizeof(ptr->sub##_name)); + X(p, uid); + X(p, euid); + X(p, suid); + X(p, fsuid); +#undef X +#define X(ptr, sub) \ + grp = getgrgid(desc->sub); \ + if (grp) \ + strlcpy(ptr->sub##_name, grp->gr_name, sizeof(ptr->sub##_name)); + X(p, gid); + X(p, egid); + X(p, sgid); + X(p, fsgid); +#undef X + + list_for_each_entry(g, &desc->group_list, list) { + grp = getgrgid(g->group); + if (grp) + strlcpy(g->group_name, grp->gr_name, sizeof(g->group_name)); + grps++; + } + BUG_ON(grps != desc->nr_groups); + + /* Migrate prepared group list from local desc to personality object */ + list_splice(&desc->group_list, &p->desc.group_list); + + /* ... and append personality object to ctx personality list */ + list_add_tail(&p->list, &ctx->dump.pers_list); + ctx->dump.nr_pers++; + return 0; +} + +IoUringFileEntry *io_uring_get_iofe(struct io_uring_ctx *ctx) +{ + return &ctx->dump.iofe; +} + +/* + * TODO: + * Handle IORING_REGISTER_BUFFERS + * Handle IORING_REGISTER_FILES + * Handle IORING_REGISTER_EVENTFD_{ASYNC} + * + * Handle wq_fd registration + * * Compare in-kernel ctx->sq_data to associate with open fd + * Audit memory cleanup after error at various places + */ + +static int sys_io_uring_setup(unsigned int entries, struct io_uring_params *p) +{ + return (int)syscall(__NR_io_uring_setup, entries, p); +} + +/* XXX: We can expose timeout here to not block indefinitely when trying to sync + * io_uring fd during dump stage, in case forward progress depends on one + * of the stopped threads. + */ +static int sys_io_uring_enter(int ring_fd, unsigned int to_submit, unsigned int min_complete, unsigned int flags) +{ + return (int)syscall(__NR_io_uring_enter, ring_fd, to_submit, min_complete, flags, NULL, 0); +} + +static int sys_io_uring_register(int ring_fd, unsigned int opcode, void *arg, unsigned int nr_args) +{ + return (int)syscall(__NR_io_uring_register, ring_fd, opcode, arg, nr_args); +} + +static int io_uring_restore_personality(int fd, IoUringPersonalityId *pers_id) +{ + struct cap_data data[_LINUX_CAPABILITY_U32S_3] = {}; + struct cap_header hdr; + pid_t pid; + int ret; + + /* Fork a child to manipulate credentials and register the personality */ + pid = fork(); + if (pid < 0) { + return -errno; + } else if (pid) { + if (waitpid(pid, &ret, 0) < 0) + return -errno; + /* The child exits with errno as its status on failure */ + return -WEXITSTATUS(ret); + } else { + /* capset() takes the low 32 bits of the effective set in + * data[0] and the high 32 bits in data[1] + */ + u32 cap[2] = { + pers_id->cap_eff & 0xffffffff, + pers_id->cap_eff >> 32, + }; + size_t n_grps = 0, sz = 32; + struct passwd *pwd; + bool group = false; + struct group *grp; + gid_t *groups; + +#define X(c, m, x) \ + if (c) { \ + if (strcmp(c->m##_name, pers_id->x##_name)) \ + pr_warn("User name from image and system do not match for %s %d\n", group ? "GID" : "UID", \ + pers_id->x); \ + } else { \ + pr_warn("No user for %s %d on system\n", group ?
"GID" : "UID", pers_id->x); \ + } + pwd = getpwuid(pers_id->uid); + X(pwd, pw, uid); + pwd = getpwuid(pers_id->euid); + X(pwd, pw, euid); + pwd = getpwuid(pers_id->suid); + X(pwd, pw, suid); + pwd = getpwuid(pers_id->fsuid); + X(pwd, pw, fsuid); + + group = true; + + grp = getgrgid(pers_id->gid); + X(grp, gr, gid); + grp = getgrgid(pers_id->egid); + X(grp, gr, egid); + grp = getgrgid(pers_id->sgid); + X(grp, gr, sgid); + grp = getgrgid(pers_id->fsgid); + X(grp, gr, fsgid); +#undef X + + ret = setresuid(pers_id->uid, pers_id->euid, pers_id->suid); + if (ret < 0) + goto end; + ret = setfsuid(pers_id->fsuid); + if (ret < 0) + goto end; + ret = setresgid(pers_id->gid, pers_id->euid, pers_id->suid); + if (ret < 0) + goto end; + ret = setfsgid(pers_id->fsgid); + if (ret < 0) + goto end; + + groups = xmalloc(sz * sizeof(*groups)); + if (!groups) { + errno = ENOMEM; + goto end; + } + + for (int i = 0; i < pers_id->n_group_id; i++) { + IoUringGroupId *gd = pers_id->group_id[i]; + struct group *grp; + gid_t *g; + + grp = getgrgid(gd->group); + if (!grp) + pr_warn("Group name not found for GID %d\n", gd->group); + if (strcmp(gd->group_name, grp->gr_name)) + pr_warn("Group name in image and on system do not match for GID %d\n", gd->group); + + if (sz <= n_grps) { + sz *= 2; + g = xrealloc(groups, sz * sizeof(*g)); + if (!g) { + xfree(groups); + errno = ENOMEM; + goto end; + } + groups = g; + } + groups[n_grps++] = gd->group; + } + + ret = setgroups(n_grps, groups); + xfree(groups); + if (ret < 0) { + errno = -ret; + goto end; + } + + hdr.version = _LINUX_CAPABILITY_VERSION_3; + hdr.pid = 0; + BUILD_BUG_ON(_LINUX_CAPABILITY_U32S_3 != CR_CAP_SIZE); + + for (int i = 0; i < CR_CAP_SIZE; i++) + data[i].eff = cap[i]; + + ret = syscall(__NR_capset, &hdr, data); + if (ret < 0) { + errno = -ret; + goto end; + } + + ret = sys_io_uring_register(fd, __IORING_REGISTER_PERSONALITY, NULL, 0); + if (ret < 0) { + errno = -ret; + goto end; + } + + exit(0); + end: + exit(errno); + } else { + return -errno; + } + + return 0; +} + +int is_io_uring_link(char *link) +{ + return is_anon_link_type(link, "[io_uring]"); +} + +static void io_uring_submit_nop(struct io_uring_map *map, bool barrier) +{ + unsigned int tail, index; + + BUG_ON(!map); + + tail = atomic_load_acquire(map->sq_ring_tail); + index = tail & *map->sq_ring_mask; + map->sqe[index].opcode = IORING_OP_NOP; + if (barrier) + map->sqe[index].flags = IOSQE_IO_DRAIN; + map->sq_array[index] = index; + atomic_store_release(map->sq_ring_tail, tail + 1); +} + +static int io_uring_consume_n(struct io_uring_map *map, int n) +{ + unsigned int head; + int ret; + + BUG_ON(!map); + + head = *map->cqe_ring_head; + ret = map->cqe[head & *map->cqe_ring_mask].res; + atomic_store_release(map->cqe_ring_head, head + n); + + return ret; +} + +static void io_uring_consume_all(struct io_uring_map *map) +{ + BUG_ON(!map); + + (void)io_uring_consume_n(map, atomic_load_acquire(map->cqe_ring_tail) - *map->cqe_ring_head); +} + +static int map_io_uring_fd(int fd, struct io_uring_params *p, struct io_uring_map *map) +{ + int ret = 0; + + BUG_ON(!p); + BUG_ON(!map); + + /* XXX: Optimize using FEAT_SINGLE_MMAP */ + map->sq_len = p->sq_off.array + p->sq_entries * sizeof(unsigned int); + map->cqe_len = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe); + map->sqe_len = p->sq_entries * sizeof(struct io_uring_sqe); + + map->sq_array = + mmap(NULL, map->sq_len, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING); + if (map->sq_array == MAP_FAILED) { + ret = -errno; + 
pr_perror("Failed to mmap SQ array ring"); + goto end; + } + + map->cqe = mmap(NULL, map->cqe_len, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING); + if (map->cqe == MAP_FAILED) { + ret = -errno; + pr_perror("Failed to mmap CQE ring"); + goto end_sq_ptr; + } + + map->sq_ring_head = map->sq_array + p->sq_off.head; + map->sq_ring_tail = map->sq_array + p->sq_off.tail; + map->cqe_ring_head = (unsigned int *)map->cqe + p->cq_off.head; + map->cqe_ring_tail = (unsigned int *)map->cqe + p->cq_off.tail; + map->sq_ring_mask = map->sq_array + p->sq_off.ring_mask; + map->cqe_ring_mask = (unsigned int *)map->cqe + p->cq_off.ring_mask; + map->sq_array += p->sq_off.array; + + map->sqe = mmap(NULL, map->sqe_len, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES); + if (map->sqe == MAP_FAILED) { + ret = -errno; + pr_perror("Failed to mmap SQE ring"); + goto end_cqe_ptr; + } + + return ret; + + munmap(map->sqe, map->sqe_len); +end_cqe_ptr: + munmap(map->cqe, map->cqe_len); +end_sq_ptr: + munmap(map->sq_array, map->sq_len); +end: + return ret; +} + +static void unmap_io_uring_fd(struct io_uring_map *map) +{ + BUG_ON(!map); + BUG_ON(!map->sqe); + BUG_ON(!map->cqe); + BUG_ON(!map->sq_array); + + munmap(map->sqe, map->sqe_len); + munmap(map->cqe, map->cqe_len); + munmap(map->sq_array, map->sq_len); +} + +int io_uring_synchronize_fd(int fd) +{ + struct io_uring_map map = {}; + struct io_uring_params p; + struct io_uring_ctx *ctx; + unsigned int rem; + struct stat st; + bool sq_poll; + int ret; + + if (fd < 0) + return fd; + + if (fstat(fd, &st)) + return -errno; + + ctx = lookup_ctx(st.st_ino); + if (!ctx) + return -ENOENT; + + assert("File Entry must be unitialized" && !(ctx->state & CTX_F_INIT_IOFE)); + /* Obtains sq_off.array, while the rest are offsets we can get from a + * io_uring_setup call. Also caches this in ctx so that we don't have to + * parse once again. 
+ */ + if (parse_fdinfo(fd, FD_TYPES__IO_URING, ctx)) + return -EINVAL; + ctx->state |= CTX_F_INIT_IOFE; + + sq_poll = ctx->dump.iofe.setup_flags & IORING_SETUP_SQPOLL; + + memset(&p, 0, sizeof(p)); + ret = sys_io_uring_setup(1, &p); + if (ret < 0) + return -errno; + close(ret); + + p.sq_off.array = ctx->dump.iofe.sq_off_array; + p.sq_entries = ctx->dump.iofe.sq_entries; + p.cq_entries = ctx->dump.iofe.cq_entries; + + ret = map_io_uring_fd(fd, &p, &map); + if (ret < 0) + return ret; + + /* Preserve head/tail and the ring masks */ + ctx->dump.iofe.sq_head = atomic_load_acquire(map.sq_ring_head); + ctx->dump.iofe.sq_tail = *map.sq_ring_tail; + ctx->dump.iofe.cqe_head = *map.cqe_ring_head; + ctx->dump.iofe.sq_ring_mask = *map.sq_ring_mask; + ctx->dump.iofe.cq_ring_mask = *map.cqe_ring_mask; + + io_uring_consume_all(&map); + + rem = ctx->dump.iofe.sq_tail - ctx->dump.iofe.sq_head; + /* XXX: Add timeout to gracefully handle indefinite blocking */ + ret = sys_io_uring_enter(fd, rem, rem, IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP); + if (ret < 0) { + ret = -errno; + pr_perror("Failed to call io_uring_enter"); + } + + ctx->dump.iofe.cqe_tail = atomic_load_acquire(map.cqe_ring_tail); + if (sq_poll) + ctx->dump.iofe.sq_head = ctx->dump.iofe.sq_tail; + + ctx->dump.map = map; + return ret; +} + +static int replay_io_uring_data(int fd, struct io_uring_ctx *ctx, struct io_uring_params *p, IoUringFileEntry *iofe) +{ + unsigned int nop_count, cons_count; + struct io_uring_map map; + int ret = 0, flags = 0; + void *data; + + BUG_ON(!ctx); + BUG_ON(!p); + BUG_ON(!iofe); + BUG_ON(p->sq_entries != ctx->restore.sqe_bytes / sizeof(struct io_uring_sqe)); + BUG_ON(p->cq_entries != ctx->restore.cqe_bytes / sizeof(struct io_uring_cqe)); + BUG_ON(p->sq_entries != ctx->restore.sq_arr_bytes / sizeof(unsigned int)); + + /* To replay the data, we first need to advance the head and tail to the + * values they had when the io_uring instance was dumped. At the ABI + * level the request and completion structures have the same size for all + * operations, so submitting IORING_OP_NOP operations and reaping them + * adjusts the kernel's offsets, after which we overwrite the ring with + * the data we dumped in the image.
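 + * + * For example, an instance dumped with sq_head == 5 and cqe_tail == 3 + * (after masking) is replayed by queueing five NOPs, reaping three of + * the resulting completions, and then copying the saved SQE/CQE/SQ-array + * contents over the freshly mapped rings, so the indices the application + * last observed line up again.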
+ */ + if (p->flags & IORING_SETUP_SQPOLL) + flags |= IORING_ENTER_SQ_WAKEUP; + + ret = map_io_uring_fd(fd, p, &map); + if (ret < 0) + return ret; + + nop_count = iofe->sq_head & iofe->sq_ring_mask; + cons_count = iofe->cqe_tail & iofe->cq_ring_mask; + + for (int i = 0; i < nop_count; i++) + io_uring_submit_nop(&map, false); + + ret = sys_io_uring_enter(fd, nop_count, nop_count, IORING_ENTER_GETEVENTS | flags); + if (ret < 0) { + pr_perror("Failed to call io_uring_enter"); + goto end; + } + + io_uring_consume_n(&map, cons_count); + + data = ctx->restore.data; + memcpy(map.sqe, data, ctx->restore.sqe_bytes); + data += ctx->restore.sqe_bytes; + memcpy(map.cqe, data, ctx->restore.cqe_bytes); + data += ctx->restore.cqe_bytes; + memcpy(map.sq_array, data, ctx->restore.sq_arr_bytes); + +end: + xfree(ctx->restore.data); + unmap_io_uring_fd(&map); + return ret; +} + +static int dump_one_io_uring_data(struct io_uring_ctx *ctx, IoUringFileEntry *iofe, int lfd, const struct fd_parms *p) +{ + IoUringDataEntry iode = IO_URING_DATA_ENTRY__INIT; + struct io_uring_map *map; + struct cr_img *img; + int ret; + + map = &ctx->dump.map; + + BUG_ON(!map->sqe); + BUG_ON(!map->cqe); + BUG_ON(!map->sq_array); + + img = img_from_set(glob_imgset, CR_FD_IO_URING_DATA); + BUG_ON(ctx->state & CTX_F_DONE_DATA); + + /* Key the data entry by the file entry id, so that the restorer's + * collect stage can look the ctx up again + */ + iode.id = ctx->id; + iode.sqe_bytes = sizeof(struct io_uring_sqe) * ctx->dump.iofe.sq_entries; + iode.cqe_bytes = sizeof(struct io_uring_cqe) * ctx->dump.iofe.cq_entries; + iode.sq_arr_bytes = sizeof(unsigned int) * ctx->dump.iofe.sq_entries; + + ret = -1; + if (pb_write_one(img, &iode, PB_IO_URING_DATA)) + goto end; + + /* Layout |SQE|CQE|SQARR| */ + if (write(img_raw_fd(img), map->sqe, iode.sqe_bytes) != iode.sqe_bytes) + goto end; + if (write(img_raw_fd(img), map->cqe, iode.cqe_bytes) != iode.cqe_bytes) + goto end; + if (write(img_raw_fd(img), map->sq_array, iode.sq_arr_bytes) != iode.sq_arr_bytes) + goto end; + + ret = 0; + ctx->state |= CTX_F_DONE_DATA; +end: + unmap_io_uring_fd(map); + return ret; +} + +static int dump_one_io_uring(int lfd, u32 id, const struct fd_parms *p) +{ + IoUringFileEntry iofe = IO_URING_FILE_ENTRY__INIT; + struct io_uring_personality *per_i, *ptmp; + struct io_uring_buf *buf_i, *btmp; + FileEntry fe = FILE_ENTRY__INIT; + struct io_uring_ctx *ctx; + int i = 0, j = 0; + + ctx = lookup_ctx(p->stat.st_ino); + if (!ctx) + return -ENOENT; + + BUG_ON(!(ctx->state & CTX_F_INIT_IOFE)); + BUG_ON(ctx->state & CTX_F_DONE_FILE); + + iofe.id = ctx->id = id; + iofe.inode = ctx->inode; + iofe.flags = p->flags; + iofe.fown = (FownEntry *)&p->fown; + + fe.type = FD_TYPES__IO_URING; + fe.id = iofe.id; + fe.io_uring = &iofe; + + list_for_each_entry_safe(buf_i, btmp, &ctx->dump.buf_list, list) { + /* XXX: match struct page address for buf_i->idx from eBPF + * iterator output + */ + xfree(buf_i); + } + + BUG_ON(!list_empty(&ctx->dump.pers_list) && !ctx->dump.nr_pers); + ctx->dump.iofe.n_pers_id = ctx->dump.nr_pers; + ctx->dump.iofe.pers_id = xzalloc(pb_repeated_size(&ctx->dump.iofe, pers_id)); + if (!ctx->dump.iofe.pers_id) + return -ENOMEM; + + list_for_each_entry_safe(per_i, ptmp, &ctx->dump.pers_list, list) { + struct io_uring_group_desc *grp_i, *gtmp; + IoUringPersonalityId *pers_id; + + BUG_ON(i + 1 != per_i->desc.id); + ctx->dump.iofe.pers_id[i] = xzalloc(sizeof(*ctx->dump.iofe.pers_id[i])); + if (!ctx->dump.iofe.pers_id[i]) + return -ENOMEM; + + pers_id = ctx->dump.iofe.pers_id[i]; + +#define X(x) pers_id->x = per_i->desc.x; + X(uid); + X(euid); + X(suid); + X(fsuid); + X(gid); + X(egid); +
X(sgid); + X(fsgid); +#undef X + +#define X(x) \ + pers_id->x##_name = xstrdup(per_i->x##_name); \ + if (!pers_id->x##_name) \ + return -ENOMEM; + X(uid); + X(euid); + X(suid); + X(fsuid); + X(gid); + X(egid); + X(sgid); + X(fsgid); +#undef X + memcpy(&pers_id->cap_eff, per_i->desc.cap_eff, sizeof(per_i->desc.cap_eff)); + BUG_ON(!list_empty(&per_i->desc.group_list) && !per_i->desc.nr_groups); + pers_id->n_group_id = per_i->desc.nr_groups; + pers_id->group_id = xzalloc(pb_repeated_size(pers_id, group_id)); + if (!pers_id->group_id) + return -ENOMEM; + /* Now, iterate over group list for personality, and dump each + * group ID and group name + */ + j = 0; + list_for_each_entry_safe(grp_i, gtmp, &per_i->desc.group_list, list) { + pers_id->group_id[j] = xzalloc(sizeof(*pers_id->group_id[j])); + if (!pers_id->group_id[j]) + return -ENOMEM; + pers_id->group_id[j]->group = grp_i->group; + pers_id->group_id[j]->group_name = xstrdup(grp_i->group_name); + if (!pers_id->group_id[j]->group_name) + return -ENOMEM; + j++; + xfree(grp_i); + } + BUG_ON(j != per_i->desc.nr_groups); + i++; + xfree(per_i); + } + BUG_ON(i != ctx->dump.nr_pers); + + if (pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE)) + return -1; + ctx->state |= CTX_F_DONE_FILE; + + return dump_one_io_uring_data(ctx, &iofe, lfd, p); +} + +const struct fdtype_ops io_uring_dump_ops = { + .type = FD_TYPES__IO_URING, + .dump = dump_one_io_uring, +}; + +static int open_io_uring_desc(struct file_desc *d, int *new_fd) +{ + struct __io_uring_restriction res[4]; + struct io_uring_file_info *iofi; + struct io_uring_ctx *ctx; + struct io_uring_params p; + IoUringFileEntry *iofe; + int fd, ret = -1; + + iofi = container_of(d, struct io_uring_file_info, d); + iofe = iofi->iofe; + + /* XXX: when we handle IORING_REGISTER_FILES, and wq_fd registration, + * handle post_open processing here to re-register files... + * + * For wq_fd, there is a parent io_uring fd that will be restored first + * (without any other dependencies on io_uring instances). Cycles cannot + * be created as io_uring won't allow IORING_REGISTER_FILES for another + * io_uring, so we cannot deadlock, and wq_fd registration won't be + * circular either. wq_fd is determined using ctx->sq_data matching in + * eBPF iteration. 
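 + * + * E.g. if ring B was created with IORING_SETUP_ATTACH_WQ on ring A's fd, + * A has no io_uring dependency of its own and is opened first; B can + * then be set up against A's already-restored fd.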
+ */ + ctx = lookup_ctx(iofe->id); + if (!ctx) + return -ENOENT; + + memset(&p, 0, sizeof(p)); + p.sq_thread_cpu = iofe->sq_thread_cpu; + p.sq_thread_idle = iofe->sq_thread_idle; + p.cq_entries = iofe->cq_entries; + p.flags = iofe->setup_flags | IORING_SETUP_CQSIZE; + + if (iofe->restrictions) + p.flags |= IORING_SETUP_R_DISABLED; + + fd = sys_io_uring_setup(iofe->sq_entries, &p); + if (fd < 0) + return -errno; + + for (int i = 0; i < iofe->n_pers_id; i++) { + IoUringPersonalityId *pers_id = iofe->pers_id[i]; + + ret = io_uring_restore_personality(fd, pers_id); + if (ret < 0) + goto end; + } + + if (iofe->restrictions) { + int nr = 0; + + if (iofe->reg_op) { + res[nr].opcode = __IORING_RESTRICTION_REGISTER_OP; + res[nr++].register_op = iofe->reg_op; + } + + if (iofe->sqe_op) { + res[nr].opcode = __IORING_RESTRICTION_SQE_OP; + res[nr++].sqe_op = iofe->sqe_op; + } + + if (iofe->sqe_flags_allowed) { + res[nr].opcode = __IORING_RESTRICTION_SQE_FLAGS_ALLOWED; + res[nr++].sqe_flags = iofe->sqe_flags_allowed; + } + + if (iofe->sqe_flags_required) { + res[nr].opcode = __IORING_RESTRICTION_SQE_FLAGS_REQUIRED; + res[nr++].sqe_flags = iofe->sqe_flags_required; + } + + BUG_ON(nr > ARRAY_SIZE(res)); + if (nr) { + ret = sys_io_uring_register(fd, __IORING_REGISTER_RESTRICTIONS, res, nr); + if (ret < 0) { + ret = -errno; + goto end; + } + } + + ret = sys_io_uring_register(fd, __IORING_REGISTER_ENABLE_RINGS, NULL, 0); + if (ret < 0) { + ret = -errno; + goto end; + } + } + + if ((p.flags & IORING_SETUP_SQPOLL) && !iofe->nr_user_files && !(p.features & IORING_FEAT_SQPOLL_NONFIXED)) { + ret = -ENOTSUP; + pr_err("Dumped io_uring instance %#08x has the IORING_SETUP_SQPOLL flag but no registered files,\n" + "and this system does not support SQPOLL in that mode, as the IORING_FEAT_SQPOLL_NONFIXED\n" + "feature is missing\n", + iofe->id); + goto end; + } + + if (rst_file_params(fd, iofe->fown, iofi->iofe->flags)) { + pr_perror("Can't restore file params on io_uring %#08x", iofe->id); + goto end; + } + + ret = replay_io_uring_data(fd, ctx, &p, iofe); + if (ret < 0) + goto end; + + *new_fd = fd; + + return 0; +end: + close(fd); + return ret; +} + +static struct file_desc_ops io_uring_desc_ops = { + .type = FD_TYPES__IO_URING, + .open = open_io_uring_desc, +}; + +static int collect_one_io_uring(void *o, ProtobufCMessage *base, struct cr_img *i) +{ + struct io_uring_file_info *iofi = o; + struct io_uring_ctx *ctx; + + ctx = alloc_ctx(); + if (!ctx) + return -ENOMEM; + + iofi->iofe = pb_msg(base, IoUringFileEntry); + ctx->inode = iofi->iofe->id; + insert_ctx(iofi->iofe->id, ctx); + return file_desc_add(&iofi->d, iofi->iofe->id, &io_uring_desc_ops); +} + +struct collect_image_info io_uring_cinfo = { + .fd_type = CR_FD_IO_URING_FILE, + .pb_type = PB_IO_URING_FILE, + .priv_size = sizeof(struct io_uring_file_info), + .collect = collect_one_io_uring, +}; + +static int collect_one_io_uring_data(void *o, ProtobufCMessage *base, struct cr_img *i) +{ + struct io_uring_data_info *iodi = o; + struct io_uring_ctx *ctx; + size_t bytes; + + iodi->iode = pb_msg(base, IoUringDataEntry); + + ctx = lookup_ctx(iodi->iode->id); + if (!ctx) { + /* Should have been inserted by the file collect stage */ + pr_err("Failed to find io_uring restore ctx for id %#08lx\n", (unsigned long)iodi->iode->id); + return -ENOENT; + } + + bytes = iodi->iode->sqe_bytes + iodi->iode->cqe_bytes + iodi->iode->sq_arr_bytes; + ctx->restore.data = xmalloc(bytes); + if (!ctx->restore.data) + return -ENOMEM; + + return read_img_buf(i, ctx->restore.data, bytes); +} + +struct collect_image_info io_uring_data_cinfo = { +
.fd_type = CR_FD_IO_URING_DATA, + .pb_type = PB_IO_URING_DATA, + .priv_size = sizeof(struct io_uring_data_info), + .collect = collect_one_io_uring_data, +}; + +static int open_io_uring_map(int pid, struct vma_area *vma) +{ + struct fdinfo_list_entry *fle; + VmaEntry *vme = vma->e; + struct file_desc *d; + + d = find_file_desc_raw(FD_TYPES__IO_URING, vme->shmid); + if (!d) + return -1; + + list_for_each_entry(fle, &d->fd_info_head, desc_list) { + if (fle->pid == pid) { + int fd; + + fd = dup(fle->fe->fd); + if (fd < 0) + return -errno; + + vme->fd = fd; + return 0; + } + } + + return -ENOENT; +} + +int collect_io_uring_map(struct vma_area *vma) +{ + vma->vm_open = open_io_uring_map; + return 0; +} + +int dump_io_uring_map(struct vma_area *vma) +{ + struct io_uring_ctx *ctx; + + ctx = lookup_ctx(vma->io_uring_id); + if (!ctx) + return -ENOENT; + + if ((ctx->state & CTX_F_DONE_ALL) != CTX_F_DONE_ALL) { + pr_err("Mapping(s) found for io_uring but no fd open, cannot dump " + "io_uring instance without access to the io_uring fd corresponding " + "to the mapping\n"); + return -ENOTSUP; + } + + /* Use the file entry id as the key the restorer will look up */ + vma->e->shmid = ctx->id; + return 0; +} + +int add_one_io_uring_mapping(uint64_t offset, ino_t inode) +{ + struct io_uring_ctx *ctx; + uint64_t flag; + + pr_debug("Processing io_uring mapping at offset=%s\n", offset_to_str(offset)); + flag = offset_to_state(offset); + if (!flag) { + pr_err("Invalid io_uring mapping offset %" PRIu64 "\n", offset); + return -EINVAL; + } + + ctx = lookup_ctx(inode); + if (!ctx) { + pr_debug("No io_uring ctx associated with inode=%lu, creating one...\n", (unsigned long)inode); + + ctx = alloc_ctx(); + if (!ctx) + return -ENOMEM; + + ctx->inode = inode; + insert_ctx(ctx->inode, ctx); + } + + ctx->state |= flag; + return 0; +} diff --git a/criu/mem.c b/criu/mem.c index ca74bfbb655..7a1f3555217 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -31,6 +31,9 @@ #include "prctl.h" #include "compel/infect-util.h" #include "pidfd-store.h" +#include "compel/plugins/std/syscall-codes.h" +#include "common/scm.h" +#include "io_uring.h" #include "protobuf.h" #include "images/pagemap.pb-c.h" @@ -125,6 +128,8 @@ bool should_dump_page(VmaEntry *vmae, u64 pme) return false; if (vma_entry_is(vmae, VMA_AREA_AIORING)) return true; + if (vma_entry_is(vmae, VMA_AREA_IO_URING)) + return false; if ((pme & (PME_PRESENT | PME_SWAP)) && !__page_is_zero(pme)) return true; @@ -704,6 +709,8 @@ int prepare_mm_pid(struct pstree_item *i) ret = collect_filemap(vma); else if (vma_area_is(vma, VMA_AREA_SOCKET)) ret = collect_socket_map(vma); + else if (vma_area_is(vma, VMA_AREA_IO_URING)) + ret = collect_io_uring_map(vma); else ret = 0; if (ret) diff --git a/criu/proc_parse.c b/criu/proc_parse.c index f3491e78175..b343209da9e 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include "types.h" @@ -41,6 +42,7 @@ #include "path.h" #include "fault-injection.h" #include "memfd.h" +#include "io_uring.h" #include "protobuf.h" #include "images/fdinfo.pb-c.h" @@ -75,7 +77,8 @@ static char *buf = __buf.buf; * This is how AIO ring buffers look like in proc */ -#define AIO_FNAME "/[aio]" +#define AIO_FNAME "/[aio]" +#define IO_URING_FNAME "anon_inode:[io_uring]" /* check the @line starts with "%lx-%lx" format */ static bool __is_vma_range_fmt(char *line) @@ -171,7 +174,8 @@ static void parse_vma_vmflags(char *buf, struct vma_area *vma_area) * only exception is VVAR area that mapped by the kernel as * VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP */ - if (io_pf &&
!vma_area_is(vma_area, VMA_AREA_VVAR) && !vma_entry_is(vma_area->e, VMA_FILE_SHARED) + if (io_pf && !vma_area_is(vma_area, VMA_AREA_IO_URING) && !vma_area_is(vma_area, VMA_AREA_VVAR) && + !vma_entry_is(vma_area->e, VMA_FILE_SHARED)) vma_area->e->status |= VMA_UNSUPP; if (vma_area->e->madv) @@ -373,14 +377,20 @@ static int vma_get_mapfile(const char *fname, struct vma_area *vma, DIR *mfd, st /* * If vfi is equal (!) and negative @vm_file_fd -- - * we have nothing to borrow for sure. + * we have nothing to borrow for sure, unless it's io_uring */ - if (*vm_file_fd < 0) + if (*vm_file_fd < 0 && !vma_area_is(prev, VMA_AREA_IO_URING)) return 0; pr_debug("vma %" PRIx64 " borrows vfi from previous %" PRIx64 "\n", vma->e->start, prev->e->start); - if (prev->e->status & VMA_AREA_SOCKET) + if (prev->e->status & VMA_AREA_SOCKET) { vma->e->status |= VMA_AREA_SOCKET | VMA_AREA_REGULAR; + } else if (prev->e->status & VMA_AREA_IO_URING) { + vma->e->status |= VMA_AREA_IO_URING | VMA_AREA_REGULAR; + vma->io_uring_id = prev->io_uring_id; + /* Record this ring mapping in the io_uring ctx */ + add_one_io_uring_mapping(vma->e->pgoff, vma->io_uring_id); + } /* * FIXME -- in theory there can be vmas that have @@ -437,6 +447,16 @@ static int vma_get_mapfile(const char *fname, struct vma_area *vma, DIR *mfd, st return 0; } + if (!strncmp(fname, IO_URING_FNAME, sizeof(IO_URING_FNAME) - 1)) { + pr_debug("Marking VMA as IO_URING | REGULAR for inode %lu\n", + (unsigned long)buf.st_ino); + vma->io_uring_id = buf.st_ino; + vma->e->status |= VMA_AREA_IO_URING | VMA_AREA_REGULAR; + /* Record this ring mapping in the io_uring ctx */ + add_one_io_uring_mapping(vma->e->pgoff, vma->io_uring_id); + return 0; + } + pr_err("Unknown shit %o (%s)\n", buf.st_mode, fname); return -1; } @@ -616,6 +636,11 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, const char *file_pat */ if (vma_area->mnt_id != -1 && get_fd_mntid(*vm_file_fd, &vma_area->mnt_id)) return -1; + } else if (vma_area->e->status & VMA_AREA_IO_URING) { + if (vma_area->e->flags & MAP_PRIVATE) + vma_area->e->status |= VMA_FILE_PRIVATE; + else + vma_area->e->status |= VMA_FILE_SHARED; } else { /* * No file but mapping -- anonymous one. @@ -1708,6 +1734,261 @@ static int parse_bpfmap(struct bfd *f, char *str, BpfmapFileEntry *bpf) return 0; } +static int parse_io_uring(struct bfd *f, char *str, struct io_uring_ctx *ctx) +{ + IoUringFileEntry *iofe = io_uring_get_iofe(ctx); + unsigned int nr; + pid_t pid; + int r; + + /* + * Format is: + * + * SqThread: %d + * SqThreadCpu: %d + * UserFiles: %u (number of registered files) (OPTIONAL DATA) + * %5u: %s (idx: filename) + * UserBufs: %u (number of registered buffers) (OPTIONAL DATA) + * %5u: 0x%llx/%u (idx: 0xaddr/len) + * Personalities: (OPTIONAL HEADING and DATA) + * %5d (id) + * Uid: %llu %llu %llu %llu (uid euid suid fsuid) + * Gid: %llu %llu %llu %llu (gid egid sgid fsgid) + * Groups: %llu %llu ... %llu (groups) + * CapEff: %llx ...
%llx + * PollList: (OPTIONAL DATA) + * op=%d, task_works=%d (op=opcode, task_works=0 or 1) + * --- (Added by patch) + * Locked: %d (0 or 1) + * SqThreadIdle: %u + * SetupFlags: 0x%x + * SqEntries: %u + * CqEntries: %u + * SqOffArray: %u + * ... (OPTIONAL FIELDS) + * RestrictRegisterOp: %s (bitmap) + * RestrictSqeOp: %s (bitmap) + * RestrictSqeFlagsAllowed: %c (u8) + * RestrictSqeFlagsRequired: %c (u8) + */ + + if (sscanf(str, "SqThread: %d", &pid) != 1) + goto end; + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + if (sscanf(str, "SqThreadCpu: %d", &iofe->sq_thread_cpu) != 1) + goto end; + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + if (sscanf(str, "UserFiles: %u", &nr) != 1) + goto end; + if (nr) { + /* Not supported, yet; the file set registered with io_uring is + * collected via an eBPF iterator instead + */ + pr_warn("Registered files dump unsupported\n"); + return -ENOTSUP; + } + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + if (sscanf(str, "UserBufs: %u", &nr) != 1) + goto end; + for (int i = 0; i < nr; i++) { + long long unsigned int address; + unsigned int idx, len; + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + if (sscanf(str, "%5u: 0x%llx/%u", &idx, &address, &len) != 3) + goto end; + + if (io_uring_push_buf(ctx, idx, address, len)) + goto end; + } + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + if (!strncmp(str, "Personalities", sizeof("Personalities") - 1)) { + for (;;) { + struct io_uring_personality_desc desc = {}; + struct io_uring_group_desc *g, *gtmp; + char *tok; + int id; + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + str = str + strspn(str, " "); + if (!strncmp(str, "PollList", sizeof("PollList") - 1)) + break; + else if (sscanf(str, "%5d", &id) != 1) + goto end; + desc.id = id; + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + str = str + strspn(str, " "); + if (sscanf(str, " Uid: %u %u %u %u", &desc.uid, &desc.euid, &desc.suid, &desc.fsuid) != 4) + goto end; + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + if (sscanf(str, " Gid: %u %u %u %u", &desc.gid, &desc.egid, &desc.sgid, &desc.fsgid) != 4) + goto end; + + INIT_LIST_HEAD(&desc.group_list); + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + str = strstr(str, ":"); + if (!str) + goto end; + tok = str + 2; + while ((tok = strtok(tok, " "))) { + struct io_uring_group_desc *gdesc; + + gdesc = xzalloc(sizeof(*gdesc)); + if (!gdesc) + goto end_free; + INIT_LIST_HEAD(&gdesc->list); + + if (sscanf(tok, "%u", &gdesc->group) != 1) + goto end_free; + list_add_tail(&gdesc->list, &desc.group_list); + desc.nr_groups++; + tok = NULL; + } + + /* CapEff */ + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end_free; + str = strstr(str, ":"); + if (!str) + goto end_free; + str += 2; + if (cap_parse(str, desc.cap_eff)) + goto end_free; + + if (io_uring_push_personality(ctx, &desc)) + goto end_free; + continue; + end_free: + list_for_each_entry_safe(g, gtmp, &desc.group_list, list) + xfree(g); + goto end; + } + } + + /* Skip the PollList heading and its op entries, if any */ + for (; str; str = breadline(f)) { + if (IS_ERR(str)) + goto end; + /* Skip leading space */ + str = str + strspn(str, " "); + if (!strncmp(str, "PollList", sizeof("PollList") - 1) || !strncmp(str, "op", sizeof("op") - 1)) + continue; + else + break; + } + if (IS_ERR_OR_NULL(str)) + goto end; + + /* str obtained from above */ + if (sscanf(str, "Locked: %d", &r) != 1) + goto end; + if (!r) {
pr_err("fdinfo read for io_uring could not take ctx->uring_lock inside kernel\n" + "This indicates that the ring is not idle, hence cannot proceed\n"); + goto end; + } + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + if (sscanf(str, "SqThreadIdle: %u", &iofe->sq_thread_idle) != 1) + goto end; + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + if (sscanf(str, "SetupFlags: %u", &iofe->setup_flags) != 1) + goto end; + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + if (sscanf(str, "SqEntries: %u", &iofe->sq_entries) != 1) + goto end; + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + if (sscanf(str, "CqEntries: %u", &iofe->cq_entries) != 1) + goto end; + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + if (sscanf(str, "SqOffArray: %u", &iofe->sq_off_array) != 1) + goto end; + + /* Printing restrictions is optional */ + str = breadline(f); + if (IS_ERR(str)) + goto end; + if (!str) + return 0; + nr = 0; + /* Upper bits are unused in bitmap */ + if (sscanf(str, "RestrictRegisterOp: %x,%x", &nr, &iofe->reg_op) != 2) { + /* 32-bit long? */ + if (sscanf(str, "RestrictRegisterOp: %x", &iofe->reg_op) != 1) + goto end; + } + BUG_ON(nr); + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + if (sscanf(str, "RestrictSqeOp: %x,%x", &nr, &iofe->sqe_op) != 2) { + if (sscanf(str, "RestrictSqeOp: %x", &iofe->sqe_op) != 1) + goto end; + } + BUG_ON(nr); + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + if (sscanf(str, "RestrictSqeFlagsAllowed: 0x%x", &iofe->sqe_flags_allowed) != 1) + goto end; + + str = breadline(f); + if (IS_ERR_OR_NULL(str)) + goto end; + if (sscanf(str, "RestrictSqeFlagsRequired: 0x%x", &iofe->sqe_flags_required) != 1) + goto end; + iofe->restrictions = true; + + return 0; +end: + pr_err("Incomplete io_uring fdinfo support\n"); + return -1; +} + #define fdinfo_field(str, field) !strncmp(str, field ":", sizeof(field)) static int parse_file_lock_buf(char *buf, struct file_lock *fl, bool is_blocked); @@ -2030,6 +2311,21 @@ static int parse_fdinfo_pid_s(int pid, int fd, int type, void *arg) entry_met = true; continue; } + if (fdinfo_field(str, "ino")) { + if (type != FD_TYPES__IO_URING) + goto parse_err; + + str = breadline(&f); + if (IS_ERR_OR_NULL(str)) + goto parse_err; + + ret = parse_io_uring(&f, str, arg); + if (ret) + goto parse_err; + + entry_met = true; + continue; + } } exit_code = 0; diff --git a/criu/protobuf-desc.c b/criu/protobuf-desc.c index ff16b9f5bed..9c267de20b1 100644 --- a/criu/protobuf-desc.c +++ b/criu/protobuf-desc.c @@ -68,6 +68,7 @@ #include "images/bpfmap-file.pb-c.h" #include "images/bpfmap-data.pb-c.h" #include "images/apparmor.pb-c.h" +#include "images/io_uring.pb-c.h" struct cr_pb_message_desc cr_pb_descs[PB_MAX]; diff --git a/criu/util.c b/criu/util.c index 06124c22058..423db961dd8 100644 --- a/criu/util.c +++ b/criu/util.c @@ -187,6 +187,7 @@ static void vma_opt_str(const struct vma_area *v, char *opt) opt2s(VMA_ANON_PRIVATE, "ap"); opt2s(VMA_AREA_SYSVIPC, "sysv"); opt2s(VMA_AREA_SOCKET, "sk"); + opt2s(VMA_AREA_IO_URING, "io_uring"); #undef opt2s } diff --git a/images/Makefile b/images/Makefile index 2eaeb7cad2d..58e585ad523 100644 --- a/images/Makefile +++ b/images/Makefile @@ -71,6 +71,7 @@ proto-obj-y += img-streamer.o proto-obj-y += bpfmap-file.o proto-obj-y += bpfmap-data.o proto-obj-y += apparmor.o +proto-obj-y += io_uring.o CFLAGS += -iquote $(obj)/ diff --git a/images/fdinfo.proto b/images/fdinfo.proto index 88f1c11860f..75303154489 100644 --- 
a/images/fdinfo.proto +++ b/images/fdinfo.proto @@ -20,6 +20,7 @@ import "pipe.proto"; import "tty.proto"; import "memfd.proto"; import "bpfmap-file.proto"; +import "io_uring.proto"; enum fd_types { UND = 0; @@ -42,6 +43,7 @@ enum fd_types { TIMERFD = 17; MEMFD = 18; BPFMAP = 19; + IO_URING = 20; /* Any number above the real used. Not stored to image */ CTL_TTY = 65534; @@ -57,25 +59,26 @@ message fdinfo_entry { } message file_entry { - required fd_types type = 1; - required uint32 id = 2; - optional reg_file_entry reg = 3; - optional inet_sk_entry isk = 4; - optional ns_file_entry nsf = 5; - optional packet_sock_entry psk = 6; - optional netlink_sk_entry nlsk = 7; - optional eventfd_file_entry efd = 8; - optional eventpoll_file_entry epfd = 9; - optional signalfd_entry sgfd = 10; - optional tunfile_entry tunf = 11; - optional timerfd_entry tfd = 12; - optional inotify_file_entry ify = 13; - optional fanotify_file_entry ffy = 14; - optional ext_file_entry ext = 15; - optional unix_sk_entry usk = 16; - optional fifo_entry fifo = 17; - optional pipe_entry pipe = 18; - optional tty_file_entry tty = 19; - optional memfd_file_entry memfd = 20; - optional bpfmap_file_entry bpf = 21; + required fd_types type = 1; + required uint32 id = 2; + optional reg_file_entry reg = 3; + optional inet_sk_entry isk = 4; + optional ns_file_entry nsf = 5; + optional packet_sock_entry psk = 6; + optional netlink_sk_entry nlsk = 7; + optional eventfd_file_entry efd = 8; + optional eventpoll_file_entry epfd = 9; + optional signalfd_entry sgfd = 10; + optional tunfile_entry tunf = 11; + optional timerfd_entry tfd = 12; + optional inotify_file_entry ify = 13; + optional fanotify_file_entry ffy = 14; + optional ext_file_entry ext = 15; + optional unix_sk_entry usk = 16; + optional fifo_entry fifo = 17; + optional pipe_entry pipe = 18; + optional tty_file_entry tty = 19; + optional memfd_file_entry memfd = 20; + optional bpfmap_file_entry bpf = 21; + optional io_uring_file_entry io_uring = 22; } diff --git a/images/io_uring.proto b/images/io_uring.proto new file mode 100644 index 00000000000..cb933d0b565 --- /dev/null +++ b/images/io_uring.proto @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: MIT + +syntax = "proto2"; + +import "opts.proto"; +import "fown.proto"; + +message io_uring_group_id { + required uint32 group = 1; + required string group_name = 2; +} + +message io_uring_personality_id { + required uint32 uid = 1; + required uint32 euid = 2; + required uint32 suid = 3; + required uint32 fsuid = 4; + required uint32 gid = 5; + required uint32 egid = 6; + required uint32 sgid = 7; + required uint32 fsgid = 8; + required string uid_name = 9; + required string euid_name = 10; + required string suid_name = 11; + required string fsuid_name = 12; + required string gid_name = 13; + required string egid_name = 14; + required string sgid_name = 15; + required string fsgid_name = 16; + required uint64 cap_eff = 17; + repeated io_uring_group_id group_id = 18; +} + +message io_uring_file_entry { + required uint32 id = 1; + required uint32 flags = 2 [(criu).flags = "rfile.flags"]; + required uint64 pos = 3; + required fown_entry fown = 4; + /* Instance */ + required uint32 setup_flags = 5; + required uint32 sq_thread_cpu = 6; + required uint32 sq_thread_idle = 7; + required uint64 nr_user_bufs = 8; + required uint64 nr_user_files = 9; + required uint32 sq_entries = 10; + required uint32 cq_entries = 11; + required uint32 sq_off_array = 12; + required uint32 inode = 13; + /* Ring */ + required uint32 sq_head = 14; + required 
uint32 sq_tail = 15; + required uint32 cqe_head = 16; + required uint32 cqe_tail = 17; + required uint32 sq_ring_mask = 18; + required uint32 cq_ring_mask = 19; + /* Restrictions */ + required bool restrictions = 20; + required uint32 reg_op = 21; + required uint32 sqe_op = 22; + required uint32 sqe_flags_allowed = 23; + required uint32 sqe_flags_required = 24; + /* Personality */ + repeated io_uring_personality_id pers_id = 25; + optional sint32 mnt_id = 26 [default = -1]; +} + +message io_uring_data_entry { + required uint32 id = 1; + required uint32 sqe_bytes = 2; /* Bytes required for SQEs */ + required uint32 cqe_bytes = 3; /* Bytes required for CQEs */ + required uint32 sq_arr_bytes = 4; /* Bytes required for SQ array */ +} diff --git a/lib/py/images/images.py b/lib/py/images/images.py index 300b1cc69a6..46de8fe70b4 100644 --- a/lib/py/images/images.py +++ b/lib/py/images/images.py @@ -384,6 +384,21 @@ def skip(self, f, pload): f.seek(pload.bytes, os.SEEK_CUR) return pload.bytes +class io_uring_data_extra_handler: + def load(self, f, pload): + size = pload.sqe_bytes + pload.cqe_bytes + pload.sq_arr_bytes + data = f.read(size) + return base64.encodebytes(data).decode('utf-8') + + def dump(self, extra, f, pload): + data = base64.decodebytes(extra.encode('utf-8')) + f.write(data) + + def skip(self, f, pload): + size = pload.sqe_bytes + pload.cqe_bytes + pload.sq_arr_bytes + f.seek(size, os.SEEK_CUR) + return size + class ipc_sem_set_handler: def load(self, f, pbuff): entry = pb2dict.pb2dict(pbuff) @@ -561,6 +576,9 @@ def skip(self, f, pbuff): 'BPFMAP_FILE': entry_handler(pb.bpfmap_file_entry), 'BPFMAP_DATA': entry_handler(pb.bpfmap_data_entry, bpfmap_data_extra_handler()), + 'IO_URING_FILE': entry_handler(pb.io_uring_file_entry), + 'IO_URING_DATA': entry_handler(pb.io_uring_data_entry, + io_uring_data_extra_handler()), 'APPARMOR': entry_handler(pb.apparmor_entry), }
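
A usage sketch (not part of the patch): with the handler above, a decoded IO_URING_DATA entry carries its ring contents base64-encoded in the extra payload, laid out back to back as |SQE|CQE|SQARR|. Assuming an entry dict e in the shape this module produces, the three regions can be sliced out again:

    import base64
    # extra payload is |SQE|CQE|SQARR|, base64-encoded by load() above
    blob = base64.decodebytes(e['extra'].encode('utf-8'))
    sqe_end = e['sqe_bytes']
    cqe_end = sqe_end + e['cqe_bytes']
    sqes, cqes, sq_arr = blob[:sqe_end], blob[sqe_end:cqe_end], blob[cqe_end:]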