Skip to content

Commit

Permalink
Introduce io_uring dump/restore support
Browse files Browse the repository at this point in the history
Signed-off-by: Kumar Kartikeya Dwivedi <[email protected]>
  • Loading branch information
kkdwivedi committed Aug 30, 2021
1 parent 5f539fc commit 9cb8f3c
Show file tree
Hide file tree
Showing 20 changed files with 1,648 additions and 35 deletions.
1 change: 1 addition & 0 deletions criu/Makefile.crtools
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ obj-y += fsnotify.o
obj-y += image-desc.o
obj-y += image.o
obj-y += img-streamer.o
obj-y += io_uring.o
obj-y += ipc_ns.o
obj-y += irmap.o
obj-y += kcmp-ids.o
Expand Down
57 changes: 55 additions & 2 deletions criu/cr-dump.c
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@
#include "pidfd-store.h"
#include "apparmor.h"
#include "asm/dump.h"
#include "io_uring.h"

/*
* Architectures can overwrite this function to restore register sets that
Expand Down Expand Up @@ -191,10 +192,11 @@ struct cr_imgset *glob_imgset;

static int collect_fds(pid_t pid, struct parasite_drain_fd **dfds)
{
char buf[PATH_MAX] = {};
struct dirent *de;
DIR *fd_dir;
int size = 0;
int n;
int n, pidfd;
DIR *fd_dir;

pr_info("\n");
pr_info("Collecting fds (pid: %d)\n", pid);
Expand All @@ -204,6 +206,55 @@ static int collect_fds(pid_t pid, struct parasite_drain_fd **dfds)
if (!fd_dir)
return -1;

/* Before collecting fds, we need to bring io_uring to steady state,
* since it can install fds into task's fdtable, and if we do it later,
* during actual io_uring dump, we will miss dumping these files.
*/
if (!kdat.has_pidfd_open) {
pr_err("pidfd_open system call not supported\n");
return -1;
}
if (!kdat.has_pidfd_getfd) {
pr_err("pidfd_getfd system call not supported\n");
return -1;
}

pidfd = syscall(SYS_pidfd_open, pid, 0);
if (pidfd < 0) {
pr_err("Failed to open pidfd for pid %d\n", pid);
return -1;
}

while ((de = readdir(fd_dir))) {
if (dir_dots(de))
continue;

n = dirfd(fd_dir);
if (n == -1) {
close(pidfd);
return -1;
}

n = readlinkat(n, de->d_name, buf, sizeof(buf));
if (n == -1) {
close(pidfd);
return -1;
}

if (is_io_uring_link(buf)) {
if (io_uring_synchronize_fd(syscall(__NR_pidfd_getfd, pidfd, atoi(de->d_name), 0))) {
pr_err("Failed to synchronize io_uring fd %d for pid %d\n", atoi(de->d_name), pid);
close(pidfd);
return -1;
}
}
}

close(pidfd);

/* Collect fds now */
rewinddir(fd_dir);

n = 0;
while ((de = readdir(fd_dir))) {
if (dir_dots(de))
Expand Down Expand Up @@ -489,6 +540,8 @@ static int dump_task_mm(pid_t pid, const struct proc_pid_stat *stat, const struc
ret = check_sysvipc_map_dump(pid, vma);
else if (vma_entry_is(vma, VMA_AREA_SOCKET))
ret = dump_socket_map(vma_area);
else if (vma_entry_is(vma, VMA_AREA_IO_URING))
ret = dump_io_uring_map(vma_area);
else
ret = 0;
if (ret)
Expand Down
3 changes: 2 additions & 1 deletion criu/cr-restore.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include "servicefd.h"
#include "image.h"
#include "img-streamer.h"
#include "io_uring.h"
#include "util.h"
#include "util-pie.h"
#include "criu-log.h"
Expand Down Expand Up @@ -277,7 +278,7 @@ static struct collect_image_info *cinfos_files[] = {
&unix_sk_cinfo, &fifo_cinfo, &pipe_cinfo, &nsfile_cinfo, &packet_sk_cinfo,
&netlink_sk_cinfo, &eventfd_cinfo, &epoll_cinfo, &epoll_tfd_cinfo, &signalfd_cinfo,
&tunfile_cinfo, &timerfd_cinfo, &inotify_cinfo, &inotify_mark_cinfo, &fanotify_cinfo,
&fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo,
&fanotify_mark_cinfo, &ext_file_cinfo, &memfd_cinfo, &io_uring_cinfo,
};

/* These images are required to restore namespaces */
Expand Down
3 changes: 3 additions & 0 deletions criu/files.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
#include "kerndat.h"
#include "fdstore.h"
#include "bpfmap.h"
#include "io_uring.h"

#include "protobuf.h"
#include "util.h"
Expand Down Expand Up @@ -536,6 +537,8 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts,
else if (is_bpfmap_link(link))
ops = &bpfmap_dump_ops;
#endif
else if (is_io_uring_link(link))
ops = &io_uring_dump_ops;
else
return dump_unsupp_fd(&p, lfd, "anon", link, e);

Expand Down
2 changes: 2 additions & 0 deletions criu/image-desc.c
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = {
FD_ENTRY_F(BPFMAP_FILE, "bpfmap-file", O_NOBUF),
FD_ENTRY_F(BPFMAP_DATA, "bpfmap-data", O_NOBUF),
FD_ENTRY(APPARMOR, "apparmor"),
FD_ENTRY_F(IO_URING_FILE, "io_uring-file", O_NOBUF),
FD_ENTRY_F(IO_URING_DATA, "io_uring-data", O_NOBUF),

[CR_FD_STATS] = {
.fmt = "stats-%s",
Expand Down
2 changes: 2 additions & 0 deletions criu/include/image-desc.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ enum {
CR_FD_MEMFD_INODE,
CR_FD_BPFMAP_FILE,
CR_FD_BPFMAP_DATA,
CR_FD_IO_URING_FILE,
CR_FD_IO_URING_DATA,
_CR_FD_GLOB_TO,

CR_FD_TMPFS_IMG,
Expand Down
11 changes: 6 additions & 5 deletions criu/include/image.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,11 +79,12 @@
#define VMA_ANON_SHARED (1 << 8)
#define VMA_ANON_PRIVATE (1 << 9)

#define VMA_AREA_SYSVIPC (1 << 10)
#define VMA_AREA_SOCKET (1 << 11)
#define VMA_AREA_VVAR (1 << 12)
#define VMA_AREA_AIORING (1 << 13)
#define VMA_AREA_MEMFD (1 << 14)
#define VMA_AREA_SYSVIPC (1 << 10)
#define VMA_AREA_SOCKET (1 << 11)
#define VMA_AREA_VVAR (1 << 12)
#define VMA_AREA_AIORING (1 << 13)
#define VMA_AREA_MEMFD (1 << 14)
#define VMA_AREA_IO_URING (1 << 15)

#define VMA_CLOSE (1 << 28)
#define VMA_NO_PROT_WRITE (1 << 29)
Expand Down
108 changes: 108 additions & 0 deletions criu/include/io_uring.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
#ifndef __CR_IO_URING_H__
#define __CR_IO_URING_H__

#include <linux/capability.h>

#include "files.h"
#include "io_uring.pb-c.h"

/* Definitions */
/*
 * One io_uring restriction record, declared locally under a "__"-prefixed
 * name.
 * NOTE(review): the field layout looks like the kernel UAPI
 * struct io_uring_restriction (the element type passed with
 * IORING_REGISTER_RESTRICTIONS) -- confirm against linux/io_uring.h before
 * touching any member, since size and order would then be ABI.
 */
struct __io_uring_restriction {
/* Restriction kind; presumably one of the __IORING_RESTRICTION_* values defined in this header -- confirm */
__u16 opcode;
/* Payload: all three members share one byte; the trailing comment on each names the opcode it belongs to */
union {
__u8 register_op; /* IORING_RESTRICTION_REGISTER_OP */
__u8 sqe_op; /* IORING_RESTRICTION_SQE_OP */
__u8 sqe_flags; /* IORING_RESTRICTION_SQE_FLAGS_* */
};
/* Reserved space (resv + resv2); NOTE(review): presumably must be zeroed before use -- confirm */
__u8 resv;
__u32 resv2[3];
};

/*
 * Fallback definitions of the IORING_SETUP_* flag bits. Each one is wrapped
 * in #ifndef, so it only takes effect when the headers included above did
 * not already provide the macro.
 * NOTE(review): bit 4 has no fallback in this list (the kernel appears to
 * call it IORING_SETUP_CLAMP) -- confirm that the gap is intentional.
 */
#ifndef IORING_SETUP_IOPOLL
#define IORING_SETUP_IOPOLL (1U << 0) /* io_context is polled */
#endif
#ifndef IORING_SETUP_SQPOLL
#define IORING_SETUP_SQPOLL (1U << 1) /* SQ poll thread */
#endif
#ifndef IORING_SETUP_SQ_AFF
#define IORING_SETUP_SQ_AFF (1U << 2) /* sq_thread_cpu is valid */
#endif
#ifndef IORING_SETUP_CQSIZE
#define IORING_SETUP_CQSIZE (1U << 3) /* app defines CQ size */
#endif
#ifndef IORING_SETUP_ATTACH_WQ
#define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */
#endif
#ifndef IORING_SETUP_R_DISABLED
#define IORING_SETUP_R_DISABLED (1U << 6) /* start with ring disabled */
#endif

/*
 * Fallbacks for the IORING_OFF_* constants (64-bit unsigned literals).
 * NOTE(review): presumably the mmap() offsets that select the SQ ring,
 * CQ ring and SQE array of a ring fd -- confirm against linux/io_uring.h.
 */
#ifndef IORING_OFF_SQ_RING
#define IORING_OFF_SQ_RING 0ULL
#endif
#ifndef IORING_OFF_CQ_RING
#define IORING_OFF_CQ_RING 0x8000000ULL
#endif
#ifndef IORING_OFF_SQES
#define IORING_OFF_SQES 0x10000000ULL
#endif

/* Fallback for the IOSQE_IO_DRAIN flag bit, again only if not already defined */
#ifndef IOSQE_IO_DRAIN
#define IOSQE_IO_DRAIN (1U << 1)
#endif

/*
 * Unconditional, locally "__"-prefixed constants (no #ifndef guard).
 * NOTE(review): the values presumably mirror the kernel's
 * IORING_RESTRICTION_* and IORING_REGISTER_* enumerators, which cannot be
 * probed with #ifndef -- confirm the numbers against the kernel header.
 */
#define __IORING_RESTRICTION_REGISTER_OP 0
#define __IORING_RESTRICTION_SQE_OP 1
#define __IORING_RESTRICTION_SQE_FLAGS_ALLOWED 2
#define __IORING_RESTRICTION_SQE_FLAGS_REQUIRED 3
#define __IORING_REGISTER_PERSONALITY 9
#define __IORING_REGISTER_RESTRICTIONS 11
#define __IORING_REGISTER_ENABLE_RINGS 12

/*
 * Pairs a protobuf IoUringFileEntry with a generic struct file_desc.
 * NOTE(review): presumably the per-file object built when the io_uring-file
 * image is collected on restore -- confirm in io_uring.c.
 */
struct io_uring_file_info {
/* Pointer to the protobuf entry; ownership is not visible from this header */
IoUringFileEntry *iofe;
/* Embedded (not pointed-to) generic file descriptor record */
struct file_desc d;
};

/*
 * Thin wrapper around a protobuf IoUringDataEntry pointer.
 * NOTE(review): presumably the collected form of one io_uring-data image
 * record -- confirm in io_uring.c.
 */
struct io_uring_data_info {
/* Pointer to the protobuf entry; ownership is not visible from this header */
IoUringDataEntry *iode;
};

/*
 * One group entry: a gid plus a fixed-size name buffer, linkable into a list.
 * NOTE(review): presumably chained on io_uring_personality_desc.group_list
 * to describe supplementary groups -- confirm in io_uring.c.
 */
struct io_uring_group_desc {
/* List linkage */
struct list_head list;
/* Numeric group id */
gid_t group;
/* Group name buffer, 32 bytes; NOTE(review): NUL-termination/truncation policy is not visible here */
char group_name[32];
};

/*
 * Description of one io_uring personality: an id together with a set of
 * user/group ids, an effective-capability array and a list of groups.
 * NOTE(review): presumably filled in while parsing the ring's registered
 * personalities and handed to io_uring_push_personality() -- confirm.
 */
struct io_uring_personality_desc {
/* Personality identifier */
int id;
/* Real, effective, saved and filesystem user ids (per the field names) */
uid_t uid;
uid_t euid;
uid_t suid;
uid_t fsuid;
/* Real, effective, saved and filesystem group ids (per the field names) */
gid_t gid;
gid_t egid;
gid_t sgid;
gid_t fsgid;
/* Effective capability words; array length is the project's CR_CAP_SIZE */
u32 cap_eff[CR_CAP_SIZE];
/* Group count; NOTE(review): presumably the number of entries on group_list -- confirm */
size_t nr_groups;
/* List head; NOTE(review): presumably of struct io_uring_group_desc via its 'list' member -- confirm */
struct list_head group_list;
};

/* Opaque to users of this header: only ever passed around by pointer below */
struct io_uring_ctx;

/* Image collection descriptor; listed in cinfos_files[] in cr-restore.c */
extern struct collect_image_info io_uring_cinfo;
/* NOTE(review): presumably the collection descriptor for the io_uring-data image -- confirm */
extern struct collect_image_info io_uring_data_cinfo;
/* fd dump operations; chosen by dump_one_file() in files.c when is_io_uring_link() matches */
extern const struct fdtype_ops io_uring_dump_ops;

/* Callers (collect_fds(), dump_one_file()) treat a non-zero result as "this link names an io_uring fd" */
int is_io_uring_link(char *link);
/*
 * Called by collect_fds() on an fd obtained through pidfd_getfd, before the
 * task's fds are collected; the caller treats a non-zero return as failure.
 */
int io_uring_synchronize_fd(int fd);
/* NOTE(review): presumably the counterpart of dump_io_uring_map() used while collecting VMAs -- confirm */
int collect_io_uring_map(struct vma_area *vma);
/* Called from dump_task_mm() for VMAs flagged VMA_AREA_IO_URING; the result is stored in its 'ret' */
int dump_io_uring_map(struct vma_area *vma);
/* NOTE(review): presumably records a ring mapping keyed by mmap offset and inode -- confirm */
int add_one_io_uring_mapping(uint64_t offset, ino_t inode);

/* NOTE(review): presumably records registered buffer 'idx' (address, len) in ctx -- confirm */
int io_uring_push_buf(struct io_uring_ctx *ctx, unsigned int idx, long long unsigned int address, unsigned int len);
/* NOTE(review): presumably attaches a personality description to ctx; ownership of desc is not visible here */
int io_uring_push_personality(struct io_uring_ctx *ctx, struct io_uring_personality_desc *desc);
/* Returns the IoUringFileEntry associated with ctx (per the name and return type) */
IoUringFileEntry *io_uring_get_iofe(struct io_uring_ctx *ctx);

#endif /* __CR_IO_URING_H__ */
2 changes: 2 additions & 0 deletions criu/include/magic.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@
#define BPFMAP_FILE_MAGIC 0x57506142 /* Alapayevsk */
#define BPFMAP_DATA_MAGIC 0x64324033 /* Arkhangelsk */
#define APPARMOR_MAGIC 0x59423047 /* Nikolskoye */
#define IO_URING_FILE_MAGIC 0x55403656 /* Butyn */
#define IO_URING_DATA_MAGIC 0x54194822 /* Ulyanovsk */

#define IFADDR_MAGIC RAW_IMAGE_MAGIC
#define ROUTE_MAGIC RAW_IMAGE_MAGIC
Expand Down
2 changes: 2 additions & 0 deletions criu/include/protobuf-desc.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ enum {
PB_BPFMAP_FILE,
PB_BPFMAP_DATA,
PB_APPARMOR,
PB_IO_URING_FILE,
PB_IO_URING_DATA,

/* PB_AUTOGEN_STOP */

Expand Down
5 changes: 4 additions & 1 deletion criu/include/vma.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,10 @@ struct vma_area {

union {
struct /* for dump */ {
int vm_socket_id;
union {
int vm_socket_id;
int io_uring_id;
};

char *aufs_rpath; /* path from aufs root */
char *aufs_fpath; /* full path from global root */
Expand Down
Loading

0 comments on commit 9cb8f3c

Please sign in to comment.