From 9a98e31898edb973bca97d6d94d90b4a8cdeccce Mon Sep 17 00:00:00 2001 From: hdzhoujie Date: Thu, 8 Dec 2022 13:01:50 +0800 Subject: [PATCH] dump and restore cpu affinity of each thread. Add one entry of thread_allowedcpus_entry into thread_core_entry to save cpu affinity info. Restore it after the threads are restored but before they start running. Add option --with-cpu-affinity to enable this function at dump and restore. Signed-off-by: hdzhoujie Signed-off-by: He jingxian Signed-off-by: Sang Yan --- .../arch/arm/plugins/std/syscalls/syscall.def | 1 + .../plugins/std/syscalls/syscall-ppc64.tbl | 1 + .../plugins/std/syscalls/syscall-s390.tbl | 1 + .../x86/plugins/std/syscalls/syscall_32.tbl | 1 + .../x86/plugins/std/syscalls/syscall_64.tbl | 1 + criu/config.c | 1 + criu/cr-dump.c | 12 ++++++ criu/cr-restore.c | 35 ++++++++++++++++- criu/crtools.c | 2 + criu/include/cr_options.h | 3 ++ criu/include/restorer.h | 5 +++ criu/pie/restorer.c | 38 +++++++++++++++++++ criu/pstree.c | 19 ++++++++++ images/core.proto | 6 +++ images/pstree.proto | 1 + 15 files changed, 126 insertions(+), 1 deletion(-) diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def b/compel/arch/arm/plugins/std/syscalls/syscall.def index 8bcc3cc50a..f4c4f6a376 100644 --- a/compel/arch/arm/plugins/std/syscalls/syscall.def +++ b/compel/arch/arm/plugins/std/syscalls/syscall.def @@ -118,6 +118,7 @@ fsopen 430 430 (char *fsname, unsigned int flags) fsconfig 431 431 (int fd, unsigned int cmd, const char *key, const char *value, int aux) fsmount 432 432 (int fd, unsigned int flags, unsigned int attr_flags) clone3 435 435 (struct clone_args *uargs, size_t size) +sched_setaffinity 122 241 (int fd, size_t cpusetsize, const cpu_set_t *mask) pidfd_open 434 434 (pid_t pid, unsigned int flags) openat2 437 437 (int dirfd, char *pathname, struct open_how *how, size_t size) pidfd_getfd 438 438 (int pidfd, int targetfd, unsigned int flags) diff --git a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl 
b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl index af40d71045..c90f7ba7df 100644 --- a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl +++ b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl @@ -114,6 +114,7 @@ __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_sched_setaffinity 222 sys_sched_setaffinity (int fd, size_t cpusetsize, const cpu_set_t *mask) __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) diff --git a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl index 6a349e1cb7..1db1512a9c 100644 --- a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl +++ b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl @@ -114,6 +114,7 @@ __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) +__NR_sched_setaffinity 239 sys_sched_setaffinity (int fd, size_t cpusetsize, const cpu_set_t *mask) __NR_pidfd_open 434 sys_pidfd_open (pid_t pid, unsigned int flags) __NR_openat2 437 sys_openat2 (int dirfd, char *pathname, struct open_how *how, size_t size) __NR_pidfd_getfd 438 sys_pidfd_getfd (int pidfd, int targetfd, unsigned int flags) diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl index 
a119a59b2e..bdd337cad5 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl +++ b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl @@ -63,6 +63,7 @@ __NR_mincore 218 sys_mincore (void *addr, unsigned long size, unsigned char * __NR_madvise 219 sys_madvise (unsigned long start, size_t len, int behavior) __NR_gettid 224 sys_gettid (void) __NR_futex 240 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) +__NR_sched_setaffinity 241 sys_sched_setaffinity (int fd, size_t cpusetsize, const cpu_set_t *mask) __NR_set_thread_area 243 sys_set_thread_area (user_desc_t *info) __NR_get_thread_area 244 sys_get_thread_area (user_desc_t *info) __NR_io_setup 245 sys_io_setup (unsigned nr_reqs, aio_context_t *ctx32p) diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl index 16dd86e791..e7f6c791da 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl @@ -73,6 +73,7 @@ __NR_mount 165 sys_mount (char *dev_nmae, char *dir_name, char *type, unsign __NR_umount2 166 sys_umount2 (char *name, int flags) __NR_gettid 186 sys_gettid (void) __NR_futex 202 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3) +__NR_sched_setaffinity 203 sys_sched_setaffinity (int fd, size_t cpusetsize, const cpu_set_t *mask) __NR_set_thread_area 205 sys_set_thread_area (user_desc_t *info) __NR_io_setup 206 sys_io_setup (unsigned nr_events, aio_context_t *ctx) __NR_io_getevents 208 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo) diff --git a/criu/config.c b/criu/config.c index 9f02ae9928..bc83a243ab 100644 --- a/criu/config.c +++ b/criu/config.c @@ -697,6 +697,7 @@ int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, { "cgroup-yard", required_argument, 0, 1096 }, { 
"pre-dump-mode", required_argument, 0, 1097 }, { "file-validation", required_argument, 0, 1098 }, + BOOL_OPT("with-cpu-affinity", &opts.with_cpu_affinity), BOOL_OPT("skip-file-rwx-check", &opts.skip_file_rwx_check), { "lsm-mount-context", required_argument, 0, 1099 }, { "network-lock", required_argument, 0, 1100 }, diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 63eb627fc2..a5f3ea2f92 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -140,6 +140,7 @@ static int dump_sched_info(int pid, ThreadCoreEntry *tc) { int ret; struct sched_param sp; + cpu_set_t cpumask; BUILD_BUG_ON(SCHED_OTHER != 0); /* default in proto message */ @@ -185,6 +186,17 @@ static int dump_sched_info(int pid, ThreadCoreEntry *tc) tc->has_sched_nice = true; tc->sched_nice = ret; + if (opts.with_cpu_affinity) { + pr_info("\tdumping allowed cpus for %d\n", pid); + ret = syscall(__NR_sched_getaffinity, pid, sizeof(cpumask), &cpumask); + if (ret < 0) { + pr_perror("Can't get sched affinity for %d", pid); + return -1; + } + tc->allowed_cpus->has_cpumask = true; + memcpy(tc->allowed_cpus->cpumask, &cpumask, sizeof(cpu_set_t)); + } + return 0; } diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 974202f16f..7411b80552 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -120,6 +120,7 @@ static int prepare_restorer_blob(void); static int prepare_rlimits(int pid, struct task_restore_args *, CoreEntry *core); static int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core); static int prepare_signals(int pid, struct task_restore_args *, CoreEntry *core); +static int prepare_allowed_cpus(int pid, struct task_restore_args *ta, CoreEntry *leader_core); /* * Architectures can overwrite this function to restore registers that are not @@ -918,6 +919,9 @@ static int restore_one_alive_task(int pid, CoreEntry *core) if (prepare_signals(pid, ta, core)) return -1; + if (prepare_allowed_cpus(pid, ta, core)) + return -1; + if (prepare_posix_timers(pid, ta, core)) return -1; @@ 
-3290,6 +3294,34 @@ static int prepare_signals(int pid, struct task_restore_args *ta, CoreEntry *lea return ret; } +static int prepare_allowed_cpus(int pid, struct task_restore_args *ta, CoreEntry *leader_core) +{ + int i; + cpu_set_t *cpumask; + bool *has_cpumask; + + if (!opts.with_cpu_affinity) { + return 0; + } + + ta->allowed_cpus = (char *)rst_mem_align_cpos(RM_PRIVATE); + for (i = 0; i < current->nr_threads; i++) { + has_cpumask = rst_mem_alloc(sizeof(bool), RM_PRIVATE); + if (!has_cpumask) + return -1; + memcpy(has_cpumask, &(current->core[i]->thread_core->allowed_cpus->has_cpumask), sizeof(bool)); + + if (!(*has_cpumask)) + continue; + + cpumask = rst_mem_alloc(sizeof(cpu_set_t), RM_PRIVATE); + if (!cpumask) + return -1; + memcpy(cpumask, current->core[i]->thread_core->allowed_cpus->cpumask, sizeof(cpu_set_t)); + } + return 0; +} + extern void __gcov_flush(void) __attribute__((weak)); void __gcov_flush(void) { @@ -3740,6 +3772,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns RST_MEM_FIXUP_PPTR(task_args->timerfd); RST_MEM_FIXUP_PPTR(task_args->posix_timers); RST_MEM_FIXUP_PPTR(task_args->siginfo); + RST_MEM_FIXUP_PPTR(task_args->allowed_cpus); RST_MEM_FIXUP_PPTR(task_args->rlims); RST_MEM_FIXUP_PPTR(task_args->helpers); RST_MEM_FIXUP_PPTR(task_args->zombies); @@ -3900,7 +3933,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns task_args->thread_args = thread_args; task_args->auto_dedup = opts.auto_dedup; - + task_args->with_cpu_affinity = opts.with_cpu_affinity; /* * In the restorer we need to know if it is SELinux or not. For SELinux * we must change the process context before creating threads. 
For diff --git a/criu/crtools.c b/criu/crtools.c index ac05bc8215..e779dd70b0 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -509,6 +509,8 @@ int main(int argc, char *argv[], char *envp[]) " --file-validation METHOD\n" " pass the validation method to be used; argument\n" " can be 'filesize' or 'buildid' (default).\n" + " --with-cpu-affinity Allow to restore cpu affinity. Only for hosts with\n" + " same cpu quantity.\n" " --skip-file-rwx-check\n" " Skip checking file permissions\n" " (r/w/x for u/g/o) on restore.\n" diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index c7e98c756c..dfbd92b253 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -236,6 +236,9 @@ struct cr_options { * explicitly request it as it comes with many limitations. */ int unprivileged; + + /* restore cpu affinity */ + int with_cpu_affinity; }; extern struct cr_options opts; diff --git a/criu/include/restorer.h b/criu/include/restorer.h index bc0beb5cbb..25c5eb0ea2 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -171,6 +172,8 @@ struct task_restore_args { siginfo_t *siginfo; unsigned int siginfo_n; + char *allowed_cpus; + struct rst_tcp_sock *tcp_socks; unsigned int tcp_socks_n; @@ -240,6 +243,8 @@ struct task_restore_args { uid_t uid; u32 cap_eff[CR_CAP_SIZE]; + + bool with_cpu_affinity; } __aligned(64); /* diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 99cff1f7d0..9e33542220 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -430,6 +430,40 @@ static int restore_signals(siginfo_t *ptr, int nr, bool group) return 0; } +static int restore_cpu_affinity(struct task_restore_args *args) +{ + int i; + int pid; + int ret; + cpu_set_t *cpumask; + char *allowed_cpus; + bool *has_cpumask; + + if (!args->with_cpu_affinity) { + return 0; + } + + allowed_cpus = args->allowed_cpus; + for (i = 0; i < args->nr_threads; i++) { + has_cpumask = (bool 
*)allowed_cpus; + allowed_cpus += sizeof(bool); + if (!(*has_cpumask)) { + continue; + } + + pid = args->thread_args[i].pid; + cpumask = (cpu_set_t *)allowed_cpus; + ret = sys_sched_setaffinity(pid, sizeof(cpu_set_t), cpumask); + if (ret) { + pr_err("\t Restore %d cpumask failed.\n", pid); + return ret; + } + allowed_cpus += sizeof(cpu_set_t); + } + + return 0; +} + static int restore_rseq(struct rst_rseq_param *rseq) { int ret; @@ -1968,6 +2002,10 @@ long __export_restore_task(struct task_restore_args *args) pr_info("%ld: Restored\n", sys_getpid()); + ret = restore_cpu_affinity(args); + if (ret) + goto core_restore_end; + restore_finish_stage(task_entries_local, CR_STATE_RESTORE); if (wait_helpers(args) < 0) diff --git a/criu/pstree.c b/criu/pstree.c index 72c4a3502a..06c98c71f0 100644 --- a/criu/pstree.c +++ b/criu/pstree.c @@ -58,11 +58,13 @@ CoreEntry *core_entry_alloc(int th, int tsk) CredsEntry *ce = NULL; sz += sizeof(ThreadCoreEntry) + sizeof(ThreadSasEntry) + sizeof(CredsEntry); + sz += sizeof(ThreadAllowedcpusEntry); sz += CR_CAP_SIZE * sizeof(ce->cap_inh[0]); sz += CR_CAP_SIZE * sizeof(ce->cap_prm[0]); sz += CR_CAP_SIZE * sizeof(ce->cap_eff[0]); sz += CR_CAP_SIZE * sizeof(ce->cap_bnd[0]); + sz += sizeof(cpu_set_t); /* * @groups are dynamic and allocated * on demand. 
@@ -127,6 +129,11 @@ CoreEntry *core_entry_alloc(int th, int tsk) ce->cap_eff = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_eff[0])); ce->cap_bnd = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_bnd[0])); + core->thread_core->allowed_cpus = xptr_pull(&m, ThreadAllowedcpusEntry); + thread_allowedcpus_entry__init(core->thread_core->allowed_cpus); + core->thread_core->allowed_cpus->n_cpumask = sizeof(cpu_set_t) / sizeof(uint64_t); + core->thread_core->allowed_cpus->cpumask = xptr_pull_s(&m, sizeof(cpu_set_t)); + if (arch_alloc_thread_info(core)) { xfree(core); core = NULL; @@ -278,6 +285,7 @@ int dump_pstree(struct pstree_item *root_item) PstreeEntry e = PSTREE_ENTRY__INIT; int ret = -1, i; struct cr_img *img; + unsigned int nr_cpus; pr_info("\n"); pr_info("Dumping pstree (pid: %d)\n", root_item->pid->real); @@ -301,6 +309,7 @@ int dump_pstree(struct pstree_item *root_item) } } + nr_cpus = sysconf(_SC_NPROCESSORS_CONF); img = open_image(CR_FD_PSTREE, O_DUMP); if (!img) return -1; @@ -313,6 +322,7 @@ int dump_pstree(struct pstree_item *root_item) e.pgid = item->pgid; e.sid = item->sid; e.n_threads = item->nr_threads; + e.nr_cpus = nr_cpus; e.threads = xmalloc(sizeof(e.threads[0]) * e.n_threads); if (!e.threads) @@ -532,6 +542,7 @@ static int read_one_pstree_item(struct cr_img *img, pid_t *pid_max) struct pstree_item *pi; PstreeEntry *e; int ret, i; + unsigned int nr_cpus; ret = pb_read_one_eof(img, &e, PB_PSTREE); if (ret <= 0) @@ -543,6 +554,14 @@ static int read_one_pstree_item(struct cr_img *img, pid_t *pid_max) goto err; BUG_ON(pi->pid->state != TASK_UNDEF); + if (opts.with_cpu_affinity) { + nr_cpus = sysconf(_SC_NPROCESSORS_CONF); + if (e->nr_cpus > nr_cpus) { + pr_err("different number of cpus in cpu affinity restore\n"); + goto err; + } + } + /* * All pids should be added in the tree to be able to find * free pid-s for helpers. 
pstree_item for these pid-s will diff --git a/images/core.proto b/images/core.proto index bc8b7a4885..c49244ad47 100644 --- a/images/core.proto +++ b/images/core.proto @@ -87,6 +87,11 @@ message thread_sas_entry { required uint32 ss_flags = 3; } +message thread_allowedcpus_entry { + required bool has_cpumask = 1 [default=false]; + repeated uint64 cpumask = 2 [packed=true]; +} + message thread_core_entry { required uint64 futex_rla = 1; required uint32 futex_rla_len = 2; @@ -107,6 +112,7 @@ message thread_core_entry { optional uint64 blk_sigset_extended = 14; optional rseq_entry rseq_entry = 15; required uint32 cg_set = 16; + optional thread_allowedcpus_entry allowed_cpus = 17; } message task_rlimits_entry { diff --git a/images/pstree.proto b/images/pstree.proto index fca284cb73..79c3febcf3 100644 --- a/images/pstree.proto +++ b/images/pstree.proto @@ -8,4 +8,5 @@ message pstree_entry { required uint32 pgid = 3; required uint32 sid = 4; repeated uint32 threads = 5; + optional uint32 nr_cpus = 6; }