From 74ba508f60c3650595297b33a4cdfd02e443194e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 3 Mar 2012 18:58:11 -0800 Subject: userns: Remove unnecessary cast to struct user_struct when copying cred->user. In struct cred the user member is and has always been declared struct user_struct *user. At most a constant struct cred will have a constant pointer to non-constant user_struct so remove this unnecessary cast. Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- kernel/sys.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sys.c') diff --git a/kernel/sys.c b/kernel/sys.c index e7006eb6c1e4..f7a43514ac65 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -209,7 +209,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: - user = (struct user_struct *) cred->user; + user = cred->user; if (!who) who = cred->uid; else if ((who != cred->uid) && @@ -274,7 +274,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: - user = (struct user_struct *) cred->user; + user = cred->user; if (!who) who = cred->uid; else if ((who != cred->uid) && -- cgit v1.2.2 From c4a4d603796c727b9555867571f89483be9c565e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 16 Nov 2011 23:15:31 -0800 Subject: userns: Use cred->user_ns instead of cred->user->user_ns Optimize performance and prepare for the removal of the user_ns reference from user_struct. Remove the slow long walk through cred->user->user_ns and instead go straight to cred->user_ns. Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- kernel/sys.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/sys.c') diff --git a/kernel/sys.c b/kernel/sys.c index f7a43514ac65..82d8714bbede 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -133,11 +133,11 @@ static bool set_one_prio_perm(struct task_struct *p) { const struct cred *cred = current_cred(), *pcred = __task_cred(p); - if (pcred->user->user_ns == cred->user->user_ns && + if (pcred->user_ns == cred->user_ns && (pcred->uid == cred->euid || pcred->euid == cred->euid)) return true; - if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE)) + if (ns_capable(pcred->user_ns, CAP_SYS_NICE)) return true; return false; } @@ -1498,7 +1498,7 @@ static int check_prlimit_permission(struct task_struct *task) return 0; tcred = __task_cred(task); - if (cred->user->user_ns == tcred->user->user_ns && + if (cred->user_ns == tcred->user_ns && (cred->uid == tcred->euid && cred->uid == tcred->suid && cred->uid == tcred->uid && @@ -1506,7 +1506,7 @@ static int check_prlimit_permission(struct task_struct *task) cred->gid == tcred->sgid && cred->gid == tcred->gid)) return 0; - if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE)) + if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) return 0; return -EPERM; -- cgit v1.2.2 From 7a4e7408c5cadb240e068a662251754a562355e3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 14 Nov 2011 14:29:51 -0800 Subject: userns: Add kuid_t and kgid_t and associated infrastructure in uidgid.h Start distinguishing between internal kernel uids and gids and values that userspace can use. This is done by introducing two new types: kuid_t and kgid_t. These types and their associated functions are infrastructure are declared in the new header uidgid.h. Ultimately there will be a different implementation of the mapping functions for use with user namespaces. But to keep it simple we introduce the mapping functions first to separate the meat from the mechanical code conversions. Export overflowuid and overflowgid so we can use from_kuid_munged and from_kgid_munged in modular code. Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- kernel/sys.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel/sys.c') diff --git a/kernel/sys.c b/kernel/sys.c index 82d8714bbede..71852417cfc8 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -93,10 +93,8 @@ int overflowuid = DEFAULT_OVERFLOWUID; int overflowgid = DEFAULT_OVERFLOWGID; -#ifdef CONFIG_UID16 EXPORT_SYMBOL(overflowuid); EXPORT_SYMBOL(overflowgid); -#endif /* * the same as above, but for filesystems which can only store a 16-bit -- cgit v1.2.2 From 7b44ab978b77a91b327058a0f4db7e6fcdb90b92 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 16 Nov 2011 23:20:58 -0800 Subject: userns: Disassociate user_struct from the user_namespace. Modify alloc_uid to take a kuid and make the user hash table global. Stop holding a reference to the user namespace in struct user_struct. This simplifies the code and makes the per user accounting not care about which user namespace a uid happens to appear in. Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- kernel/sys.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) (limited to 'kernel/sys.c') diff --git a/kernel/sys.c b/kernel/sys.c index 71852417cfc8..f0c43b4b6657 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -175,6 +175,8 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) const struct cred *cred = current_cred(); int error = -EINVAL; struct pid *pgrp; + kuid_t cred_uid; + kuid_t uid; if (which > PRIO_USER || which < PRIO_PROCESS) goto out; @@ -207,18 +209,22 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: + cred_uid = make_kuid(cred->user_ns, cred->uid); + uid = make_kuid(cred->user_ns, who); user = cred->user; if (!who) - who = cred->uid; - else if ((who != cred->uid) && - !(user = find_user(who))) + uid = cred_uid; + else if (!uid_eq(uid, cred_uid) && + !(user = find_user(uid))) goto out_unlock; /* No processes for this user */ do_each_thread(g, p) { - if (__task_cred(p)->uid == who) + const struct cred *tcred = __task_cred(p); + kuid_t tcred_uid = make_kuid(tcred->user_ns, tcred->uid); + if (uid_eq(tcred_uid, uid)) error = set_one_prio(p, niceval, error); } while_each_thread(g, p); - if (who != cred->uid) + if (!uid_eq(uid, cred_uid)) free_uid(user); /* For find_user() */ break; } @@ -242,6 +248,8 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) const struct cred *cred = current_cred(); long niceval, retval = -ESRCH; struct pid *pgrp; + kuid_t cred_uid; + kuid_t uid; if (which > PRIO_USER || which < PRIO_PROCESS) return -EINVAL; @@ -272,21 +280,25 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: + cred_uid = make_kuid(cred->user_ns, cred->uid); + uid = make_kuid(cred->user_ns, who); user = cred->user; if (!who) - who = cred->uid; - else if ((who != cred->uid) && - !(user = find_user(who))) + uid = cred_uid; + else if (!uid_eq(uid, cred_uid) && + !(user = find_user(uid))) goto out_unlock; /* No processes for this user */ do_each_thread(g, p) { - if (__task_cred(p)->uid == who) { + const struct cred *tcred = __task_cred(p); + kuid_t tcred_uid = make_kuid(tcred->user_ns, tcred->uid); + if (uid_eq(tcred_uid, uid)) { niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; } } while_each_thread(g, p); - if (who != cred->uid) + if (!uid_eq(uid, cred_uid)) free_uid(user); /* for find_user() */ break; } @@ -629,7 +641,7 @@ static int set_user(struct cred *new) { struct user_struct *new_user; - new_user = alloc_uid(current_user_ns(), new->uid); + new_user = alloc_uid(make_kuid(new->user_ns, new->uid)); if (!new_user) return -EAGAIN; -- cgit v1.2.2 From 259e5e6c75a910f3b5e656151dc602f53f9d7548 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 12 Apr 2012 16:47:50 -0500 Subject: Add PR_{GET,SET}_NO_NEW_PRIVS to prevent execve from granting privs With this change, calling prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) disables privilege granting operations at execve-time. For example, a process will not be able to execute a setuid binary to change their uid or gid if this bit is set. The same is true for file capabilities. Additionally, LSM_UNSAFE_NO_NEW_PRIVS is defined to ensure that LSMs respect the requested behavior. To determine if the NO_NEW_PRIVS bit is set, a task may call prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0); It returns 1 if set and 0 if it is not set. If any of the arguments are non-zero, it will return -1 and set errno to -EINVAL. (PR_SET_NO_NEW_PRIVS behaves similarly.) This functionality is desired for the proposed seccomp filter patch series. By using PR_SET_NO_NEW_PRIVS, it allows a task to modify the system call behavior for itself and its child tasks without being able to impact the behavior of a more privileged task. Another potential use is making certain privileged operations unprivileged. For example, chroot may be considered "safe" if it cannot affect privileged tasks. Note, this patch causes execve to fail when PR_SET_NO_NEW_PRIVS is set and AppArmor is in use. It is fixed in a subsequent patch. Signed-off-by: Andy Lutomirski Signed-off-by: Will Drewry Acked-by: Eric Paris Acked-by: Kees Cook v18: updated change desc v17: using new define values as per 3.4 Signed-off-by: James Morris --- kernel/sys.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel/sys.c') diff --git a/kernel/sys.c b/kernel/sys.c index e7006eb6c1e4..b82568b7d201 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1979,6 +1979,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = put_user(me->signal->is_child_subreaper, (int __user *) arg2); break; + case PR_SET_NO_NEW_PRIVS: + if (arg2 != 1 || arg3 || arg4 || arg5) + return -EINVAL; + + current->no_new_privs = 1; + break; + case PR_GET_NO_NEW_PRIVS: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; + return current->no_new_privs ? 1 : 0; default: error = -EINVAL; break; -- cgit v1.2.2 From e2cfabdfd075648216f99c2c03821cf3f47c1727 Mon Sep 17 00:00:00 2001 From: Will Drewry Date: Thu, 12 Apr 2012 16:47:57 -0500 Subject: seccomp: add system call filtering using BPF [This patch depends on luto@mit.edu's no_new_privs patch: https://lkml.org/lkml/2012/1/30/264 The whole series including Andrew's patches can be found here: https://github.com/redpig/linux/tree/seccomp Complete diff here: https://github.com/redpig/linux/compare/1dc65fed...seccomp ] This patch adds support for seccomp mode 2. Mode 2 introduces the ability for unprivileged processes to install system call filtering policy expressed in terms of a Berkeley Packet Filter (BPF) program. This program will be evaluated in the kernel for each system call the task makes and computes a result based on data in the format of struct seccomp_data. A filter program may be installed by calling: struct sock_fprog fprog = { ... }; ... prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &fprog); The return value of the filter program determines if the system call is allowed to proceed or denied. If the first filter program installed allows prctl(2) calls, then the above call may be made repeatedly by a task to further reduce its access to the kernel. All attached programs must be evaluated before a system call will be allowed to proceed. Filter programs will be inherited across fork/clone and execve. However, if the task attaching the filter is unprivileged (!CAP_SYS_ADMIN) the no_new_privs bit will be set on the task. This ensures that unprivileged tasks cannot attach filters that affect privileged tasks (e.g., setuid binary). There are a number of benefits to this approach. A few of which are as follows: - BPF has been exposed to userland for a long time - BPF optimization (and JIT'ing) are well understood - Userland already knows its ABI: system call numbers and desired arguments - No time-of-check-time-of-use vulnerable data accesses are possible. - system call arguments are loaded on access only to minimize copying required for system call policy decisions. Mode 2 support is restricted to architectures that enable HAVE_ARCH_SECCOMP_FILTER. In this patch, the primary dependency is on syscall_get_arguments(). The full desired scope of this feature will add a few minor additional requirements expressed later in this series. Based on discussion, SECCOMP_RET_ERRNO and SECCOMP_RET_TRACE seem to be the desired additional functionality. No architectures are enabled in this patch. Signed-off-by: Will Drewry Acked-by: Serge Hallyn Reviewed-by: Indan Zupancic Acked-by: Eric Paris Reviewed-by: Kees Cook v18: - rebase to v3.4-rc2 - s/chk/check/ (akpm@linux-foundation.org,jmorris@namei.org) - allocate with GFP_KERNEL|__GFP_NOWARN (indan@nul.nu) - add a comment for get_u32 regarding endianness (akpm@) - fix other typos, style mistakes (akpm@) - added acked-by v17: - properly guard seccomp filter needed headers (leann@ubuntu.com) - tighten return mask to 0x7fff0000 v16: - no change v15: - add a 4 instr penalty when counting a path to account for seccomp_filter size (indan@nul.nu) - drop the max insns to 256KB (indan@nul.nu) - return ENOMEM if the max insns limit has been hit (indan@nul.nu) - move IP checks after args (indan@nul.nu) - drop !user_filter check (indan@nul.nu) - only allow explicit bpf codes (indan@nul.nu) - exit_code -> exit_sig v14: - put/get_seccomp_filter takes struct task_struct (indan@nul.nu,keescook@chromium.org) - adds seccomp_chk_filter and drops general bpf_run/chk_filter user - add seccomp_bpf_load for use by net/core/filter.c - lower max per-process/per-hierarchy: 1MB - moved nnp/capability check prior to allocation (all of the above: indan@nul.nu) v13: - rebase on to 88ebdda6159ffc15699f204c33feb3e431bf9bdc v12: - added a maximum instruction count per path (indan@nul.nu,oleg@redhat.com) - removed copy_seccomp (keescook@chromium.org,indan@nul.nu) - reworded the prctl_set_seccomp comment (indan@nul.nu) v11: - reorder struct seccomp_data to allow future args expansion (hpa@zytor.com) - style clean up, @compat dropped, compat_sock_fprog32 (indan@nul.nu) - do_exit(SIGSYS) (keescook@chromium.org, luto@mit.edu) - pare down Kconfig doc reference. - extra comment clean up v10: - seccomp_data has changed again to be more aesthetically pleasing (hpa@zytor.com) - calling convention is noted in a new u32 field using syscall_get_arch. This allows for cross-calling convention tasks to use seccomp filters. (hpa@zytor.com) - lots of clean up (thanks, Indan!) v9: - n/a v8: - use bpf_chk_filter, bpf_run_filter. update load_fns - Lots of fixes courtesy of indan@nul.nu: -- fix up load behavior, compat fixups, and merge alloc code, -- renamed pc and dropped __packed, use bool compat. -- Added a hidden CONFIG_SECCOMP_FILTER to synthesize non-arch dependencies v7: (massive overhaul thanks to Indan, others) - added CONFIG_HAVE_ARCH_SECCOMP_FILTER - merged into seccomp.c - minimal seccomp_filter.h - no config option (part of seccomp) - no new prctl - doesn't break seccomp on systems without asm/syscall.h (works but arg access always fails) - dropped seccomp_init_task, extra free functions, ... - dropped the no-asm/syscall.h code paths - merges with network sk_run_filter and sk_chk_filter v6: - fix memory leak on attach compat check failure - require no_new_privs || CAP_SYS_ADMIN prior to filter installation. (luto@mit.edu) - s/seccomp_struct_/seccomp_/ for macros/functions (amwang@redhat.com) - cleaned up Kconfig (amwang@redhat.com) - on block, note if the call was compat (so the # means something) v5: - uses syscall_get_arguments (indan@nul.nu,oleg@redhat.com, mcgrathr@chromium.org) - uses union-based arg storage with hi/lo struct to handle endianness. Compromises between the two alternate proposals to minimize extra arg shuffling and account for endianness assuming userspace uses offsetof(). (mcgrathr@chromium.org, indan@nul.nu) - update Kconfig description - add include/seccomp_filter.h and add its installation - (naive) on-demand syscall argument loading - drop seccomp_t (eparis@redhat.com) v4: - adjusted prctl to make room for PR_[SG]ET_NO_NEW_PRIVS - now uses current->no_new_privs (luto@mit.edu,torvalds@linux-foundation.com) - assign names to seccomp modes (rdunlap@xenotime.net) - fix style issues (rdunlap@xenotime.net) - reworded Kconfig entry (rdunlap@xenotime.net) v3: - macros to inline (oleg@redhat.com) - init_task behavior fixed (oleg@redhat.com) - drop creator entry and extra NULL check (oleg@redhat.com) - alloc returns -EINVAL on bad sizing (serge.hallyn@canonical.com) - adds tentative use of "always_unprivileged" as per torvalds@linux-foundation.org and luto@mit.edu v2: - (patch 2 only) Signed-off-by: James Morris --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sys.c') diff --git a/kernel/sys.c b/kernel/sys.c index b82568b7d201..ba0ae8eea6fb 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1908,7 +1908,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = prctl_get_seccomp(); break; case PR_SET_SECCOMP: - error = prctl_set_seccomp(arg2); + error = prctl_set_seccomp(arg2, (char __user *)arg3); break; case PR_GET_TSC: error = GET_TSC_CTL(arg2); -- cgit v1.2.2 From 078de5f706ece36afd73bb4b8283314132d2dfdf Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 8 Feb 2012 07:00:08 -0800 Subject: userns: Store uid and gid values in struct cred with kuid_t and kgid_t types cred.h and a few trivial users of struct cred are changed. The rest of the users of struct cred are left for other patches as there are too many changes to make in one go and leave the change reviewable. If the user namespace is disabled and CONFIG_UIDGID_STRICT_TYPE_CHECKS are disabled the code will contiue to compile and behave correctly. Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- kernel/sys.c | 26 +++++++++----------------- 1 file changed, 9 insertions(+), 17 deletions(-) (limited to 'kernel/sys.c') diff --git a/kernel/sys.c b/kernel/sys.c index f0c43b4b6657..39962818c008 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -175,7 +175,6 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) const struct cred *cred = current_cred(); int error = -EINVAL; struct pid *pgrp; - kuid_t cred_uid; kuid_t uid; if (which > PRIO_USER || which < PRIO_PROCESS) @@ -209,22 +208,19 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: - cred_uid = make_kuid(cred->user_ns, cred->uid); uid = make_kuid(cred->user_ns, who); user = cred->user; if (!who) - uid = cred_uid; - else if (!uid_eq(uid, cred_uid) && + uid = cred->uid; + else if (!uid_eq(uid, cred->uid) && !(user = find_user(uid))) goto out_unlock; /* No processes for this user */ do_each_thread(g, p) { - const struct cred *tcred = __task_cred(p); - kuid_t tcred_uid = make_kuid(tcred->user_ns, tcred->uid); - if (uid_eq(tcred_uid, uid)) + if (uid_eq(task_uid(p), uid)) error = set_one_prio(p, niceval, error); } while_each_thread(g, p); - if (!uid_eq(uid, cred_uid)) + if (!uid_eq(uid, cred->uid)) free_uid(user); /* For find_user() */ break; } @@ -248,7 +244,6 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) const struct cred *cred = current_cred(); long niceval, retval = -ESRCH; struct pid *pgrp; - kuid_t cred_uid; kuid_t uid; if (which > PRIO_USER || which < PRIO_PROCESS) @@ -280,25 +275,22 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: - cred_uid = make_kuid(cred->user_ns, cred->uid); uid = make_kuid(cred->user_ns, who); user = cred->user; if (!who) - uid = cred_uid; - else if (!uid_eq(uid, cred_uid) && + uid = cred->uid; + else if (!uid_eq(uid, cred->uid) && !(user = find_user(uid))) goto out_unlock; /* No processes for this user */ do_each_thread(g, p) { - const struct cred *tcred = __task_cred(p); - kuid_t tcred_uid = make_kuid(tcred->user_ns, tcred->uid); - if (uid_eq(tcred_uid, uid)) { + if (uid_eq(task_uid(p), uid)) { niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; } } while_each_thread(g, p); - if (!uid_eq(uid, cred_uid)) + if (!uid_eq(uid, cred->uid)) free_uid(user); /* for find_user() */ break; } @@ -641,7 +633,7 @@ static int set_user(struct cred *new) { struct user_struct *new_user; - new_user = alloc_uid(make_kuid(new->user_ns, new->uid)); + new_user = alloc_uid(new->uid); if (!new_user) return -EAGAIN; -- cgit v1.2.2 From a29c33f4e506e1dae7e0985b6328046535becbf8 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 7 Feb 2012 18:51:01 -0800 Subject: userns: Convert setting and getting uid and gid system calls to use kuid and kgid Convert setregid, setgid, setreuid, setuid, setresuid, getresuid, setresgid, getresgid, setfsuid, setfsgid, getuid, geteuid, getgid, getegid, waitpid, waitid, wait4. Convert userspace uids and gids into kuids and kgids before being placed on struct cred. Convert struct cred kuids and kgids into userspace uids and gids when returning them. Signed-off-by: Eric W. Biederman --- kernel/sys.c | 216 +++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 149 insertions(+), 67 deletions(-) (limited to 'kernel/sys.c') diff --git a/kernel/sys.c b/kernel/sys.c index 39962818c008..aff09f208eb3 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -555,9 +555,19 @@ void ctrl_alt_del(void) */ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) { + struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; + kgid_t krgid, kegid; + + krgid = make_kgid(ns, rgid); + kegid = make_kgid(ns, egid); + + if ((rgid != (gid_t) -1) && !gid_valid(krgid)) + return -EINVAL; + if ((egid != (gid_t) -1) && !gid_valid(kegid)) + return -EINVAL; new = prepare_creds(); if (!new) @@ -566,25 +576,25 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) retval = -EPERM; if (rgid != (gid_t) -1) { - if (old->gid == rgid || - old->egid == rgid || + if (gid_eq(old->gid, krgid) || + gid_eq(old->egid, krgid) || nsown_capable(CAP_SETGID)) - new->gid = rgid; + new->gid = krgid; else goto error; } if (egid != (gid_t) -1) { - if (old->gid == egid || - old->egid == egid || - old->sgid == egid || + if (gid_eq(old->gid, kegid) || + gid_eq(old->egid, kegid) || + gid_eq(old->sgid, kegid) || nsown_capable(CAP_SETGID)) - new->egid = egid; + new->egid = kegid; else goto error; } if (rgid != (gid_t) -1 || - (egid != (gid_t) -1 && egid != old->gid)) + (egid != (gid_t) -1 && !gid_eq(kegid, old->gid))) new->sgid = new->egid; new->fsgid = new->egid; @@ -602,9 +612,15 @@ error: */ SYSCALL_DEFINE1(setgid, gid_t, gid) { + struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; + kgid_t kgid; + + kgid = make_kgid(ns, gid); + if (!gid_valid(kgid)) + return -EINVAL; new = prepare_creds(); if (!new) @@ -613,9 +629,9 @@ SYSCALL_DEFINE1(setgid, gid_t, gid) retval = -EPERM; if (nsown_capable(CAP_SETGID)) - new->gid = new->egid = new->sgid = new->fsgid = gid; - else if (gid == old->gid || gid == old->sgid) - new->egid = new->fsgid = gid; + new->gid = new->egid = new->sgid = new->fsgid = kgid; + else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) + new->egid = new->fsgid = kgid; else goto error; @@ -672,9 +688,19 @@ static int set_user(struct cred *new) */ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) { + struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; + kuid_t kruid, keuid; + + kruid = make_kuid(ns, ruid); + keuid = make_kuid(ns, euid); + + if ((ruid != (uid_t) -1) && !uid_valid(kruid)) + return -EINVAL; + if ((euid != (uid_t) -1) && !uid_valid(keuid)) + return -EINVAL; new = prepare_creds(); if (!new) @@ -683,29 +709,29 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) retval = -EPERM; if (ruid != (uid_t) -1) { - new->uid = ruid; - if (old->uid != ruid && - old->euid != ruid && + new->uid = kruid; + if (!uid_eq(old->uid, kruid) && + !uid_eq(old->euid, kruid) && !nsown_capable(CAP_SETUID)) goto error; } if (euid != (uid_t) -1) { - new->euid = euid; - if (old->uid != euid && - old->euid != euid && - old->suid != euid && + new->euid = keuid; + if (!uid_eq(old->uid, keuid) && + !uid_eq(old->euid, keuid) && + !uid_eq(old->suid, keuid) && !nsown_capable(CAP_SETUID)) goto error; } - if (new->uid != old->uid) { + if (!uid_eq(new->uid, old->uid)) { retval = set_user(new); if (retval < 0) goto error; } if (ruid != (uid_t) -1 || - (euid != (uid_t) -1 && euid != old->uid)) + (euid != (uid_t) -1 && !uid_eq(keuid, old->uid))) new->suid = new->euid; new->fsuid = new->euid; @@ -733,9 +759,15 @@ error: */ SYSCALL_DEFINE1(setuid, uid_t, uid) { + struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; + kuid_t kuid; + + kuid = make_kuid(ns, uid); + if (!uid_valid(kuid)) + return -EINVAL; new = prepare_creds(); if (!new) @@ -744,17 +776,17 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) retval = -EPERM; if (nsown_capable(CAP_SETUID)) { - new->suid = new->uid = uid; - if (uid != old->uid) { + new->suid = new->uid = kuid; + if (!uid_eq(kuid, old->uid)) { retval = set_user(new); if (retval < 0) goto error; } - } else if (uid != old->uid && uid != new->suid) { + } else if (!uid_eq(kuid, old->uid) && !uid_eq(kuid, new->suid)) { goto error; } - new->fsuid = new->euid = uid; + new->fsuid = new->euid = kuid; retval = security_task_fix_setuid(new, old, LSM_SETID_ID); if (retval < 0) @@ -774,9 +806,24 @@ error: */ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) { + struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; + kuid_t kruid, keuid, ksuid; + + kruid = make_kuid(ns, ruid); + keuid = make_kuid(ns, euid); + ksuid = make_kuid(ns, suid); + + if ((ruid != (uid_t) -1) && !uid_valid(kruid)) + return -EINVAL; + + if ((euid != (uid_t) -1) && !uid_valid(keuid)) + return -EINVAL; + + if ((suid != (uid_t) -1) && !uid_valid(ksuid)) + return -EINVAL; new = prepare_creds(); if (!new) @@ -786,29 +833,29 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) retval = -EPERM; if (!nsown_capable(CAP_SETUID)) { - if (ruid != (uid_t) -1 && ruid != old->uid && - ruid != old->euid && ruid != old->suid) + if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && + !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) goto error; - if (euid != (uid_t) -1 && euid != old->uid && - euid != old->euid && euid != old->suid) + if (euid != (uid_t) -1 && !uid_eq(keuid, old->uid) && + !uid_eq(keuid, old->euid) && !uid_eq(keuid, old->suid)) goto error; - if (suid != (uid_t) -1 && suid != old->uid && - suid != old->euid && suid != old->suid) + if (suid != (uid_t) -1 && !uid_eq(ksuid, old->uid) && + !uid_eq(ksuid, old->euid) && !uid_eq(ksuid, old->suid)) goto error; } if (ruid != (uid_t) -1) { - new->uid = ruid; - if (ruid != old->uid) { + new->uid = kruid; + if (!uid_eq(kruid, old->uid)) { retval = set_user(new); if (retval < 0) goto error; } } if (euid != (uid_t) -1) - new->euid = euid; + new->euid = keuid; if (suid != (uid_t) -1) - new->suid = suid; + new->suid = ksuid; new->fsuid = new->euid; retval = security_task_fix_setuid(new, old, LSM_SETID_RES); @@ -822,14 +869,19 @@ error: return retval; } -SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __user *, suid) +SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t __user *, suidp) { const struct cred *cred = current_cred(); int retval; + uid_t ruid, euid, suid; + + ruid = from_kuid_munged(cred->user_ns, cred->uid); + euid = from_kuid_munged(cred->user_ns, cred->euid); + suid = from_kuid_munged(cred->user_ns, cred->suid); - if (!(retval = put_user(cred->uid, ruid)) && - !(retval = put_user(cred->euid, euid))) - retval = put_user(cred->suid, suid); + if (!(retval = put_user(ruid, ruidp)) && + !(retval = put_user(euid, euidp))) + retval = put_user(suid, suidp); return retval; } @@ -839,9 +891,22 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __u */ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) { + struct user_namespace *ns = current_user_ns(); const struct cred *old; struct cred *new; int retval; + kgid_t krgid, kegid, ksgid; + + krgid = make_kgid(ns, rgid); + kegid = make_kgid(ns, egid); + ksgid = make_kgid(ns, sgid); + + if ((rgid != (gid_t) -1) && !gid_valid(krgid)) + return -EINVAL; + if ((egid != (gid_t) -1) && !gid_valid(kegid)) + return -EINVAL; + if ((sgid != (gid_t) -1) && !gid_valid(ksgid)) + return -EINVAL; new = prepare_creds(); if (!new) @@ -850,23 +915,23 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) retval = -EPERM; if (!nsown_capable(CAP_SETGID)) { - if (rgid != (gid_t) -1 && rgid != old->gid && - rgid != old->egid && rgid != old->sgid) + if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && + !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid)) goto error; - if (egid != (gid_t) -1 && egid != old->gid && - egid != old->egid && egid != old->sgid) + if (egid != (gid_t) -1 && !gid_eq(kegid, old->gid) && + !gid_eq(kegid, old->egid) && !gid_eq(kegid, old->sgid)) goto error; - if (sgid != (gid_t) -1 && sgid != old->gid && - sgid != old->egid && sgid != old->sgid) + if (sgid != (gid_t) -1 && !gid_eq(ksgid, old->gid) && + !gid_eq(ksgid, old->egid) && !gid_eq(ksgid, old->sgid)) goto error; } if (rgid != (gid_t) -1) - new->gid = rgid; + new->gid = krgid; if (egid != (gid_t) -1) - new->egid = egid; + new->egid = kegid; if (sgid != (gid_t) -1) - new->sgid = sgid; + new->sgid = ksgid; new->fsgid = new->egid; return commit_creds(new); @@ -876,14 +941,19 @@ error: return retval; } -SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __user *, sgid) +SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t __user *, sgidp) { const struct cred *cred = current_cred(); int retval; + gid_t rgid, egid, sgid; + + rgid = from_kgid_munged(cred->user_ns, cred->gid); + egid = from_kgid_munged(cred->user_ns, cred->egid); + sgid = from_kgid_munged(cred->user_ns, cred->sgid); - if (!(retval = put_user(cred->gid, rgid)) && - !(retval = put_user(cred->egid, egid))) - retval = put_user(cred->sgid, sgid); + if (!(retval = put_user(rgid, rgidp)) && + !(retval = put_user(egid, egidp))) + retval = put_user(sgid, sgidp); return retval; } @@ -900,18 +970,24 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid) const struct cred *old; struct cred *new; uid_t old_fsuid; + kuid_t kuid; + + old = current_cred(); + old_fsuid = from_kuid_munged(old->user_ns, old->fsuid); + + kuid = make_kuid(old->user_ns, uid); + if (!uid_valid(kuid)) + return old_fsuid; new = prepare_creds(); if (!new) - return current_fsuid(); - old = current_cred(); - old_fsuid = old->fsuid; + return old_fsuid; - if (uid == old->uid || uid == old->euid || - uid == old->suid || uid == old->fsuid || + if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || + uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || nsown_capable(CAP_SETUID)) { - if (uid != old_fsuid) { - new->fsuid = uid; + if (!uid_eq(kuid, old->fsuid)) { + new->fsuid = kuid; if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) goto change_okay; } @@ -933,18 +1009,24 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid) const struct cred *old; struct cred *new; gid_t old_fsgid; + kgid_t kgid; + + old = current_cred(); + old_fsgid = from_kgid_munged(old->user_ns, old->fsgid); + + kgid = make_kgid(old->user_ns, gid); + if (!gid_valid(kgid)) + return old_fsgid; new = prepare_creds(); if (!new) - return current_fsgid(); - old = current_cred(); - old_fsgid = old->fsgid; + return old_fsgid; - if (gid == old->gid || gid == old->egid || - gid == old->sgid || gid == old->fsgid || + if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) || + gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) || nsown_capable(CAP_SETGID)) { - if (gid != old_fsgid) { - new->fsgid = gid; + if (!gid_eq(kgid, old->fsgid)) { + new->fsgid = kgid; goto change_okay; } } @@ -1503,10 +1585,10 @@ static int check_prlimit_permission(struct task_struct *task) if (cred->user_ns == tcred->user_ns && (cred->uid == tcred->euid && cred->uid == tcred->suid && - cred->uid == tcred->uid && + cred->uid == tcred->uid && cred->gid == tcred->egid && cred->gid == tcred->sgid && - cred->gid == tcred->gid)) + cred->gid == tcred->gid)) return 0; if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) return 0; -- cgit v1.2.2 From 5af662030e5db1a5560fd917250d5d688a6be586 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 3 Mar 2012 20:21:47 -0800 Subject: userns: Convert ptrace, kill, set_priority permission checks to work with kuids and kgids Update the permission checks to use the new uid_eq and gid_eq helpers and remove the now unnecessary user_ns equality comparison. Acked-by: Serge Hallyn Signed-off-by: Eric W. Biederman --- kernel/sys.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'kernel/sys.c') diff --git a/kernel/sys.c b/kernel/sys.c index aff09f208eb3..f484077b6b14 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -131,9 +131,8 @@ static bool set_one_prio_perm(struct task_struct *p) { const struct cred *cred = current_cred(), *pcred = __task_cred(p); - if (pcred->user_ns == cred->user_ns && - (pcred->uid == cred->euid || - pcred->euid == cred->euid)) + if (uid_eq(pcred->uid, cred->euid) || + uid_eq(pcred->euid, cred->euid)) return true; if (ns_capable(pcred->user_ns, CAP_SYS_NICE)) return true; @@ -1582,13 +1581,12 @@ static int check_prlimit_permission(struct task_struct *task) return 0; tcred = __task_cred(task); - if (cred->user_ns == tcred->user_ns && - (cred->uid == tcred->euid && - cred->uid == tcred->suid && - cred->uid == tcred->uid && - cred->gid == tcred->egid && - cred->gid == tcred->sgid && - cred->gid == tcred->gid)) + if (uid_eq(cred->uid, tcred->euid) && + uid_eq(cred->uid, tcred->suid) && + uid_eq(cred->uid, tcred->uid) && + gid_eq(cred->gid, tcred->egid) && + gid_eq(cred->gid, tcred->sgid) && + gid_eq(cred->gid, tcred->gid)) return 0; if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) return 0; -- cgit v1.2.2 From 499eea6bf9c06df3bf4549954aee6fb3427946ed Mon Sep 17 00:00:00 2001 From: Sasikantha babu Date: Thu, 31 May 2012 16:26:07 -0700 Subject: sethostname/setdomainname: notify userspace when there is a change in uts_kern_table sethostname() and setdomainname() notify userspace on failure (without modifying uts_kern_table). Change things so that we only notify userspace on success, when uts_kern_table was actually modified. Signed-off-by: Sasikantha babu Cc: Paul Gortmaker Cc: Greg Kroah-Hartman Cc: WANG Cong Reviewed-by: Cyrill Gorcunov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sys.c') diff --git a/kernel/sys.c b/kernel/sys.c index 6df42624e454..8b71cef3bf1a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1378,8 +1378,8 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) memcpy(u->nodename, tmp, len); memset(u->nodename + len, 0, sizeof(u->nodename) - len); errno = 0; + uts_proc_notify(UTS_PROC_HOSTNAME); } - uts_proc_notify(UTS_PROC_HOSTNAME); up_write(&uts_sem); return errno; } @@ -1429,8 +1429,8 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) memcpy(u->domainname, tmp, len); memset(u->domainname + len, 0, sizeof(u->domainname) - len); errno = 0; + uts_proc_notify(UTS_PROC_DOMAINNAME); } - uts_proc_notify(UTS_PROC_DOMAINNAME); up_write(&uts_sem); return errno; } -- cgit v1.2.2 From 81ab6e7b26b453a795d46f2616ed0e31d97f05b9 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 31 May 2012 16:26:15 -0700 Subject: kmod: convert two call sites to call_usermodehelper_fns() Both kernel/sys.c && security/keys/request_key.c where inlining the exact same code as call_usermodehelper_fns(); So simply convert these sites to directly use call_usermodehelper_fns(). Signed-off-by: Boaz Harrosh Cc: Oleg Nesterov Cc: Tetsuo Handa Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel/sys.c') diff --git a/kernel/sys.c b/kernel/sys.c index 8b71cef3bf1a..6e81aa7e4688 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2114,7 +2114,6 @@ int orderly_poweroff(bool force) NULL }; int ret = -ENOMEM; - struct subprocess_info *info; if (argv == NULL) { printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", @@ -2122,18 +2121,16 @@ int orderly_poweroff(bool force) goto out; } - info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC); - if (info == NULL) { - argv_free(argv); - goto out; - } - - call_usermodehelper_setfns(info, NULL, argv_cleanup, NULL); + ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT, + NULL, argv_cleanup, NULL); +out: + if (likely(!ret)) + return 0; - ret = call_usermodehelper_exec(info, UMH_NO_WAIT); + if (ret == -ENOMEM) + argv_free(argv); - out: - if (ret && force) { + if (force) { printk(KERN_WARNING "Failed to start orderly shutdown: " "forcing the issue\n"); -- cgit v1.2.2 From fe8c7f5cbf91124987106faa3bdf0c8b955c4cf7 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 31 May 2012 16:26:45 -0700 Subject: c/r: prctl: extend PR_SET_MM to set up more mm_struct entries During checkpoint we dump whole process memory to a file and the dump includes process stack memory. But among stack data itself, the stack carries additional parameters such as command line arguments, environment data and auxiliary vector. So when we do restore procedure and once we've restored stack data itself we need to setup mm_struct::arg_start/end, env_start/end, so restored process would be able to find command line arguments and environment data it had at checkpoint time. The same applies to auxiliary vector. For this reason additional PR_SET_MM_(ARG_START | ARG_END | ENV_START | ENV_END | AUXV) codes are introduced. Signed-off-by: Cyrill Gorcunov Acked-by: Kees Cook Cc: Tejun Heo Cc: Andrew Vagin Cc: Serge Hallyn Cc: Pavel Emelyanov Cc: Vasiliy Kulikov Cc: KAMEZAWA Hiroyuki Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 134 ++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 83 insertions(+), 51 deletions(-) (limited to 'kernel/sys.c') diff --git a/kernel/sys.c b/kernel/sys.c index 6e81aa7e4688..8b544972e46e 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1784,17 +1784,23 @@ SYSCALL_DEFINE1(umask, int, mask) } #ifdef CONFIG_CHECKPOINT_RESTORE +static bool vma_flags_mismatch(struct vm_area_struct *vma, + unsigned long required, + unsigned long banned) +{ + return (vma->vm_flags & required) != required || + (vma->vm_flags & banned); +} + static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { unsigned long rlim = rlimit(RLIMIT_DATA); - unsigned long vm_req_flags; - unsigned long vm_bad_flags; - struct vm_area_struct *vma; - int error = 0; struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + int error; - if (arg4 | arg5) + if (arg5 || (arg4 && opt != PR_SET_MM_AUXV)) return -EINVAL; if (!capable(CAP_SYS_RESOURCE)) @@ -1803,58 +1809,23 @@ static int prctl_set_mm(int opt, unsigned long addr, if (addr >= TASK_SIZE) return -EINVAL; + error = -EINVAL; + down_read(&mm->mmap_sem); vma = find_vma(mm, addr); - if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) { - /* It must be existing VMA */ - if (!vma || vma->vm_start > addr) - goto out; - } - - error = -EINVAL; switch (opt) { case PR_SET_MM_START_CODE: + mm->start_code = addr; + break; case PR_SET_MM_END_CODE: - vm_req_flags = VM_READ | VM_EXEC; - vm_bad_flags = VM_WRITE | VM_MAYSHARE; - - if ((vma->vm_flags & vm_req_flags) != vm_req_flags || - (vma->vm_flags & vm_bad_flags)) - goto out; - - if (opt == PR_SET_MM_START_CODE) - mm->start_code = addr; - else - mm->end_code = addr; + mm->end_code = addr; break; - case PR_SET_MM_START_DATA: - case PR_SET_MM_END_DATA: - vm_req_flags = VM_READ | VM_WRITE; - vm_bad_flags = VM_EXEC | VM_MAYSHARE; - - if ((vma->vm_flags & vm_req_flags) != vm_req_flags || - (vma->vm_flags & vm_bad_flags)) - goto out; - - if (opt == PR_SET_MM_START_DATA) - mm->start_data = addr; - else - mm->end_data = addr; + mm->start_data = addr; break; - - case PR_SET_MM_START_STACK: - -#ifdef CONFIG_STACK_GROWSUP - vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP; -#else - vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN; -#endif - if ((vma->vm_flags & vm_req_flags) != vm_req_flags) - goto out; - - mm->start_stack = addr; + case PR_SET_MM_END_DATA: + mm->end_data = addr; break; case PR_SET_MM_START_BRK: @@ -1881,16 +1852,77 @@ static int prctl_set_mm(int opt, unsigned long addr, mm->brk = addr; break; + /* + * If command line arguments and environment + * are placed somewhere else on stack, we can + * set them up here, ARG_START/END to setup + * command line argumets and ENV_START/END + * for environment. + */ + case PR_SET_MM_START_STACK: + case PR_SET_MM_ARG_START: + case PR_SET_MM_ARG_END: + case PR_SET_MM_ENV_START: + case PR_SET_MM_ENV_END: + if (!vma) { + error = -EFAULT; + goto out; + } +#ifdef CONFIG_STACK_GROWSUP + if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSUP, 0)) +#else + if (vma_flags_mismatch(vma, VM_READ | VM_WRITE | VM_GROWSDOWN, 0)) +#endif + goto out; + if (opt == PR_SET_MM_START_STACK) + mm->start_stack = addr; + else if (opt == PR_SET_MM_ARG_START) + mm->arg_start = addr; + else if (opt == PR_SET_MM_ARG_END) + mm->arg_end = addr; + else if (opt == PR_SET_MM_ENV_START) + mm->env_start = addr; + else if (opt == PR_SET_MM_ENV_END) + mm->env_end = addr; + break; + + /* + * This doesn't move auxiliary vector itself + * since it's pinned to mm_struct, but allow + * to fill vector with new values. It's up + * to a caller to provide sane values here + * otherwise user space tools which use this + * vector might be unhappy. + */ + case PR_SET_MM_AUXV: { + unsigned long user_auxv[AT_VECTOR_SIZE]; + + if (arg4 > sizeof(user_auxv)) + goto out; + up_read(&mm->mmap_sem); + + if (copy_from_user(user_auxv, (const void __user *)addr, arg4)) + return -EFAULT; + + /* Make sure the last entry is always AT_NULL */ + user_auxv[AT_VECTOR_SIZE - 2] = 0; + user_auxv[AT_VECTOR_SIZE - 1] = 0; + + BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); + + task_lock(current); + memcpy(mm->saved_auxv, user_auxv, arg4); + task_unlock(current); + + return 0; + } default: - error = -EINVAL; goto out; } error = 0; - out: up_read(&mm->mmap_sem); - return error; } #else /* CONFIG_CHECKPOINT_RESTORE */ -- cgit v1.2.2 From b32dfe377102ce668775f8b6b1461f7ad428f8b6 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 31 May 2012 16:26:46 -0700 Subject: c/r: prctl: add ability to set new mm_struct::exe_file When we do restore we would like to have a way to setup a former mm_struct::exe_file so that /proc/pid/exe would point to the original executable file a process had at checkpoint time. For this the PR_SET_MM_EXE_FILE code is introduced. This option takes a file descriptor which will be set as a source for new /proc/$pid/exe symlink. Note it allows to change /proc/$pid/exe if there are no VM_EXECUTABLE vmas present for current process, simply because this feature is a special to C/R and mm::num_exe_file_vmas become meaningless after that. To minimize the amount of transition the /proc/pid/exe symlink might have, this feature is implemented in one-shot manner. Thus once changed the symlink can't be changed again. This should help sysadmins to monitor the symlinks over all process running in a system. In particular one could make a snapshot of processes and ring alarm if there unexpected changes of /proc/pid/exe's in a system. Note -- this feature is available iif CONFIG_CHECKPOINT_RESTORE is set and the caller must have CAP_SYS_RESOURCE capability granted, otherwise the request to change symlink will be rejected. Signed-off-by: Cyrill Gorcunov Reviewed-by: Oleg Nesterov Cc: KOSAKI Motohiro Cc: Pavel Emelyanov Cc: Kees Cook Cc: Tejun Heo Cc: Matt Helsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) (limited to 'kernel/sys.c') diff --git a/kernel/sys.c b/kernel/sys.c index 8b544972e46e..9ff89cb9657a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -36,6 +36,8 @@ #include #include #include +#include +#include #include #include #include @@ -1792,6 +1794,57 @@ static bool vma_flags_mismatch(struct vm_area_struct *vma, (vma->vm_flags & banned); } +static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) +{ + struct file *exe_file; + struct dentry *dentry; + int err; + + /* + * Setting new mm::exe_file is only allowed when no VM_EXECUTABLE vma's + * remain. So perform a quick test first. + */ + if (mm->num_exe_file_vmas) + return -EBUSY; + + exe_file = fget(fd); + if (!exe_file) + return -EBADF; + + dentry = exe_file->f_path.dentry; + + /* + * Because the original mm->exe_file points to executable file, make + * sure that this one is executable as well, to avoid breaking an + * overall picture. + */ + err = -EACCES; + if (!S_ISREG(dentry->d_inode->i_mode) || + exe_file->f_path.mnt->mnt_flags & MNT_NOEXEC) + goto exit; + + err = inode_permission(dentry->d_inode, MAY_EXEC); + if (err) + goto exit; + + /* + * The symlink can be changed only once, just to disallow arbitrary + * transitions malicious software might bring in. This means one + * could make a snapshot over all processes running and monitor + * /proc/pid/exe changes to notice unusual activity if needed. + */ + down_write(&mm->mmap_sem); + if (likely(!mm->exe_file)) + set_mm_exe_file(mm, exe_file); + else + err = -EBUSY; + up_write(&mm->mmap_sem); + +exit: + fput(exe_file); + return err; +} + static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { @@ -1806,6 +1859,9 @@ static int prctl_set_mm(int opt, unsigned long addr, if (!capable(CAP_SYS_RESOURCE)) return -EPERM; + if (opt == PR_SET_MM_EXE_FILE) + return prctl_set_mm_exe_file(mm, (unsigned int)addr); + if (addr >= TASK_SIZE) return -EINVAL; -- cgit v1.2.2