aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2014-12-17 15:31:40 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2014-12-17 15:31:40 -0500
commit87c31b39abcb6fb6bd7d111200c9627a594bf6a9 (patch)
treeab2e5331fea9b823cb92719d0954a9141451c931
parentf045bbb9fa1bf6f507ad4de12d4e3471d8f672f1 (diff)
parentdb86da7cb76f797a1a8b445166a15cb922c6ff85 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace
Pull user namespace related fixes from Eric Biederman: "As these are bug fixes, almost all of these changes are marked for backporting to stable. The first change (implicitly adding MNT_NODEV on remount) addresses a regression that was created when security issues with unprivileged remount were closed. I go on to update the remount test to make it easy to detect if this issue reoccurs. Then there are a handful of mount and umount related fixes. Then half of the changes deal with a recently discovered design bug in the permission checks of gid_map. Unix since the beginning has allowed setting group permissions on files to less than the user and other permissions (aka ---rwx---rwx). As the unix permission checks stop as soon as a group matches, and setgroups allows setting groups that can not later be dropped, this results in a situation where it is possible to legitimately use a group to assign fewer privileges to a process. Which means dropping a group can increase a process's privileges. The fix I have adopted is that gid_map is now no longer writable without privilege unless the new file /proc/self/setgroups has been set to permanently disable setgroups. The bulk of user namespace using applications, even the applications using user namespaces without privilege, remain unaffected by this change. Unfortunately this fix breaks a couple of user space applications that were relying on the problematic behavior (one of which was tools/selftests/mount/unprivileged-remount-test.c). To hopefully prevent needing a regression fix on top of my security fix, I rounded up folks who work with the container implementations most likely to be affected and encouraged them to test the changes. > So far nothing broke on my libvirt-lxc test bed. :-) > Tested with openSUSE 13.2 and libvirt 1.2.9. > Tested-by: Richard Weinberger <richard@nod.at> > Tested on Fedora20 with libvirt 1.2.11, works fine. 
> Tested-by: Chen Hanxiao <chenhanxiao@cn.fujitsu.com> > Ok, thanks - yes, unprivileged lxc is working fine with your kernels. > Just to be sure I was testing the right thing I also tested using > my unprivileged nsexec testcases, and they failed on setgroup/setgid > as now expected, and succeeded there without your patches. > Tested-by: Serge Hallyn <serge.hallyn@ubuntu.com> > I tested this with Sandstorm. It breaks as is and it works if I add > the setgroups thing. > Tested-by: Andy Lutomirski <luto@amacapital.net> # breaks things as designed :(" * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: userns: Unbreak the unprivileged remount tests userns; Correct the comment in map_write userns: Allow setting gid_maps without privilege when setgroups is disabled userns: Add a knob to disable setgroups on a per user namespace basis userns: Rename id_map_mutex to userns_state_mutex userns: Only allow the creator of the userns unprivileged mappings userns: Check euid no fsuid when establishing an unprivileged uid mapping userns: Don't allow unprivileged creation of gid mappings userns: Don't allow setgroups until a gid mapping has been setablished userns: Document what the invariant required for safe unprivileged mappings. groups: Consolidate the setgroups permission checks mnt: Clear mnt_expire during pivot_root mnt: Carefully set CL_UNPRIVILEGED in clone_mnt mnt: Move the clear of MNT_LOCKED from copy_tree to it's callers. umount: Do not allow unmounting rootfs. umount: Disallow unprivileged mount force mnt: Update unprivileged remount test mnt: Implicitly add MNT_NODEV on remount when it was implicitly added by mount
-rw-r--r--arch/s390/kernel/compat_linux.c2
-rw-r--r--fs/namespace.c18
-rw-r--r--fs/pnode.c1
-rw-r--r--fs/proc/base.c53
-rw-r--r--include/linux/cred.h1
-rw-r--r--include/linux/user_namespace.h12
-rw-r--r--kernel/groups.c11
-rw-r--r--kernel/uid16.c2
-rw-r--r--kernel/user.c1
-rw-r--r--kernel/user_namespace.c124
-rw-r--r--tools/testing/selftests/mount/unprivileged-remount-test.c204
11 files changed, 374 insertions, 55 deletions
diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c
index ca38139423ae..437e61159279 100644
--- a/arch/s390/kernel/compat_linux.c
+++ b/arch/s390/kernel/compat_linux.c
@@ -249,7 +249,7 @@ COMPAT_SYSCALL_DEFINE2(s390_setgroups16, int, gidsetsize, u16 __user *, grouplis
249 struct group_info *group_info; 249 struct group_info *group_info;
250 int retval; 250 int retval;
251 251
252 if (!capable(CAP_SETGID)) 252 if (!may_setgroups())
253 return -EPERM; 253 return -EPERM;
254 if ((unsigned)gidsetsize > NGROUPS_MAX) 254 if ((unsigned)gidsetsize > NGROUPS_MAX)
255 return -EINVAL; 255 return -EINVAL;
diff --git a/fs/namespace.c b/fs/namespace.c
index 30df6e7dd807..820af6a1dd6b 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -963,7 +963,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
963 } 963 }
964 964
965 /* Don't allow unprivileged users to reveal what is under a mount */ 965 /* Don't allow unprivileged users to reveal what is under a mount */
966 if ((flag & CL_UNPRIVILEGED) && list_empty(&old->mnt_expire)) 966 if ((flag & CL_UNPRIVILEGED) &&
967 (!(flag & CL_EXPIRE) || list_empty(&old->mnt_expire)))
967 mnt->mnt.mnt_flags |= MNT_LOCKED; 968 mnt->mnt.mnt_flags |= MNT_LOCKED;
968 969
969 atomic_inc(&sb->s_active); 970 atomic_inc(&sb->s_active);
@@ -1544,6 +1545,9 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
1544 goto dput_and_out; 1545 goto dput_and_out;
1545 if (mnt->mnt.mnt_flags & MNT_LOCKED) 1546 if (mnt->mnt.mnt_flags & MNT_LOCKED)
1546 goto dput_and_out; 1547 goto dput_and_out;
1548 retval = -EPERM;
1549 if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN))
1550 goto dput_and_out;
1547 1551
1548 retval = do_umount(mnt, flags); 1552 retval = do_umount(mnt, flags);
1549dput_and_out: 1553dput_and_out:
@@ -1606,7 +1610,6 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
1606 if (IS_ERR(q)) 1610 if (IS_ERR(q))
1607 return q; 1611 return q;
1608 1612
1609 q->mnt.mnt_flags &= ~MNT_LOCKED;
1610 q->mnt_mountpoint = mnt->mnt_mountpoint; 1613 q->mnt_mountpoint = mnt->mnt_mountpoint;
1611 1614
1612 p = mnt; 1615 p = mnt;
@@ -2097,7 +2100,13 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
2097 } 2100 }
2098 if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) && 2101 if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
2099 !(mnt_flags & MNT_NODEV)) { 2102 !(mnt_flags & MNT_NODEV)) {
2100 return -EPERM; 2103 /* Was the nodev implicitly added in mount? */
2104 if ((mnt->mnt_ns->user_ns != &init_user_ns) &&
2105 !(sb->s_type->fs_flags & FS_USERNS_DEV_MOUNT)) {
2106 mnt_flags |= MNT_NODEV;
2107 } else {
2108 return -EPERM;
2109 }
2101 } 2110 }
2102 if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) && 2111 if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) &&
2103 !(mnt_flags & MNT_NOSUID)) { 2112 !(mnt_flags & MNT_NOSUID)) {
@@ -2958,6 +2967,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
2958 /* mount new_root on / */ 2967 /* mount new_root on / */
2959 attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp); 2968 attach_mnt(new_mnt, real_mount(root_parent.mnt), root_mp);
2960 touch_mnt_namespace(current->nsproxy->mnt_ns); 2969 touch_mnt_namespace(current->nsproxy->mnt_ns);
2970 /* A moved mount should not expire automatically */
2971 list_del_init(&new_mnt->mnt_expire);
2961 unlock_mount_hash(); 2972 unlock_mount_hash();
2962 chroot_fs_refs(&root, &new); 2973 chroot_fs_refs(&root, &new);
2963 put_mountpoint(root_mp); 2974 put_mountpoint(root_mp);
@@ -3002,6 +3013,7 @@ static void __init init_mount_tree(void)
3002 3013
3003 root.mnt = mnt; 3014 root.mnt = mnt;
3004 root.dentry = mnt->mnt_root; 3015 root.dentry = mnt->mnt_root;
3016 mnt->mnt_flags |= MNT_LOCKED;
3005 3017
3006 set_fs_pwd(current->fs, &root); 3018 set_fs_pwd(current->fs, &root);
3007 set_fs_root(current->fs, &root); 3019 set_fs_root(current->fs, &root);
diff --git a/fs/pnode.c b/fs/pnode.c
index aae331a5d03b..260ac8f898a4 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -242,6 +242,7 @@ static int propagate_one(struct mount *m)
242 child = copy_tree(last_source, last_source->mnt.mnt_root, type); 242 child = copy_tree(last_source, last_source->mnt.mnt_root, type);
243 if (IS_ERR(child)) 243 if (IS_ERR(child))
244 return PTR_ERR(child); 244 return PTR_ERR(child);
245 child->mnt.mnt_flags &= ~MNT_LOCKED;
245 mnt_set_mountpoint(m, mp, child); 246 mnt_set_mountpoint(m, mp, child);
246 last_dest = m; 247 last_dest = m;
247 last_source = child; 248 last_source = child;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 590aeda5af12..3f3d7aeb0712 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2464,6 +2464,57 @@ static const struct file_operations proc_projid_map_operations = {
2464 .llseek = seq_lseek, 2464 .llseek = seq_lseek,
2465 .release = proc_id_map_release, 2465 .release = proc_id_map_release,
2466}; 2466};
2467
2468static int proc_setgroups_open(struct inode *inode, struct file *file)
2469{
2470 struct user_namespace *ns = NULL;
2471 struct task_struct *task;
2472 int ret;
2473
2474 ret = -ESRCH;
2475 task = get_proc_task(inode);
2476 if (task) {
2477 rcu_read_lock();
2478 ns = get_user_ns(task_cred_xxx(task, user_ns));
2479 rcu_read_unlock();
2480 put_task_struct(task);
2481 }
2482 if (!ns)
2483 goto err;
2484
2485 if (file->f_mode & FMODE_WRITE) {
2486 ret = -EACCES;
2487 if (!ns_capable(ns, CAP_SYS_ADMIN))
2488 goto err_put_ns;
2489 }
2490
2491 ret = single_open(file, &proc_setgroups_show, ns);
2492 if (ret)
2493 goto err_put_ns;
2494
2495 return 0;
2496err_put_ns:
2497 put_user_ns(ns);
2498err:
2499 return ret;
2500}
2501
2502static int proc_setgroups_release(struct inode *inode, struct file *file)
2503{
2504 struct seq_file *seq = file->private_data;
2505 struct user_namespace *ns = seq->private;
2506 int ret = single_release(inode, file);
2507 put_user_ns(ns);
2508 return ret;
2509}
2510
2511static const struct file_operations proc_setgroups_operations = {
2512 .open = proc_setgroups_open,
2513 .write = proc_setgroups_write,
2514 .read = seq_read,
2515 .llseek = seq_lseek,
2516 .release = proc_setgroups_release,
2517};
2467#endif /* CONFIG_USER_NS */ 2518#endif /* CONFIG_USER_NS */
2468 2519
2469static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, 2520static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
@@ -2572,6 +2623,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2572 REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), 2623 REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
2573 REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), 2624 REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
2574 REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), 2625 REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
2626 REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations),
2575#endif 2627#endif
2576#ifdef CONFIG_CHECKPOINT_RESTORE 2628#ifdef CONFIG_CHECKPOINT_RESTORE
2577 REG("timers", S_IRUGO, proc_timers_operations), 2629 REG("timers", S_IRUGO, proc_timers_operations),
@@ -2916,6 +2968,7 @@ static const struct pid_entry tid_base_stuff[] = {
2916 REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), 2968 REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
2917 REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), 2969 REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
2918 REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), 2970 REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
2971 REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations),
2919#endif 2972#endif
2920}; 2973};
2921 2974
diff --git a/include/linux/cred.h b/include/linux/cred.h
index b2d0820837c4..2fb2ca2127ed 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -68,6 +68,7 @@ extern void groups_free(struct group_info *);
68extern int set_current_groups(struct group_info *); 68extern int set_current_groups(struct group_info *);
69extern void set_groups(struct cred *, struct group_info *); 69extern void set_groups(struct cred *, struct group_info *);
70extern int groups_search(const struct group_info *, kgid_t); 70extern int groups_search(const struct group_info *, kgid_t);
71extern bool may_setgroups(void);
71 72
72/* access the groups "array" with this macro */ 73/* access the groups "array" with this macro */
73#define GROUP_AT(gi, i) \ 74#define GROUP_AT(gi, i) \
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 4cf06c140e21..8297e5b341d8 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -18,6 +18,10 @@ struct uid_gid_map { /* 64 bytes -- 1 cache line */
18 } extent[UID_GID_MAP_MAX_EXTENTS]; 18 } extent[UID_GID_MAP_MAX_EXTENTS];
19}; 19};
20 20
21#define USERNS_SETGROUPS_ALLOWED 1UL
22
23#define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED
24
21struct user_namespace { 25struct user_namespace {
22 struct uid_gid_map uid_map; 26 struct uid_gid_map uid_map;
23 struct uid_gid_map gid_map; 27 struct uid_gid_map gid_map;
@@ -28,6 +32,7 @@ struct user_namespace {
28 kuid_t owner; 32 kuid_t owner;
29 kgid_t group; 33 kgid_t group;
30 struct ns_common ns; 34 struct ns_common ns;
35 unsigned long flags;
31 36
32 /* Register of per-UID persistent keyrings for this namespace */ 37 /* Register of per-UID persistent keyrings for this namespace */
33#ifdef CONFIG_PERSISTENT_KEYRINGS 38#ifdef CONFIG_PERSISTENT_KEYRINGS
@@ -64,6 +69,9 @@ extern const struct seq_operations proc_projid_seq_operations;
64extern ssize_t proc_uid_map_write(struct file *, const char __user *, size_t, loff_t *); 69extern ssize_t proc_uid_map_write(struct file *, const char __user *, size_t, loff_t *);
65extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, loff_t *); 70extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, loff_t *);
66extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *); 71extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *);
72extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *);
73extern int proc_setgroups_show(struct seq_file *m, void *v);
74extern bool userns_may_setgroups(const struct user_namespace *ns);
67#else 75#else
68 76
69static inline struct user_namespace *get_user_ns(struct user_namespace *ns) 77static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
@@ -88,6 +96,10 @@ static inline void put_user_ns(struct user_namespace *ns)
88{ 96{
89} 97}
90 98
99static inline bool userns_may_setgroups(const struct user_namespace *ns)
100{
101 return true;
102}
91#endif 103#endif
92 104
93#endif /* _LINUX_USER_H */ 105#endif /* _LINUX_USER_H */
diff --git a/kernel/groups.c b/kernel/groups.c
index 451698f86cfa..664411f171b5 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -6,6 +6,7 @@
6#include <linux/slab.h> 6#include <linux/slab.h>
7#include <linux/security.h> 7#include <linux/security.h>
8#include <linux/syscalls.h> 8#include <linux/syscalls.h>
9#include <linux/user_namespace.h>
9#include <asm/uaccess.h> 10#include <asm/uaccess.h>
10 11
11/* init to 2 - one for init_task, one to ensure it is never freed */ 12/* init to 2 - one for init_task, one to ensure it is never freed */
@@ -213,6 +214,14 @@ out:
213 return i; 214 return i;
214} 215}
215 216
217bool may_setgroups(void)
218{
219 struct user_namespace *user_ns = current_user_ns();
220
221 return ns_capable(user_ns, CAP_SETGID) &&
222 userns_may_setgroups(user_ns);
223}
224
216/* 225/*
217 * SMP: Our groups are copy-on-write. We can set them safely 226 * SMP: Our groups are copy-on-write. We can set them safely
218 * without another task interfering. 227 * without another task interfering.
@@ -223,7 +232,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
223 struct group_info *group_info; 232 struct group_info *group_info;
224 int retval; 233 int retval;
225 234
226 if (!ns_capable(current_user_ns(), CAP_SETGID)) 235 if (!may_setgroups())
227 return -EPERM; 236 return -EPERM;
228 if ((unsigned)gidsetsize > NGROUPS_MAX) 237 if ((unsigned)gidsetsize > NGROUPS_MAX)
229 return -EINVAL; 238 return -EINVAL;
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 602e5bbbceff..d58cc4d8f0d1 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -176,7 +176,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
176 struct group_info *group_info; 176 struct group_info *group_info;
177 int retval; 177 int retval;
178 178
179 if (!ns_capable(current_user_ns(), CAP_SETGID)) 179 if (!may_setgroups())
180 return -EPERM; 180 return -EPERM;
181 if ((unsigned)gidsetsize > NGROUPS_MAX) 181 if ((unsigned)gidsetsize > NGROUPS_MAX)
182 return -EINVAL; 182 return -EINVAL;
diff --git a/kernel/user.c b/kernel/user.c
index 69b800aebf13..b069ccbfb0b0 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -54,6 +54,7 @@ struct user_namespace init_user_ns = {
54#ifdef CONFIG_USER_NS 54#ifdef CONFIG_USER_NS
55 .ns.ops = &userns_operations, 55 .ns.ops = &userns_operations,
56#endif 56#endif
57 .flags = USERNS_INIT_FLAGS,
57#ifdef CONFIG_PERSISTENT_KEYRINGS 58#ifdef CONFIG_PERSISTENT_KEYRINGS
58 .persistent_keyring_register_sem = 59 .persistent_keyring_register_sem =
59 __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem), 60 __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem),
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 1491ad00388f..4109f8320684 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -24,6 +24,7 @@
24#include <linux/fs_struct.h> 24#include <linux/fs_struct.h>
25 25
26static struct kmem_cache *user_ns_cachep __read_mostly; 26static struct kmem_cache *user_ns_cachep __read_mostly;
27static DEFINE_MUTEX(userns_state_mutex);
27 28
28static bool new_idmap_permitted(const struct file *file, 29static bool new_idmap_permitted(const struct file *file,
29 struct user_namespace *ns, int cap_setid, 30 struct user_namespace *ns, int cap_setid,
@@ -100,6 +101,11 @@ int create_user_ns(struct cred *new)
100 ns->owner = owner; 101 ns->owner = owner;
101 ns->group = group; 102 ns->group = group;
102 103
104 /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
105 mutex_lock(&userns_state_mutex);
106 ns->flags = parent_ns->flags;
107 mutex_unlock(&userns_state_mutex);
108
103 set_cred_user_ns(new, ns); 109 set_cred_user_ns(new, ns);
104 110
105#ifdef CONFIG_PERSISTENT_KEYRINGS 111#ifdef CONFIG_PERSISTENT_KEYRINGS
@@ -584,9 +590,6 @@ static bool mappings_overlap(struct uid_gid_map *new_map,
584 return false; 590 return false;
585} 591}
586 592
587
588static DEFINE_MUTEX(id_map_mutex);
589
590static ssize_t map_write(struct file *file, const char __user *buf, 593static ssize_t map_write(struct file *file, const char __user *buf,
591 size_t count, loff_t *ppos, 594 size_t count, loff_t *ppos,
592 int cap_setid, 595 int cap_setid,
@@ -603,7 +606,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
603 ssize_t ret = -EINVAL; 606 ssize_t ret = -EINVAL;
604 607
605 /* 608 /*
606 * The id_map_mutex serializes all writes to any given map. 609 * The userns_state_mutex serializes all writes to any given map.
607 * 610 *
608 * Any map is only ever written once. 611 * Any map is only ever written once.
609 * 612 *
@@ -621,7 +624,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
621 * order and smp_rmb() is guaranteed that we don't have crazy 624 * order and smp_rmb() is guaranteed that we don't have crazy
622 * architectures returning stale data. 625 * architectures returning stale data.
623 */ 626 */
624 mutex_lock(&id_map_mutex); 627 mutex_lock(&userns_state_mutex);
625 628
626 ret = -EPERM; 629 ret = -EPERM;
627 /* Only allow one successful write to the map */ 630 /* Only allow one successful write to the map */
@@ -641,7 +644,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
641 if (!page) 644 if (!page)
642 goto out; 645 goto out;
643 646
644 /* Only allow <= page size writes at the beginning of the file */ 647 /* Only allow < page size writes at the beginning of the file */
645 ret = -EINVAL; 648 ret = -EINVAL;
646 if ((*ppos != 0) || (count >= PAGE_SIZE)) 649 if ((*ppos != 0) || (count >= PAGE_SIZE))
647 goto out; 650 goto out;
@@ -751,7 +754,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
751 *ppos = count; 754 *ppos = count;
752 ret = count; 755 ret = count;
753out: 756out:
754 mutex_unlock(&id_map_mutex); 757 mutex_unlock(&userns_state_mutex);
755 if (page) 758 if (page)
756 free_page(page); 759 free_page(page);
757 return ret; 760 return ret;
@@ -813,16 +816,21 @@ static bool new_idmap_permitted(const struct file *file,
813 struct user_namespace *ns, int cap_setid, 816 struct user_namespace *ns, int cap_setid,
814 struct uid_gid_map *new_map) 817 struct uid_gid_map *new_map)
815{ 818{
816 /* Allow mapping to your own filesystem ids */ 819 const struct cred *cred = file->f_cred;
817 if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) { 820 /* Don't allow mappings that would allow anything that wouldn't
821 * be allowed without the establishment of unprivileged mappings.
822 */
823 if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) &&
824 uid_eq(ns->owner, cred->euid)) {
818 u32 id = new_map->extent[0].lower_first; 825 u32 id = new_map->extent[0].lower_first;
819 if (cap_setid == CAP_SETUID) { 826 if (cap_setid == CAP_SETUID) {
820 kuid_t uid = make_kuid(ns->parent, id); 827 kuid_t uid = make_kuid(ns->parent, id);
821 if (uid_eq(uid, file->f_cred->fsuid)) 828 if (uid_eq(uid, cred->euid))
822 return true; 829 return true;
823 } else if (cap_setid == CAP_SETGID) { 830 } else if (cap_setid == CAP_SETGID) {
824 kgid_t gid = make_kgid(ns->parent, id); 831 kgid_t gid = make_kgid(ns->parent, id);
825 if (gid_eq(gid, file->f_cred->fsgid)) 832 if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) &&
833 gid_eq(gid, cred->egid))
826 return true; 834 return true;
827 } 835 }
828 } 836 }
@@ -842,6 +850,100 @@ static bool new_idmap_permitted(const struct file *file,
842 return false; 850 return false;
843} 851}
844 852
853int proc_setgroups_show(struct seq_file *seq, void *v)
854{
855 struct user_namespace *ns = seq->private;
856 unsigned long userns_flags = ACCESS_ONCE(ns->flags);
857
858 seq_printf(seq, "%s\n",
859 (userns_flags & USERNS_SETGROUPS_ALLOWED) ?
860 "allow" : "deny");
861 return 0;
862}
863
864ssize_t proc_setgroups_write(struct file *file, const char __user *buf,
865 size_t count, loff_t *ppos)
866{
867 struct seq_file *seq = file->private_data;
868 struct user_namespace *ns = seq->private;
869 char kbuf[8], *pos;
870 bool setgroups_allowed;
871 ssize_t ret;
872
873 /* Only allow a very narrow range of strings to be written */
874 ret = -EINVAL;
875 if ((*ppos != 0) || (count >= sizeof(kbuf)))
876 goto out;
877
878 /* What was written? */
879 ret = -EFAULT;
880 if (copy_from_user(kbuf, buf, count))
881 goto out;
882 kbuf[count] = '\0';
883 pos = kbuf;
884
885 /* What is being requested? */
886 ret = -EINVAL;
887 if (strncmp(pos, "allow", 5) == 0) {
888 pos += 5;
889 setgroups_allowed = true;
890 }
891 else if (strncmp(pos, "deny", 4) == 0) {
892 pos += 4;
893 setgroups_allowed = false;
894 }
895 else
896 goto out;
897
898 /* Verify there is not trailing junk on the line */
899 pos = skip_spaces(pos);
900 if (*pos != '\0')
901 goto out;
902
903 ret = -EPERM;
904 mutex_lock(&userns_state_mutex);
905 if (setgroups_allowed) {
906 /* Enabling setgroups after setgroups has been disabled
907 * is not allowed.
908 */
909 if (!(ns->flags & USERNS_SETGROUPS_ALLOWED))
910 goto out_unlock;
911 } else {
912 /* Permanently disabling setgroups after setgroups has
913 * been enabled by writing the gid_map is not allowed.
914 */
915 if (ns->gid_map.nr_extents != 0)
916 goto out_unlock;
917 ns->flags &= ~USERNS_SETGROUPS_ALLOWED;
918 }
919 mutex_unlock(&userns_state_mutex);
920
921 /* Report a successful write */
922 *ppos = count;
923 ret = count;
924out:
925 return ret;
926out_unlock:
927 mutex_unlock(&userns_state_mutex);
928 goto out;
929}
930
931bool userns_may_setgroups(const struct user_namespace *ns)
932{
933 bool allowed;
934
935 mutex_lock(&userns_state_mutex);
936 /* It is not safe to use setgroups until a gid mapping in
937 * the user namespace has been established.
938 */
939 allowed = ns->gid_map.nr_extents != 0;
940 /* Is setgroups allowed? */
941 allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED);
942 mutex_unlock(&userns_state_mutex);
943
944 return allowed;
945}
946
845static inline struct user_namespace *to_user_ns(struct ns_common *ns) 947static inline struct user_namespace *to_user_ns(struct ns_common *ns)
846{ 948{
847 return container_of(ns, struct user_namespace, ns); 949 return container_of(ns, struct user_namespace, ns);
diff --git a/tools/testing/selftests/mount/unprivileged-remount-test.c b/tools/testing/selftests/mount/unprivileged-remount-test.c
index 1b3ff2fda4d0..517785052f1c 100644
--- a/tools/testing/selftests/mount/unprivileged-remount-test.c
+++ b/tools/testing/selftests/mount/unprivileged-remount-test.c
@@ -6,6 +6,8 @@
6#include <sys/types.h> 6#include <sys/types.h>
7#include <sys/mount.h> 7#include <sys/mount.h>
8#include <sys/wait.h> 8#include <sys/wait.h>
9#include <sys/vfs.h>
10#include <sys/statvfs.h>
9#include <stdlib.h> 11#include <stdlib.h>
10#include <unistd.h> 12#include <unistd.h>
11#include <fcntl.h> 13#include <fcntl.h>
@@ -32,11 +34,14 @@
32# define CLONE_NEWPID 0x20000000 34# define CLONE_NEWPID 0x20000000
33#endif 35#endif
34 36
37#ifndef MS_REC
38# define MS_REC 16384
39#endif
35#ifndef MS_RELATIME 40#ifndef MS_RELATIME
36#define MS_RELATIME (1 << 21) 41# define MS_RELATIME (1 << 21)
37#endif 42#endif
38#ifndef MS_STRICTATIME 43#ifndef MS_STRICTATIME
39#define MS_STRICTATIME (1 << 24) 44# define MS_STRICTATIME (1 << 24)
40#endif 45#endif
41 46
42static void die(char *fmt, ...) 47static void die(char *fmt, ...)
@@ -48,17 +53,14 @@ static void die(char *fmt, ...)
48 exit(EXIT_FAILURE); 53 exit(EXIT_FAILURE);
49} 54}
50 55
51static void write_file(char *filename, char *fmt, ...) 56static void vmaybe_write_file(bool enoent_ok, char *filename, char *fmt, va_list ap)
52{ 57{
53 char buf[4096]; 58 char buf[4096];
54 int fd; 59 int fd;
55 ssize_t written; 60 ssize_t written;
56 int buf_len; 61 int buf_len;
57 va_list ap;
58 62
59 va_start(ap, fmt);
60 buf_len = vsnprintf(buf, sizeof(buf), fmt, ap); 63 buf_len = vsnprintf(buf, sizeof(buf), fmt, ap);
61 va_end(ap);
62 if (buf_len < 0) { 64 if (buf_len < 0) {
63 die("vsnprintf failed: %s\n", 65 die("vsnprintf failed: %s\n",
64 strerror(errno)); 66 strerror(errno));
@@ -69,6 +71,8 @@ static void write_file(char *filename, char *fmt, ...)
69 71
70 fd = open(filename, O_WRONLY); 72 fd = open(filename, O_WRONLY);
71 if (fd < 0) { 73 if (fd < 0) {
74 if ((errno == ENOENT) && enoent_ok)
75 return;
72 die("open of %s failed: %s\n", 76 die("open of %s failed: %s\n",
73 filename, strerror(errno)); 77 filename, strerror(errno));
74 } 78 }
@@ -87,6 +91,65 @@ static void write_file(char *filename, char *fmt, ...)
87 } 91 }
88} 92}
89 93
94static void maybe_write_file(char *filename, char *fmt, ...)
95{
96 va_list ap;
97
98 va_start(ap, fmt);
99 vmaybe_write_file(true, filename, fmt, ap);
100 va_end(ap);
101
102}
103
104static void write_file(char *filename, char *fmt, ...)
105{
106 va_list ap;
107
108 va_start(ap, fmt);
109 vmaybe_write_file(false, filename, fmt, ap);
110 va_end(ap);
111
112}
113
114static int read_mnt_flags(const char *path)
115{
116 int ret;
117 struct statvfs stat;
118 int mnt_flags;
119
120 ret = statvfs(path, &stat);
121 if (ret != 0) {
122 die("statvfs of %s failed: %s\n",
123 path, strerror(errno));
124 }
125 if (stat.f_flag & ~(ST_RDONLY | ST_NOSUID | ST_NODEV | \
126 ST_NOEXEC | ST_NOATIME | ST_NODIRATIME | ST_RELATIME | \
127 ST_SYNCHRONOUS | ST_MANDLOCK)) {
128 die("Unrecognized mount flags\n");
129 }
130 mnt_flags = 0;
131 if (stat.f_flag & ST_RDONLY)
132 mnt_flags |= MS_RDONLY;
133 if (stat.f_flag & ST_NOSUID)
134 mnt_flags |= MS_NOSUID;
135 if (stat.f_flag & ST_NODEV)
136 mnt_flags |= MS_NODEV;
137 if (stat.f_flag & ST_NOEXEC)
138 mnt_flags |= MS_NOEXEC;
139 if (stat.f_flag & ST_NOATIME)
140 mnt_flags |= MS_NOATIME;
141 if (stat.f_flag & ST_NODIRATIME)
142 mnt_flags |= MS_NODIRATIME;
143 if (stat.f_flag & ST_RELATIME)
144 mnt_flags |= MS_RELATIME;
145 if (stat.f_flag & ST_SYNCHRONOUS)
146 mnt_flags |= MS_SYNCHRONOUS;
147 if (stat.f_flag & ST_MANDLOCK)
148 mnt_flags |= ST_MANDLOCK;
149
150 return mnt_flags;
151}
152
90static void create_and_enter_userns(void) 153static void create_and_enter_userns(void)
91{ 154{
92 uid_t uid; 155 uid_t uid;
@@ -100,13 +163,10 @@ static void create_and_enter_userns(void)
100 strerror(errno)); 163 strerror(errno));
101 } 164 }
102 165
166 maybe_write_file("/proc/self/setgroups", "deny");
103 write_file("/proc/self/uid_map", "0 %d 1", uid); 167 write_file("/proc/self/uid_map", "0 %d 1", uid);
104 write_file("/proc/self/gid_map", "0 %d 1", gid); 168 write_file("/proc/self/gid_map", "0 %d 1", gid);
105 169
106 if (setgroups(0, NULL) != 0) {
107 die("setgroups failed: %s\n",
108 strerror(errno));
109 }
110 if (setgid(0) != 0) { 170 if (setgid(0) != 0) {
111 die ("setgid(0) failed %s\n", 171 die ("setgid(0) failed %s\n",
112 strerror(errno)); 172 strerror(errno));
@@ -118,7 +178,8 @@ static void create_and_enter_userns(void)
118} 178}
119 179
120static 180static
121bool test_unpriv_remount(int mount_flags, int remount_flags, int invalid_flags) 181bool test_unpriv_remount(const char *fstype, const char *mount_options,
182 int mount_flags, int remount_flags, int invalid_flags)
122{ 183{
123 pid_t child; 184 pid_t child;
124 185
@@ -151,9 +212,11 @@ bool test_unpriv_remount(int mount_flags, int remount_flags, int invalid_flags)
151 strerror(errno)); 212 strerror(errno));
152 } 213 }
153 214
154 if (mount("testing", "/tmp", "ramfs", mount_flags, NULL) != 0) { 215 if (mount("testing", "/tmp", fstype, mount_flags, mount_options) != 0) {
155 die("mount of /tmp failed: %s\n", 216 die("mount of %s with options '%s' on /tmp failed: %s\n",
156 strerror(errno)); 217 fstype,
218 mount_options? mount_options : "",
219 strerror(errno));
157 } 220 }
158 221
159 create_and_enter_userns(); 222 create_and_enter_userns();
@@ -181,62 +244,127 @@ bool test_unpriv_remount(int mount_flags, int remount_flags, int invalid_flags)
181 244
182static bool test_unpriv_remount_simple(int mount_flags) 245static bool test_unpriv_remount_simple(int mount_flags)
183{ 246{
184 return test_unpriv_remount(mount_flags, mount_flags, 0); 247 return test_unpriv_remount("ramfs", NULL, mount_flags, mount_flags, 0);
185} 248}
186 249
187static bool test_unpriv_remount_atime(int mount_flags, int invalid_flags) 250static bool test_unpriv_remount_atime(int mount_flags, int invalid_flags)
188{ 251{
189 return test_unpriv_remount(mount_flags, mount_flags, invalid_flags); 252 return test_unpriv_remount("ramfs", NULL, mount_flags, mount_flags,
253 invalid_flags);
254}
255
256static bool test_priv_mount_unpriv_remount(void)
257{
258 pid_t child;
259 int ret;
260 const char *orig_path = "/dev";
261 const char *dest_path = "/tmp";
262 int orig_mnt_flags, remount_mnt_flags;
263
264 child = fork();
265 if (child == -1) {
266 die("fork failed: %s\n",
267 strerror(errno));
268 }
269 if (child != 0) { /* parent */
270 pid_t pid;
271 int status;
272 pid = waitpid(child, &status, 0);
273 if (pid == -1) {
274 die("waitpid failed: %s\n",
275 strerror(errno));
276 }
277 if (pid != child) {
278 die("waited for %d got %d\n",
279 child, pid);
280 }
281 if (!WIFEXITED(status)) {
282 die("child did not terminate cleanly\n");
283 }
284 return WEXITSTATUS(status) == EXIT_SUCCESS ? true : false;
285 }
286
287 orig_mnt_flags = read_mnt_flags(orig_path);
288
289 create_and_enter_userns();
290 ret = unshare(CLONE_NEWNS);
291 if (ret != 0) {
292 die("unshare(CLONE_NEWNS) failed: %s\n",
293 strerror(errno));
294 }
295
296 ret = mount(orig_path, dest_path, "bind", MS_BIND | MS_REC, NULL);
297 if (ret != 0) {
298 die("recursive bind mount of %s onto %s failed: %s\n",
299 orig_path, dest_path, strerror(errno));
300 }
301
302 ret = mount(dest_path, dest_path, "none",
303 MS_REMOUNT | MS_BIND | orig_mnt_flags , NULL);
304 if (ret != 0) {
305 /* system("cat /proc/self/mounts"); */
306 die("remount of /tmp failed: %s\n",
307 strerror(errno));
308 }
309
310 remount_mnt_flags = read_mnt_flags(dest_path);
311 if (orig_mnt_flags != remount_mnt_flags) {
312 die("Mount flags unexpectedly changed during remount of %s originally mounted on %s\n",
313 dest_path, orig_path);
314 }
315 exit(EXIT_SUCCESS);
190} 316}
191 317
192int main(int argc, char **argv) 318int main(int argc, char **argv)
193{ 319{
194 if (!test_unpriv_remount_simple(MS_RDONLY|MS_NODEV)) { 320 if (!test_unpriv_remount_simple(MS_RDONLY)) {
195 die("MS_RDONLY malfunctions\n"); 321 die("MS_RDONLY malfunctions\n");
196 } 322 }
197 if (!test_unpriv_remount_simple(MS_NODEV)) { 323 if (!test_unpriv_remount("devpts", "newinstance", MS_NODEV, MS_NODEV, 0)) {
198 die("MS_NODEV malfunctions\n"); 324 die("MS_NODEV malfunctions\n");
199 } 325 }
200 if (!test_unpriv_remount_simple(MS_NOSUID|MS_NODEV)) { 326 if (!test_unpriv_remount_simple(MS_NOSUID)) {
201 die("MS_NOSUID malfunctions\n"); 327 die("MS_NOSUID malfunctions\n");
202 } 328 }
203 if (!test_unpriv_remount_simple(MS_NOEXEC|MS_NODEV)) { 329 if (!test_unpriv_remount_simple(MS_NOEXEC)) {
204 die("MS_NOEXEC malfunctions\n"); 330 die("MS_NOEXEC malfunctions\n");
205 } 331 }
206 if (!test_unpriv_remount_atime(MS_RELATIME|MS_NODEV, 332 if (!test_unpriv_remount_atime(MS_RELATIME,
207 MS_NOATIME|MS_NODEV)) 333 MS_NOATIME))
208 { 334 {
209 die("MS_RELATIME malfunctions\n"); 335 die("MS_RELATIME malfunctions\n");
210 } 336 }
211 if (!test_unpriv_remount_atime(MS_STRICTATIME|MS_NODEV, 337 if (!test_unpriv_remount_atime(MS_STRICTATIME,
212 MS_NOATIME|MS_NODEV)) 338 MS_NOATIME))
213 { 339 {
214 die("MS_STRICTATIME malfunctions\n"); 340 die("MS_STRICTATIME malfunctions\n");
215 } 341 }
216 if (!test_unpriv_remount_atime(MS_NOATIME|MS_NODEV, 342 if (!test_unpriv_remount_atime(MS_NOATIME,
217 MS_STRICTATIME|MS_NODEV)) 343 MS_STRICTATIME))
218 { 344 {
219 die("MS_RELATIME malfunctions\n"); 345 die("MS_NOATIME malfunctions\n");
220 } 346 }
221 if (!test_unpriv_remount_atime(MS_RELATIME|MS_NODIRATIME|MS_NODEV, 347 if (!test_unpriv_remount_atime(MS_RELATIME|MS_NODIRATIME,
222 MS_NOATIME|MS_NODEV)) 348 MS_NOATIME))
223 { 349 {
224 die("MS_RELATIME malfunctions\n"); 350 die("MS_RELATIME|MS_NODIRATIME malfunctions\n");
225 } 351 }
226 if (!test_unpriv_remount_atime(MS_STRICTATIME|MS_NODIRATIME|MS_NODEV, 352 if (!test_unpriv_remount_atime(MS_STRICTATIME|MS_NODIRATIME,
227 MS_NOATIME|MS_NODEV)) 353 MS_NOATIME))
228 { 354 {
229 die("MS_RELATIME malfunctions\n"); 355 die("MS_STRICTATIME|MS_NODIRATIME malfunctions\n");
230 } 356 }
231 if (!test_unpriv_remount_atime(MS_NOATIME|MS_NODIRATIME|MS_NODEV, 357 if (!test_unpriv_remount_atime(MS_NOATIME|MS_NODIRATIME,
232 MS_STRICTATIME|MS_NODEV)) 358 MS_STRICTATIME))
233 { 359 {
234 die("MS_RELATIME malfunctions\n"); 360 die("MS_NOATIME|MS_DIRATIME malfunctions\n");
235 } 361 }
236 if (!test_unpriv_remount(MS_STRICTATIME|MS_NODEV, MS_NODEV, 362 if (!test_unpriv_remount("ramfs", NULL, MS_STRICTATIME, 0, MS_NOATIME))
237 MS_NOATIME|MS_NODEV))
238 { 363 {
239 die("Default atime malfunctions\n"); 364 die("Default atime malfunctions\n");
240 } 365 }
366 if (!test_priv_mount_unpriv_remount()) {
367 die("Mount flags unexpectedly changed after remount\n");
368 }
241 return EXIT_SUCCESS; 369 return EXIT_SUCCESS;
242} 370}