aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/user_namespace.c
diff options
context:
space:
mode:
authorEric W. Biederman <ebiederm@xmission.com>2011-11-17 03:11:58 -0500
committerEric W. Biederman <ebiederm@xmission.com>2012-04-26 05:01:39 -0400
commit22d917d80e842829d0ca0a561967d728eb1d6303 (patch)
treeb01e0566e136d3004fa9198e4cb1969fc6feff6c /kernel/user_namespace.c
parent783291e6900292521a3895583785e0c04a56c5b3 (diff)
userns: Rework the user_namespace adding uid/gid mapping support
- Convert the old uid mapping functions into compatibility wrappers - Add a uid/gid mapping layer from user space uid and gids to kernel internal uids and gids that is extent based for simplicty and speed. * Working with number space after mapping uids/gids into their kernel internal version adds only mapping complexity over what we have today, leaving the kernel code easy to understand and test. - Add proc files /proc/self/uid_map /proc/self/gid_map These files display the mapping and allow a mapping to be added if a mapping does not exist. - Allow entering the user namespace without a uid or gid mapping. Since we are starting with an existing user our uids and gids still have global mappings so are still valid and useful they just don't have local mappings. The requirement for things to work are global uid and gid so it is odd but perfectly fine not to have a local uid and gid mapping. Not requiring global uid and gid mappings greatly simplifies the logic of setting up the uid and gid mappings by allowing the mappings to be set after the namespace is created which makes the slight weirdness worth it. - Make the mappings in the initial user namespace to the global uid/gid space explicit. Today it is an identity mapping but in the future we may want to twist this for debugging, similar to what we do with jiffies. - Document the memory ordering requirements of setting the uid and gid mappings. We only allow the mappings to be set once and there are no pointers involved so the requirments are trivial but a little atypical. Performance: In this scheme for the permission checks the performance is expected to stay the same as the actuall machine instructions should remain the same. The worst case I could think of is ls -l on a large directory where all of the stat results need to be translated with from kuids and kgids to uids and gids. So I benchmarked that case on my laptop with a dual core hyperthread Intel i5-2520M cpu with 3M of cpu cache. My benchmark consisted of going to single user mode where nothing else was running. On an ext4 filesystem opening 1,000,000 files and looping through all of the files 1000 times and calling fstat on the individuals files. This was to ensure I was benchmarking stat times where the inodes were in the kernels cache, but the inode values were not in the processors cache. My results: v3.4-rc1: ~= 156ns (unmodified v3.4-rc1 with user namespace support disabled) v3.4-rc1-userns-: ~= 155ns (v3.4-rc1 with my user namespace patches and user namespace support disabled) v3.4-rc1-userns+: ~= 164ns (v3.4-rc1 with my user namespace patches and user namespace support enabled) All of the configurations ran in roughly 120ns when I performed tests that ran in the cpu cache. So in summary the performance impact is: 1ns improvement in the worst case with user namespace support compiled out. 8ns aka 5% slowdown in the worst case with user namespace support compiled in. Acked-by: Serge Hallyn <serge.hallyn@canonical.com> Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Diffstat (limited to 'kernel/user_namespace.c')
-rw-r--r--kernel/user_namespace.c545
1 files changed, 504 insertions, 41 deletions
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index ed08836558e0..7eff867bfac5 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -12,9 +12,19 @@
12#include <linux/highuid.h> 12#include <linux/highuid.h>
13#include <linux/cred.h> 13#include <linux/cred.h>
14#include <linux/securebits.h> 14#include <linux/securebits.h>
15#include <linux/keyctl.h>
16#include <linux/key-type.h>
17#include <keys/user-type.h>
18#include <linux/seq_file.h>
19#include <linux/fs.h>
20#include <linux/uaccess.h>
21#include <linux/ctype.h>
15 22
16static struct kmem_cache *user_ns_cachep __read_mostly; 23static struct kmem_cache *user_ns_cachep __read_mostly;
17 24
25static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
26 struct uid_gid_map *map);
27
18/* 28/*
19 * Create a new user namespace, deriving the creator from the user in the 29 * Create a new user namespace, deriving the creator from the user in the
20 * passed credentials, and replacing that user with the new root user for the 30 * passed credentials, and replacing that user with the new root user for the
@@ -26,7 +36,6 @@ static struct kmem_cache *user_ns_cachep __read_mostly;
26int create_user_ns(struct cred *new) 36int create_user_ns(struct cred *new)
27{ 37{
28 struct user_namespace *ns, *parent_ns = new->user_ns; 38 struct user_namespace *ns, *parent_ns = new->user_ns;
29 struct user_struct *root_user;
30 kuid_t owner = make_kuid(new->user_ns, new->euid); 39 kuid_t owner = make_kuid(new->user_ns, new->euid);
31 kgid_t group = make_kgid(new->user_ns, new->egid); 40 kgid_t group = make_kgid(new->user_ns, new->egid);
32 41
@@ -38,29 +47,15 @@ int create_user_ns(struct cred *new)
38 !kgid_has_mapping(parent_ns, group)) 47 !kgid_has_mapping(parent_ns, group))
39 return -EPERM; 48 return -EPERM;
40 49
41 ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL); 50 ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);
42 if (!ns) 51 if (!ns)
43 return -ENOMEM; 52 return -ENOMEM;
44 53
45 kref_init(&ns->kref); 54 kref_init(&ns->kref);
46
47 /* Alloc new root user. */
48 root_user = alloc_uid(make_kuid(ns, 0));
49 if (!root_user) {
50 kmem_cache_free(user_ns_cachep, ns);
51 return -ENOMEM;
52 }
53
54 /* set the new root user in the credentials under preparation */
55 ns->parent = parent_ns; 55 ns->parent = parent_ns;
56 ns->owner = owner; 56 ns->owner = owner;
57 ns->group = group; 57 ns->group = group;
58 free_uid(new->user); 58
59 new->user = root_user;
60 new->uid = new->euid = new->suid = new->fsuid = 0;
61 new->gid = new->egid = new->sgid = new->fsgid = 0;
62 put_group_info(new->group_info);
63 new->group_info = get_group_info(&init_groups);
64 /* Start with the same capabilities as init but useless for doing 59 /* Start with the same capabilities as init but useless for doing
65 * anything as the capabilities are bound to the new user namespace. 60 * anything as the capabilities are bound to the new user namespace.
66 */ 61 */
@@ -93,44 +88,512 @@ void free_user_ns(struct kref *kref)
93} 88}
94EXPORT_SYMBOL(free_user_ns); 89EXPORT_SYMBOL(free_user_ns);
95 90
96uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid) 91static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
97{ 92{
98 struct user_namespace *tmp; 93 unsigned idx, extents;
94 u32 first, last, id2;
99 95
100 if (likely(to == cred->user_ns)) 96 id2 = id + count - 1;
101 return uid;
102 97
103 /* Is cred->user the creator of the target user_ns 98 /* Find the matching extent */
104 * or the creator of one of it's parents? 99 extents = map->nr_extents;
105 */ 100 smp_read_barrier_depends();
106 for ( tmp = to; tmp != &init_user_ns; tmp = tmp->parent ) { 101 for (idx = 0; idx < extents; idx++) {
107 if (uid_eq(cred->user->uid, tmp->owner)) { 102 first = map->extent[idx].first;
108 return (uid_t)0; 103 last = first + map->extent[idx].count - 1;
109 } 104 if (id >= first && id <= last &&
105 (id2 >= first && id2 <= last))
106 break;
107 }
108 /* Map the id or note failure */
109 if (idx < extents)
110 id = (id - first) + map->extent[idx].lower_first;
111 else
112 id = (u32) -1;
113
114 return id;
115}
116
117static u32 map_id_down(struct uid_gid_map *map, u32 id)
118{
119 unsigned idx, extents;
120 u32 first, last;
121
122 /* Find the matching extent */
123 extents = map->nr_extents;
124 smp_read_barrier_depends();
125 for (idx = 0; idx < extents; idx++) {
126 first = map->extent[idx].first;
127 last = first + map->extent[idx].count - 1;
128 if (id >= first && id <= last)
129 break;
130 }
131 /* Map the id or note failure */
132 if (idx < extents)
133 id = (id - first) + map->extent[idx].lower_first;
134 else
135 id = (u32) -1;
136
137 return id;
138}
139
140static u32 map_id_up(struct uid_gid_map *map, u32 id)
141{
142 unsigned idx, extents;
143 u32 first, last;
144
145 /* Find the matching extent */
146 extents = map->nr_extents;
147 smp_read_barrier_depends();
148 for (idx = 0; idx < extents; idx++) {
149 first = map->extent[idx].lower_first;
150 last = first + map->extent[idx].count - 1;
151 if (id >= first && id <= last)
152 break;
110 } 153 }
154 /* Map the id or note failure */
155 if (idx < extents)
156 id = (id - first) + map->extent[idx].first;
157 else
158 id = (u32) -1;
159
160 return id;
161}
162
163/**
164 * make_kuid - Map a user-namespace uid pair into a kuid.
165 * @ns: User namespace that the uid is in
166 * @uid: User identifier
167 *
168 * Maps a user-namespace uid pair into a kernel internal kuid,
169 * and returns that kuid.
170 *
171 * When there is no mapping defined for the user-namespace uid
172 * pair INVALID_UID is returned. Callers are expected to test
173 * for and handle handle INVALID_UID being returned. INVALID_UID
174 * may be tested for using uid_valid().
175 */
176kuid_t make_kuid(struct user_namespace *ns, uid_t uid)
177{
178 /* Map the uid to a global kernel uid */
179 return KUIDT_INIT(map_id_down(&ns->uid_map, uid));
180}
181EXPORT_SYMBOL(make_kuid);
182
183/**
184 * from_kuid - Create a uid from a kuid user-namespace pair.
185 * @targ: The user namespace we want a uid in.
186 * @kuid: The kernel internal uid to start with.
187 *
188 * Map @kuid into the user-namespace specified by @targ and
189 * return the resulting uid.
190 *
191 * There is always a mapping into the initial user_namespace.
192 *
193 * If @kuid has no mapping in @targ (uid_t)-1 is returned.
194 */
195uid_t from_kuid(struct user_namespace *targ, kuid_t kuid)
196{
197 /* Map the uid from a global kernel uid */
198 return map_id_up(&targ->uid_map, __kuid_val(kuid));
199}
200EXPORT_SYMBOL(from_kuid);
201
202/**
203 * from_kuid_munged - Create a uid from a kuid user-namespace pair.
204 * @targ: The user namespace we want a uid in.
205 * @kuid: The kernel internal uid to start with.
206 *
207 * Map @kuid into the user-namespace specified by @targ and
208 * return the resulting uid.
209 *
210 * There is always a mapping into the initial user_namespace.
211 *
212 * Unlike from_kuid from_kuid_munged never fails and always
213 * returns a valid uid. This makes from_kuid_munged appropriate
214 * for use in syscalls like stat and getuid where failing the
215 * system call and failing to provide a valid uid are not an
216 * options.
217 *
218 * If @kuid has no mapping in @targ overflowuid is returned.
219 */
220uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid)
221{
222 uid_t uid;
223 uid = from_kuid(targ, kuid);
224
225 if (uid == (uid_t) -1)
226 uid = overflowuid;
227 return uid;
228}
229EXPORT_SYMBOL(from_kuid_munged);
230
231/**
232 * make_kgid - Map a user-namespace gid pair into a kgid.
233 * @ns: User namespace that the gid is in
234 * @uid: group identifier
235 *
236 * Maps a user-namespace gid pair into a kernel internal kgid,
237 * and returns that kgid.
238 *
239 * When there is no mapping defined for the user-namespace gid
240 * pair INVALID_GID is returned. Callers are expected to test
241 * for and handle INVALID_GID being returned. INVALID_GID may be
242 * tested for using gid_valid().
243 */
244kgid_t make_kgid(struct user_namespace *ns, gid_t gid)
245{
246 /* Map the gid to a global kernel gid */
247 return KGIDT_INIT(map_id_down(&ns->gid_map, gid));
248}
249EXPORT_SYMBOL(make_kgid);
250
251/**
252 * from_kgid - Create a gid from a kgid user-namespace pair.
253 * @targ: The user namespace we want a gid in.
254 * @kgid: The kernel internal gid to start with.
255 *
256 * Map @kgid into the user-namespace specified by @targ and
257 * return the resulting gid.
258 *
259 * There is always a mapping into the initial user_namespace.
260 *
261 * If @kgid has no mapping in @targ (gid_t)-1 is returned.
262 */
263gid_t from_kgid(struct user_namespace *targ, kgid_t kgid)
264{
265 /* Map the gid from a global kernel gid */
266 return map_id_up(&targ->gid_map, __kgid_val(kgid));
267}
268EXPORT_SYMBOL(from_kgid);
269
270/**
271 * from_kgid_munged - Create a gid from a kgid user-namespace pair.
272 * @targ: The user namespace we want a gid in.
273 * @kgid: The kernel internal gid to start with.
274 *
275 * Map @kgid into the user-namespace specified by @targ and
276 * return the resulting gid.
277 *
278 * There is always a mapping into the initial user_namespace.
279 *
280 * Unlike from_kgid from_kgid_munged never fails and always
281 * returns a valid gid. This makes from_kgid_munged appropriate
282 * for use in syscalls like stat and getgid where failing the
283 * system call and failing to provide a valid gid are not options.
284 *
285 * If @kgid has no mapping in @targ overflowgid is returned.
286 */
287gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid)
288{
289 gid_t gid;
290 gid = from_kgid(targ, kgid);
291
292 if (gid == (gid_t) -1)
293 gid = overflowgid;
294 return gid;
295}
296EXPORT_SYMBOL(from_kgid_munged);
297
298static int uid_m_show(struct seq_file *seq, void *v)
299{
300 struct user_namespace *ns = seq->private;
301 struct uid_gid_extent *extent = v;
302 struct user_namespace *lower_ns;
303 uid_t lower;
111 304
112 /* No useful relationship so no mapping */ 305 lower_ns = current_user_ns();
113 return overflowuid; 306 if ((lower_ns == ns) && lower_ns->parent)
307 lower_ns = lower_ns->parent;
308
309 lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first));
310
311 seq_printf(seq, "%10u %10u %10u\n",
312 extent->first,
313 lower,
314 extent->count);
315
316 return 0;
114} 317}
115 318
116gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid) 319static int gid_m_show(struct seq_file *seq, void *v)
117{ 320{
118 struct user_namespace *tmp; 321 struct user_namespace *ns = seq->private;
322 struct uid_gid_extent *extent = v;
323 struct user_namespace *lower_ns;
324 gid_t lower;
119 325
120 if (likely(to == cred->user_ns)) 326 lower_ns = current_user_ns();
121 return gid; 327 if ((lower_ns == ns) && lower_ns->parent)
328 lower_ns = lower_ns->parent;
122 329
123 /* Is cred->user the creator of the target user_ns 330 lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first));
124 * or the creator of one of it's parents? 331
332 seq_printf(seq, "%10u %10u %10u\n",
333 extent->first,
334 lower,
335 extent->count);
336
337 return 0;
338}
339
340static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map)
341{
342 struct uid_gid_extent *extent = NULL;
343 loff_t pos = *ppos;
344
345 if (pos < map->nr_extents)
346 extent = &map->extent[pos];
347
348 return extent;
349}
350
351static void *uid_m_start(struct seq_file *seq, loff_t *ppos)
352{
353 struct user_namespace *ns = seq->private;
354
355 return m_start(seq, ppos, &ns->uid_map);
356}
357
358static void *gid_m_start(struct seq_file *seq, loff_t *ppos)
359{
360 struct user_namespace *ns = seq->private;
361
362 return m_start(seq, ppos, &ns->gid_map);
363}
364
365static void *m_next(struct seq_file *seq, void *v, loff_t *pos)
366{
367 (*pos)++;
368 return seq->op->start(seq, pos);
369}
370
371static void m_stop(struct seq_file *seq, void *v)
372{
373 return;
374}
375
376struct seq_operations proc_uid_seq_operations = {
377 .start = uid_m_start,
378 .stop = m_stop,
379 .next = m_next,
380 .show = uid_m_show,
381};
382
383struct seq_operations proc_gid_seq_operations = {
384 .start = gid_m_start,
385 .stop = m_stop,
386 .next = m_next,
387 .show = gid_m_show,
388};
389
390static DEFINE_MUTEX(id_map_mutex);
391
392static ssize_t map_write(struct file *file, const char __user *buf,
393 size_t count, loff_t *ppos,
394 int cap_setid,
395 struct uid_gid_map *map,
396 struct uid_gid_map *parent_map)
397{
398 struct seq_file *seq = file->private_data;
399 struct user_namespace *ns = seq->private;
400 struct uid_gid_map new_map;
401 unsigned idx;
402 struct uid_gid_extent *extent, *last = NULL;
403 unsigned long page = 0;
404 char *kbuf, *pos, *next_line;
405 ssize_t ret = -EINVAL;
406
407 /*
408 * The id_map_mutex serializes all writes to any given map.
409 *
410 * Any map is only ever written once.
411 *
412 * An id map fits within 1 cache line on most architectures.
413 *
414 * On read nothing needs to be done unless you are on an
415 * architecture with a crazy cache coherency model like alpha.
416 *
417 * There is a one time data dependency between reading the
418 * count of the extents and the values of the extents. The
419 * desired behavior is to see the values of the extents that
420 * were written before the count of the extents.
421 *
422 * To achieve this smp_wmb() is used on guarantee the write
423 * order and smp_read_barrier_depends() is guaranteed that we
424 * don't have crazy architectures returning stale data.
425 *
426 */
427 mutex_lock(&id_map_mutex);
428
429 ret = -EPERM;
430 /* Only allow one successful write to the map */
431 if (map->nr_extents != 0)
432 goto out;
433
434 /* Require the appropriate privilege CAP_SETUID or CAP_SETGID
435 * over the user namespace in order to set the id mapping.
125 */ 436 */
126 for ( tmp = to; tmp != &init_user_ns; tmp = tmp->parent ) { 437 if (!ns_capable(ns, cap_setid))
127 if (uid_eq(cred->user->uid, tmp->owner)) { 438 goto out;
128 return (gid_t)0; 439
440 /* Get a buffer */
441 ret = -ENOMEM;
442 page = __get_free_page(GFP_TEMPORARY);
443 kbuf = (char *) page;
444 if (!page)
445 goto out;
446
447 /* Only allow <= page size writes at the beginning of the file */
448 ret = -EINVAL;
449 if ((*ppos != 0) || (count >= PAGE_SIZE))
450 goto out;
451
452 /* Slurp in the user data */
453 ret = -EFAULT;
454 if (copy_from_user(kbuf, buf, count))
455 goto out;
456 kbuf[count] = '\0';
457
458 /* Parse the user data */
459 ret = -EINVAL;
460 pos = kbuf;
461 new_map.nr_extents = 0;
462 for (;pos; pos = next_line) {
463 extent = &new_map.extent[new_map.nr_extents];
464
465 /* Find the end of line and ensure I don't look past it */
466 next_line = strchr(pos, '\n');
467 if (next_line) {
468 *next_line = '\0';
469 next_line++;
470 if (*next_line == '\0')
471 next_line = NULL;
129 } 472 }
473
474 pos = skip_spaces(pos);
475 extent->first = simple_strtoul(pos, &pos, 10);
476 if (!isspace(*pos))
477 goto out;
478
479 pos = skip_spaces(pos);
480 extent->lower_first = simple_strtoul(pos, &pos, 10);
481 if (!isspace(*pos))
482 goto out;
483
484 pos = skip_spaces(pos);
485 extent->count = simple_strtoul(pos, &pos, 10);
486 if (*pos && !isspace(*pos))
487 goto out;
488
489 /* Verify there is not trailing junk on the line */
490 pos = skip_spaces(pos);
491 if (*pos != '\0')
492 goto out;
493
494 /* Verify we have been given valid starting values */
495 if ((extent->first == (u32) -1) ||
496 (extent->lower_first == (u32) -1 ))
497 goto out;
498
499 /* Verify count is not zero and does not cause the extent to wrap */
500 if ((extent->first + extent->count) <= extent->first)
501 goto out;
502 if ((extent->lower_first + extent->count) <= extent->lower_first)
503 goto out;
504
505 /* For now only accept extents that are strictly in order */
506 if (last &&
507 (((last->first + last->count) > extent->first) ||
508 ((last->lower_first + last->count) > extent->lower_first)))
509 goto out;
510
511 new_map.nr_extents++;
512 last = extent;
513
514 /* Fail if the file contains too many extents */
515 if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) &&
516 (next_line != NULL))
517 goto out;
130 } 518 }
519 /* Be very certaint the new map actually exists */
520 if (new_map.nr_extents == 0)
521 goto out;
522
523 ret = -EPERM;
524 /* Validate the user is allowed to use user id's mapped to. */
525 if (!new_idmap_permitted(ns, cap_setid, &new_map))
526 goto out;
527
528 /* Map the lower ids from the parent user namespace to the
529 * kernel global id space.
530 */
531 for (idx = 0; idx < new_map.nr_extents; idx++) {
532 u32 lower_first;
533 extent = &new_map.extent[idx];
534
535 lower_first = map_id_range_down(parent_map,
536 extent->lower_first,
537 extent->count);
538
539 /* Fail if we can not map the specified extent to
540 * the kernel global id space.
541 */
542 if (lower_first == (u32) -1)
543 goto out;
544
545 extent->lower_first = lower_first;
546 }
547
548 /* Install the map */
549 memcpy(map->extent, new_map.extent,
550 new_map.nr_extents*sizeof(new_map.extent[0]));
551 smp_wmb();
552 map->nr_extents = new_map.nr_extents;
553
554 *ppos = count;
555 ret = count;
556out:
557 mutex_unlock(&id_map_mutex);
558 if (page)
559 free_page(page);
560 return ret;
561}
562
563ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos)
564{
565 struct seq_file *seq = file->private_data;
566 struct user_namespace *ns = seq->private;
567
568 if (!ns->parent)
569 return -EPERM;
570
571 return map_write(file, buf, size, ppos, CAP_SETUID,
572 &ns->uid_map, &ns->parent->uid_map);
573}
574
575ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos)
576{
577 struct seq_file *seq = file->private_data;
578 struct user_namespace *ns = seq->private;
579
580 if (!ns->parent)
581 return -EPERM;
582
583 return map_write(file, buf, size, ppos, CAP_SETGID,
584 &ns->gid_map, &ns->parent->gid_map);
585}
586
587static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
588 struct uid_gid_map *new_map)
589{
590 /* Allow the specified ids if we have the appropriate capability
591 * (CAP_SETUID or CAP_SETGID) over the parent user namespace.
592 */
593 if (ns_capable(ns->parent, cap_setid))
594 return true;
131 595
132 /* No useful relationship so no mapping */ 596 return false;
133 return overflowgid;
134} 597}
135 598
136static __init int user_namespaces_init(void) 599static __init int user_namespaces_init(void)