aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Pavel Emelyanov <xemul@openvz.org> 2007-10-19 02:39:54 -0400
committer: Linus Torvalds <torvalds@woody.linux-foundation.org> 2007-10-19 14:53:37 -0400
commit: cf7b708c8d1d7a27736771bcf4c457b332b0f818 (patch)
tree: 10f80257b052313b283f18ddfe35145882e0b47f
parent: a6f5e06378970a2687332c2d54046245fcff1e7e (diff)
Make access to task's nsproxy lighter
When someone wants to deal with some other task's namespaces, it has to lock the task and then get the desired namespace, if one exists. This is slow on read-only paths and may be impossible in some cases.

E.g. Oleg recently noticed a race between unshare() and the (sent for review in cgroups) pid namespaces - when the task notifies the parent it has to know the parent's namespace, but taking the task_lock() is impossible there - the code runs under the write-locked tasklist lock.

On the other hand, switching the namespace on a task (daemonize) and releasing the namespace (after the last task exits) are rather rare operations, and we can sacrifice their speed to solve the issues above.

The access to other task namespaces is proposed to be performed like this:

    rcu_read_lock();
    nsproxy = task_nsproxy(tsk);
    if (nsproxy != NULL) {
            /*
             * work with the namespaces here
             * e.g. get the reference on one of them
             */
    } /*
       * NULL task_nsproxy() means that this task is
       * almost dead (zombie)
       */
    rcu_read_unlock();

This patch has passed the review by Eric and Oleg :) and, of course, has been tested.

[clg@fr.ibm.com: fix unshare()]
[ebiederm@xmission.com: Update get_net_ns_by_pid]
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- fs/proc/base.c 27
-rw-r--r-- include/linux/nsproxy.h 43
-rw-r--r-- kernel/exit.c 7
-rw-r--r-- kernel/fork.c 11
-rw-r--r-- kernel/nsproxy.c 40
-rw-r--r-- net/core/rtnetlink.c 8
6 files changed, 91 insertions, 45 deletions
diff --git a/fs/proc/base.c b/fs/proc/base.c
index fbff900fd5ad..6afca09a6534 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -350,18 +350,21 @@ struct proc_mounts {
350static int mounts_open(struct inode *inode, struct file *file) 350static int mounts_open(struct inode *inode, struct file *file)
351{ 351{
352 struct task_struct *task = get_proc_task(inode); 352 struct task_struct *task = get_proc_task(inode);
353 struct nsproxy *nsp;
353 struct mnt_namespace *ns = NULL; 354 struct mnt_namespace *ns = NULL;
354 struct proc_mounts *p; 355 struct proc_mounts *p;
355 int ret = -EINVAL; 356 int ret = -EINVAL;
356 357
357 if (task) { 358 if (task) {
358 task_lock(task); 359 rcu_read_lock();
359 if (task->nsproxy) { 360 nsp = task_nsproxy(task);
360 ns = task->nsproxy->mnt_ns; 361 if (nsp) {
362 ns = nsp->mnt_ns;
361 if (ns) 363 if (ns)
362 get_mnt_ns(ns); 364 get_mnt_ns(ns);
363 } 365 }
364 task_unlock(task); 366 rcu_read_unlock();
367
365 put_task_struct(task); 368 put_task_struct(task);
366 } 369 }
367 370
@@ -424,16 +427,20 @@ static int mountstats_open(struct inode *inode, struct file *file)
424 427
425 if (!ret) { 428 if (!ret) {
426 struct seq_file *m = file->private_data; 429 struct seq_file *m = file->private_data;
430 struct nsproxy *nsp;
427 struct mnt_namespace *mnt_ns = NULL; 431 struct mnt_namespace *mnt_ns = NULL;
428 struct task_struct *task = get_proc_task(inode); 432 struct task_struct *task = get_proc_task(inode);
429 433
430 if (task) { 434 if (task) {
431 task_lock(task); 435 rcu_read_lock();
432 if (task->nsproxy) 436 nsp = task_nsproxy(task);
433 mnt_ns = task->nsproxy->mnt_ns; 437 if (nsp) {
434 if (mnt_ns) 438 mnt_ns = nsp->mnt_ns;
435 get_mnt_ns(mnt_ns); 439 if (mnt_ns)
436 task_unlock(task); 440 get_mnt_ns(mnt_ns);
441 }
442 rcu_read_unlock();
443
437 put_task_struct(task); 444 put_task_struct(task);
438 } 445 }
439 446
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index f1eca68751a9..0e66b57631fc 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -32,8 +32,39 @@ struct nsproxy {
32}; 32};
33extern struct nsproxy init_nsproxy; 33extern struct nsproxy init_nsproxy;
34 34
35/*
36 * the namespaces access rules are:
37 *
38 * 1. only current task is allowed to change tsk->nsproxy pointer or
39 * any pointer on the nsproxy itself
40 *
41 * 2. when accessing (i.e. reading) current task's namespaces - no
42 * precautions should be taken - just dereference the pointers
43 *
44 * 3. the access to other task namespaces is performed like this
45 * rcu_read_lock();
46 * nsproxy = task_nsproxy(tsk);
47 * if (nsproxy != NULL) {
48 * / *
49 * * work with the namespaces here
50 * * e.g. get the reference on one of them
51 * * /
52 * } / *
53 * * NULL task_nsproxy() means that this task is
54 * * almost dead (zombie)
55 * * /
56 * rcu_read_unlock();
57 *
58 */
59
60static inline struct nsproxy *task_nsproxy(struct task_struct *tsk)
61{
62 return rcu_dereference(tsk->nsproxy);
63}
64
35int copy_namespaces(unsigned long flags, struct task_struct *tsk); 65int copy_namespaces(unsigned long flags, struct task_struct *tsk);
36void get_task_namespaces(struct task_struct *tsk); 66void exit_task_namespaces(struct task_struct *tsk);
67void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
37void free_nsproxy(struct nsproxy *ns); 68void free_nsproxy(struct nsproxy *ns);
38int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, 69int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **,
39 struct fs_struct *); 70 struct fs_struct *);
@@ -45,15 +76,9 @@ static inline void put_nsproxy(struct nsproxy *ns)
45 } 76 }
46} 77}
47 78
48static inline void exit_task_namespaces(struct task_struct *p) 79static inline void get_nsproxy(struct nsproxy *ns)
49{ 80{
50 struct nsproxy *ns = p->nsproxy; 81 atomic_inc(&ns->count);
51 if (ns) {
52 task_lock(p);
53 p->nsproxy = NULL;
54 task_unlock(p);
55 put_nsproxy(ns);
56 }
57} 82}
58 83
59#ifdef CONFIG_CGROUP_NS 84#ifdef CONFIG_CGROUP_NS
diff --git a/kernel/exit.c b/kernel/exit.c
index d22aefabb129..db9764186d5a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -400,9 +400,10 @@ void daemonize(const char *name, ...)
400 current->fs = fs; 400 current->fs = fs;
401 atomic_inc(&fs->count); 401 atomic_inc(&fs->count);
402 402
403 exit_task_namespaces(current); 403 if (current->nsproxy != init_task.nsproxy) {
404 current->nsproxy = init_task.nsproxy; 404 get_nsproxy(init_task.nsproxy);
405 get_task_namespaces(current); 405 switch_task_namespaces(current, init_task.nsproxy);
406 }
406 407
407 exit_files(current); 408 exit_files(current);
408 current->files = init_task.files; 409 current->files = init_task.files;
diff --git a/kernel/fork.c b/kernel/fork.c
index 2deaf481efab..d2f4a420a5b9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1632,7 +1632,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1632 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; 1632 struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
1633 struct files_struct *fd, *new_fd = NULL; 1633 struct files_struct *fd, *new_fd = NULL;
1634 struct sem_undo_list *new_ulist = NULL; 1634 struct sem_undo_list *new_ulist = NULL;
1635 struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL; 1635 struct nsproxy *new_nsproxy = NULL;
1636 1636
1637 check_unshare_flags(&unshare_flags); 1637 check_unshare_flags(&unshare_flags);
1638 1638
@@ -1662,14 +1662,13 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
1662 1662
1663 if (new_fs || new_mm || new_fd || new_ulist || new_nsproxy) { 1663 if (new_fs || new_mm || new_fd || new_ulist || new_nsproxy) {
1664 1664
1665 task_lock(current);
1666
1667 if (new_nsproxy) { 1665 if (new_nsproxy) {
1668 old_nsproxy = current->nsproxy; 1666 switch_task_namespaces(current, new_nsproxy);
1669 current->nsproxy = new_nsproxy; 1667 new_nsproxy = NULL;
1670 new_nsproxy = old_nsproxy;
1671 } 1668 }
1672 1669
1670 task_lock(current);
1671
1673 if (new_fs) { 1672 if (new_fs) {
1674 fs = current->fs; 1673 fs = current->fs;
1675 current->fs = new_fs; 1674 current->fs = new_fs;
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index e981c61304f1..c8ef7c2992ed 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -26,19 +26,6 @@ static struct kmem_cache *nsproxy_cachep;
26 26
27struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); 27struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);
28 28
29static inline void get_nsproxy(struct nsproxy *ns)
30{
31 atomic_inc(&ns->count);
32}
33
34void get_task_namespaces(struct task_struct *tsk)
35{
36 struct nsproxy *ns = tsk->nsproxy;
37 if (ns) {
38 get_nsproxy(ns);
39 }
40}
41
42/* 29/*
43 * creates a copy of "orig" with refcount 1. 30 * creates a copy of "orig" with refcount 1.
44 */ 31 */
@@ -216,6 +203,33 @@ out:
216 return err; 203 return err;
217} 204}
218 205
206void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
207{
208 struct nsproxy *ns;
209
210 might_sleep();
211
212 ns = p->nsproxy;
213
214 rcu_assign_pointer(p->nsproxy, new);
215
216 if (ns && atomic_dec_and_test(&ns->count)) {
217 /*
218 * wait for others to get what they want from this nsproxy.
219 *
220 * cannot release this nsproxy via the call_rcu() since
221 * put_mnt_ns() will want to sleep
222 */
223 synchronize_rcu();
224 free_nsproxy(ns);
225 }
226}
227
228void exit_task_namespaces(struct task_struct *p)
229{
230 switch_task_namespaces(p, NULL);
231}
232
219static int __init nsproxy_cache_init(void) 233static int __init nsproxy_cache_init(void)
220{ 234{
221 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); 235 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 1072d16696c3..4a2640d38261 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -744,10 +744,10 @@ static struct net *get_net_ns_by_pid(pid_t pid)
744 rcu_read_lock(); 744 rcu_read_lock();
745 tsk = find_task_by_pid(pid); 745 tsk = find_task_by_pid(pid);
746 if (tsk) { 746 if (tsk) {
747 task_lock(tsk); 747 struct nsproxy *nsproxy;
748 if (tsk->nsproxy) 748 nsproxy = task_nsproxy(tsk);
749 net = get_net(tsk->nsproxy->net_ns); 749 if (nsproxy)
750 task_unlock(tsk); 750 net = get_net(nsproxy->net_ns);
751 } 751 }
752 rcu_read_unlock(); 752 rcu_read_unlock();
753 return net; 753 return net;