aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJack Miller <millerjo@us.ibm.com>2014-08-08 17:23:19 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2014-08-08 18:57:26 -0400
commitab602f799159393143d567e5c04b936fec79d6bd (patch)
treeae2ebcc0d372cf0efb91cf2b95fca33f304a69da
parent9687fd9101afaa1c4b1de7ffd2f9d7e53f45b29f (diff)
shm: make exit_shm work proportional to task activity
This is a small set of patches our team has had kicking around for a few versions internally that fix tasks getting hung on exit_shm when there are many threads hammering it at once. Anton wrote a simple test to cause the issue: http://ozlabs.org/~anton/junkcode/bust_shm_exit.c Before applying this patchset, this test code will cause either hanging tracebacks or pthread out of memory errors. After this patchset, it will still produce output like: root@somehost:~# ./bust_shm_exit 1024 160 ... INFO: rcu_sched detected stalls on CPUs/tasks: {} (detected by 116, t=2111 jiffies, g=241, c=240, q=7113) INFO: Stall ended before state dump start ... But the task will continue to run along happily, so we consider this an improvement over hanging, even if it's a bit noisy. This patch (of 3): exit_shm obtains the ipc_ns shm rwsem for write and holds it while it walks every shared memory segment in the namespace. Thus, the amount of work is related to the number of shm segments in the namespace, not the number of segments that might need to be cleaned. In addition, this occurs after the task has been notified that the thread has exited, so the number of tasks waiting for the ns shm rwsem can grow without bound until memory is exhausted. Add a list to the task struct of all shmids allocated by this task. Init the list head in copy_process. Use the ns->rwsem for locking. Add segments after id is added, remove before removing from id. On unshare of CLONE_NEWIPC, orphan any ids as if the task had exited, similar to handling of semaphore undo. I chose a define for the init sequence since it's a simple list init, otherwise it would require a function call to avoid include loops between the semaphore code and the task struct. Converting the list_del to list_del_init for the unshare cases would remove the exit followed by init, but I left it to blow up if not initialized. 
Signed-off-by: Milton Miller <miltonm@bga.com> Signed-off-by: Jack Miller <millerjo@us.ibm.com> Cc: Davidlohr Bueso <davidlohr@hp.com> Cc: Manfred Spraul <manfred@colorfullife.com> Cc: Anton Blanchard <anton@samba.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--include/linux/sched.h2
-rw-r--r--include/linux/shm.h16
-rw-r--r--ipc/shm.c22
-rw-r--r--kernel/fork.c6
4 files changed, 34 insertions, 12 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b21e9218c0fd..db2f6474e95e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -33,6 +33,7 @@ struct sched_param {
33 33
34#include <linux/smp.h> 34#include <linux/smp.h>
35#include <linux/sem.h> 35#include <linux/sem.h>
36#include <linux/shm.h>
36#include <linux/signal.h> 37#include <linux/signal.h>
37#include <linux/compiler.h> 38#include <linux/compiler.h>
38#include <linux/completion.h> 39#include <linux/completion.h>
@@ -1385,6 +1386,7 @@ struct task_struct {
1385#ifdef CONFIG_SYSVIPC 1386#ifdef CONFIG_SYSVIPC
1386/* ipc stuff */ 1387/* ipc stuff */
1387 struct sysv_sem sysvsem; 1388 struct sysv_sem sysvsem;
1389 struct sysv_shm sysvshm;
1388#endif 1390#endif
1389#ifdef CONFIG_DETECT_HUNG_TASK 1391#ifdef CONFIG_DETECT_HUNG_TASK
1390/* hung task detection */ 1392/* hung task detection */
diff --git a/include/linux/shm.h b/include/linux/shm.h
index 57d77709fbe2..fd206387048a 100644
--- a/include/linux/shm.h
+++ b/include/linux/shm.h
@@ -1,6 +1,7 @@
1#ifndef _LINUX_SHM_H_ 1#ifndef _LINUX_SHM_H_
2#define _LINUX_SHM_H_ 2#define _LINUX_SHM_H_
3 3
4#include <linux/list.h>
4#include <asm/page.h> 5#include <asm/page.h>
5#include <uapi/linux/shm.h> 6#include <uapi/linux/shm.h>
6#include <asm/shmparam.h> 7#include <asm/shmparam.h>
@@ -20,6 +21,7 @@ struct shmid_kernel /* private to the kernel */
20 21
21 /* The task created the shm object. NULL if the task is dead. */ 22 /* The task created the shm object. NULL if the task is dead. */
22 struct task_struct *shm_creator; 23 struct task_struct *shm_creator;
24 struct list_head shm_clist; /* list by creator */
23}; 25};
24 26
25/* shm_mode upper byte flags */ 27/* shm_mode upper byte flags */
@@ -44,11 +46,20 @@ struct shmid_kernel /* private to the kernel */
44#define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT) 46#define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT)
45 47
46#ifdef CONFIG_SYSVIPC 48#ifdef CONFIG_SYSVIPC
49struct sysv_shm {
50 struct list_head shm_clist;
51};
52
47long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr, 53long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr,
48 unsigned long shmlba); 54 unsigned long shmlba);
49extern int is_file_shm_hugepages(struct file *file); 55extern int is_file_shm_hugepages(struct file *file);
50extern void exit_shm(struct task_struct *task); 56void exit_shm(struct task_struct *task);
57#define shm_init_task(task) INIT_LIST_HEAD(&(task)->sysvshm.shm_clist)
51#else 58#else
59struct sysv_shm {
60 /* empty */
61};
62
52static inline long do_shmat(int shmid, char __user *shmaddr, 63static inline long do_shmat(int shmid, char __user *shmaddr,
53 int shmflg, unsigned long *addr, 64 int shmflg, unsigned long *addr,
54 unsigned long shmlba) 65 unsigned long shmlba)
@@ -62,6 +73,9 @@ static inline int is_file_shm_hugepages(struct file *file)
62static inline void exit_shm(struct task_struct *task) 73static inline void exit_shm(struct task_struct *task)
63{ 74{
64} 75}
76static inline void shm_init_task(struct task_struct *task)
77{
78}
65#endif 79#endif
66 80
67#endif /* _LINUX_SHM_H_ */ 81#endif /* _LINUX_SHM_H_ */
diff --git a/ipc/shm.c b/ipc/shm.c
index 89fc354156cb..1fc3a61b443b 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -178,6 +178,7 @@ static void shm_rcu_free(struct rcu_head *head)
178 178
179static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s) 179static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
180{ 180{
181 list_del(&s->shm_clist);
181 ipc_rmid(&shm_ids(ns), &s->shm_perm); 182 ipc_rmid(&shm_ids(ns), &s->shm_perm);
182} 183}
183 184
@@ -268,14 +269,10 @@ static void shm_close(struct vm_area_struct *vma)
268} 269}
269 270
270/* Called with ns->shm_ids(ns).rwsem locked */ 271/* Called with ns->shm_ids(ns).rwsem locked */
271static int shm_try_destroy_current(int id, void *p, void *data) 272static void shm_mark_orphan(struct shmid_kernel *shp, struct ipc_namespace *ns)
272{ 273{
273 struct ipc_namespace *ns = data; 274 if (WARN_ON(shp->shm_creator != current)) /* Remove me when it works */
274 struct kern_ipc_perm *ipcp = p; 275 return;
275 struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm);
276
277 if (shp->shm_creator != current)
278 return 0;
279 276
280 /* 277 /*
281 * Mark it as orphaned to destroy the segment when 278 * Mark it as orphaned to destroy the segment when
@@ -289,13 +286,12 @@ static int shm_try_destroy_current(int id, void *p, void *data)
289 * is not set, it shouldn't be deleted here. 286 * is not set, it shouldn't be deleted here.
290 */ 287 */
291 if (!ns->shm_rmid_forced) 288 if (!ns->shm_rmid_forced)
292 return 0; 289 return;
293 290
294 if (shm_may_destroy(ns, shp)) { 291 if (shm_may_destroy(ns, shp)) {
295 shm_lock_by_ptr(shp); 292 shm_lock_by_ptr(shp);
296 shm_destroy(ns, shp); 293 shm_destroy(ns, shp);
297 } 294 }
298 return 0;
299} 295}
300 296
301/* Called with ns->shm_ids(ns).rwsem locked */ 297/* Called with ns->shm_ids(ns).rwsem locked */
@@ -333,14 +329,17 @@ void shm_destroy_orphaned(struct ipc_namespace *ns)
333void exit_shm(struct task_struct *task) 329void exit_shm(struct task_struct *task)
334{ 330{
335 struct ipc_namespace *ns = task->nsproxy->ipc_ns; 331 struct ipc_namespace *ns = task->nsproxy->ipc_ns;
332 struct shmid_kernel *shp, *n;
336 333
337 if (shm_ids(ns).in_use == 0) 334 if (shm_ids(ns).in_use == 0)
338 return; 335 return;
339 336
340 /* Destroy all already created segments, but not mapped yet */ 337 /* Destroy all already created segments, but not mapped yet */
341 down_write(&shm_ids(ns).rwsem); 338 down_write(&shm_ids(ns).rwsem);
342 if (shm_ids(ns).in_use) 339 list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist)
343 idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_current, ns); 340 shm_mark_orphan(shp, ns);
341 /* remove the list head from any segments still attached */
342 list_del(&task->sysvshm.shm_clist);
344 up_write(&shm_ids(ns).rwsem); 343 up_write(&shm_ids(ns).rwsem);
345} 344}
346 345
@@ -561,6 +560,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
561 shp->shm_nattch = 0; 560 shp->shm_nattch = 0;
562 shp->shm_file = file; 561 shp->shm_file = file;
563 shp->shm_creator = current; 562 shp->shm_creator = current;
563 list_add(&shp->shm_clist, &current->sysvshm.shm_clist);
564 564
565 /* 565 /*
566 * shmid gets reported as "inode#" in /proc/pid/maps. 566 * shmid gets reported as "inode#" in /proc/pid/maps.
diff --git a/kernel/fork.c b/kernel/fork.c
index 86da59e165ad..fa9124322cd4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1362,6 +1362,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1362 if (retval) 1362 if (retval)
1363 goto bad_fork_cleanup_policy; 1363 goto bad_fork_cleanup_policy;
1364 /* copy all the process information */ 1364 /* copy all the process information */
1365 shm_init_task(p);
1365 retval = copy_semundo(clone_flags, p); 1366 retval = copy_semundo(clone_flags, p);
1366 if (retval) 1367 if (retval)
1367 goto bad_fork_cleanup_audit; 1368 goto bad_fork_cleanup_audit;
@@ -1913,6 +1914,11 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1913 */ 1914 */
1914 exit_sem(current); 1915 exit_sem(current);
1915 } 1916 }
1917 if (unshare_flags & CLONE_NEWIPC) {
1918 /* Orphan segments in old ns (see sem above). */
1919 exit_shm(current);
1920 shm_init_task(current);
1921 }
1916 1922
1917 if (new_nsproxy) 1923 if (new_nsproxy)
1918 switch_task_namespaces(current, new_nsproxy); 1924 switch_task_namespaces(current, new_nsproxy);