author     Paul Jackson <pj@sgi.com>               2005-08-09 13:07:59 -0400
committer  Linus Torvalds <torvalds@g5.osdl.org>   2005-08-09 15:08:22 -0400
commit     3077a260e9f316b611436b1506eec9cc5c4f8aa6 (patch)
tree       43b7d5faa5f204904c713c463015792d9ff56b01 /kernel
parent     a242b44da6feb604c4c659b78f63dedb69b2d4a3 (diff)
[PATCH] cpuset release ABBA deadlock fix
Fix possible cpuset_sem ABBA deadlock if 'notify_on_release' set.
Under a particular usage pattern (creating and destroying cpusets fairly
frequently with notify_on_release enabled, on a very large system), this
deadlock can be seen every few days.  If you are not using the cpuset
notify_on_release feature, you will never see this deadlock.
The existing code, on task exit (or cpuset deletion) did:

    get cpuset_sem
    if cpuset marked notify_on_release and is ready to release:
        compute cpuset path relative to /dev/cpuset mount point
        call_usermodehelper() forks /sbin/cpuset_release_agent with path
    drop cpuset_sem
Unfortunately, the fork in call_usermodehelper can allocate memory, and
allocating memory can require cpuset_sem if the mems_generation values
changed in the interim.  The result is an ABBA deadlock: the task ends up
trying to acquire cpuset_sem while it already holds it.
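
In code terms, the pre-patch ordering looked roughly like the sketch below.
This is a condensed paraphrase of the old check_for_release() /
cpuset_release_agent() pair shown in the diff further down, folded together
with a typical caller such as cpuset_exit(); it is not the literal kernel
source.  The comment marks the point where cpuset_sem can be needed a
second time:

	/* Simplified sketch of the OLD, deadlock-prone ordering. */
	down(&cpuset_sem);
	if (notify_on_release(cs) && atomic_read(&cs->count) == 0 &&
	    list_empty(&cs->children)) {
		char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);

		if (buf && cpuset_path(cs, buf, PAGE_SIZE) >= 0) {
			/*
			 * cpuset_release_agent() invokes call_usermodehelper(),
			 * whose fork may allocate memory.  If mems_generation
			 * changed, that allocation wants cpuset_sem, which this
			 * task already holds: deadlock.
			 */
			cpuset_release_agent(buf);
		}
		kfree(buf);
	}
	up(&cpuset_sem);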
To fix this, I put the cpuset path (which must be computed while holding
cpuset_sem) in a temporary buffer, to be used in the call_usermodehelper
call of /sbin/cpuset_release_agent only _after_ dropping cpuset_sem.
So the new logic is:
    get cpuset_sem
    if cpuset marked notify_on_release and is ready to release:
        compute cpuset path relative to /dev/cpuset mount point
        stash path in kmalloc'd buffer
    drop cpuset_sem
    call_usermodehelper() forks /sbin/cpuset_release_agent with path
    free path
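
As a caller-side fragment, the new pattern is the one the cpuset_exit()
hunk in the diff below uses; the other two call sites follow the same
shape:

	char *pathbuf = NULL;

	down(&cpuset_sem);
	if (atomic_dec_and_test(&cs->count))
		check_for_release(cs, &pathbuf);  /* kmalloc's and stashes the path, if releasable */
	up(&cpuset_sem);
	cpuset_release_agent(pathbuf);  /* forks the agent, then kfree's pathbuf; no-op when NULL */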
The sharp-eyed reader might notice that this patch does not contain any
calls to kmalloc.  The existing code in the check_for_release() routine was
already kmalloc'ing a buffer to hold the cpuset path.  In the old code, it
just held that buffer for a few lines, across the cpuset_release_agent()
call that in turn invoked call_usermodehelper().  In the new code, it
returns that buffer via the new char **ppathbuf parameter, for later use
and freeing in cpuset_release_agent(), which is called after cpuset_sem is
dropped.  Whereas the old code had just one call to cpuset_release_agent(),
right in the check_for_release() routine, the new code has three calls to
cpuset_release_agent(), from the various places that a cpuset can be
released.
This patch has been built and booted on SN2, and passed a stress test that
previously hit the deadlock within a few seconds.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'kernel')

 -rw-r--r--  kernel/cpuset.c  |  68
 1 file changed, 48 insertions(+), 20 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 805fb9097318..21a4e3b2cbda 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -398,21 +398,31 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen)
  * to continue to serve a useful existence. Next time it's released,
  * we will get notified again, if it still has 'notify_on_release' set.
  *
- * Note final arg to call_usermodehelper() is 0 - that means
- * don't wait. Since we are holding the global cpuset_sem here,
- * and we are asking another thread (started from keventd) to rmdir a
- * cpuset, we can't wait - or we'd deadlock with the removing thread
- * on cpuset_sem.
+ * The final arg to call_usermodehelper() is 0, which means don't
+ * wait. The separate /sbin/cpuset_release_agent task is forked by
+ * call_usermodehelper(), then control in this thread returns here,
+ * without waiting for the release agent task. We don't bother to
+ * wait because the caller of this routine has no use for the exit
+ * status of the /sbin/cpuset_release_agent task, so no sense holding
+ * our caller up for that.
+ *
+ * The simple act of forking that task might require more memory,
+ * which might need cpuset_sem. So this routine must be called while
+ * cpuset_sem is not held, to avoid a possible deadlock. See also
+ * comments for check_for_release(), below.
  */
 
-static int cpuset_release_agent(char *cpuset_str)
+static void cpuset_release_agent(const char *pathbuf)
 {
 	char *argv[3], *envp[3];
 	int i;
 
+	if (!pathbuf)
+		return;
+
 	i = 0;
 	argv[i++] = "/sbin/cpuset_release_agent";
-	argv[i++] = cpuset_str;
+	argv[i++] = (char *)pathbuf;
 	argv[i] = NULL;
 
 	i = 0;
@@ -421,17 +431,29 @@ static int cpuset_release_agent(char *cpuset_str)
 	envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
 	envp[i] = NULL;
 
-	return call_usermodehelper(argv[0], argv, envp, 0);
+	call_usermodehelper(argv[0], argv, envp, 0);
+	kfree(pathbuf);
 }
 
 /*
  * Either cs->count of using tasks transitioned to zero, or the
  * cs->children list of child cpusets just became empty. If this
  * cs is notify_on_release() and now both the user count is zero and
- * the list of children is empty, send notice to user land.
+ * the list of children is empty, prepare cpuset path in a kmalloc'd
+ * buffer, to be returned via ppathbuf, so that the caller can invoke
+ * cpuset_release_agent() with it later on, once cpuset_sem is dropped.
+ * Call here with cpuset_sem held.
+ *
+ * This check_for_release() routine is responsible for kmalloc'ing
+ * pathbuf. The above cpuset_release_agent() is responsible for
+ * kfree'ing pathbuf. The caller of these routines is responsible
+ * for providing a pathbuf pointer, initialized to NULL, then
+ * calling check_for_release() with cpuset_sem held and the address
+ * of the pathbuf pointer, then dropping cpuset_sem, then calling
+ * cpuset_release_agent() with pathbuf, as set by check_for_release().
  */
 
-static void check_for_release(struct cpuset *cs)
+static void check_for_release(struct cpuset *cs, char **ppathbuf)
 {
 	if (notify_on_release(cs) && atomic_read(&cs->count) == 0 &&
 	    list_empty(&cs->children)) {
@@ -441,10 +463,9 @@ static void check_for_release(struct cpuset *cs)
 		if (!buf)
 			return;
 		if (cpuset_path(cs, buf, PAGE_SIZE) < 0)
-			goto out;
-		cpuset_release_agent(buf);
-out:
-		kfree(buf);
+			kfree(buf);
+		else
+			*ppathbuf = buf;
 	}
 }
 
@@ -727,14 +748,14 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
 	return 0;
 }
 
-static int attach_task(struct cpuset *cs, char *buf)
+static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 {
 	pid_t pid;
 	struct task_struct *tsk;
 	struct cpuset *oldcs;
 	cpumask_t cpus;
 
-	if (sscanf(buf, "%d", &pid) != 1)
+	if (sscanf(pidbuf, "%d", &pid) != 1)
 		return -EIO;
 	if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
 		return -ENOSPC;
@@ -777,7 +798,7 @@ static int attach_task(struct cpuset *cs, char *buf)
 
 	put_task_struct(tsk);
 	if (atomic_dec_and_test(&oldcs->count))
-		check_for_release(oldcs);
+		check_for_release(oldcs, ppathbuf);
 	return 0;
 }
 
@@ -801,6 +822,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	struct cftype *cft = __d_cft(file->f_dentry);
 	cpuset_filetype_t type = cft->private;
 	char *buffer;
+	char *pathbuf = NULL;
 	int retval = 0;
 
 	/* Crude upper limit on largest legitimate cpulist user might write. */
@@ -841,7 +863,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 		retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer);
 		break;
 	case FILE_TASKLIST:
-		retval = attach_task(cs, buffer);
+		retval = attach_task(cs, buffer, &pathbuf);
 		break;
 	default:
 		retval = -EINVAL;
@@ -852,6 +874,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 		retval = nbytes;
 out2:
 	up(&cpuset_sem);
+	cpuset_release_agent(pathbuf);
 out1:
 	kfree(buffer);
 	return retval;
@@ -1357,6 +1380,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	struct cpuset *cs = dentry->d_fsdata;
 	struct dentry *d;
 	struct cpuset *parent;
+	char *pathbuf = NULL;
 
 	/* the vfs holds both inode->i_sem already */
 
@@ -1376,7 +1400,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 		update_cpu_domains(cs);
 	list_del(&cs->sibling);	/* delete my sibling from parent->children */
 	if (list_empty(&parent->children))
-		check_for_release(parent);
+		check_for_release(parent, &pathbuf);
 	spin_lock(&cs->dentry->d_lock);
 	d = dget(cs->dentry);
 	cs->dentry = NULL;
@@ -1384,6 +1408,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	cpuset_d_remove_dir(d);
 	dput(d);
 	up(&cpuset_sem);
+	cpuset_release_agent(pathbuf);
 	return 0;
 }
 
@@ -1483,10 +1508,13 @@ void cpuset_exit(struct task_struct *tsk)
 	task_unlock(tsk);
 
 	if (notify_on_release(cs)) {
+		char *pathbuf = NULL;
+
 		down(&cpuset_sem);
 		if (atomic_dec_and_test(&cs->count))
-			check_for_release(cs);
+			check_for_release(cs, &pathbuf);
 		up(&cpuset_sem);
+		cpuset_release_agent(pathbuf);
 	} else {
 		atomic_dec(&cs->count);
 	}