aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Paul Jackson <pj@sgi.com> 2005-09-10 03:26:06 -0400
committer Linus Torvalds <torvalds@g5.osdl.org> 2005-09-10 13:06:21 -0400
commit 4247bdc60048018b98f71228b45cfbc5f5270c86 (patch)
tree 6f6abbd10685af84c97e661da6771726a12209ac
parent fb1c8f93d869b34cacb8b8932e2b83d96a19d720 (diff)
[PATCH] cpuset semaphore depth check deadlock fix
The cpusets-formalize-intermediate-gfp_kernel-containment patch has a deadlock problem. This patch was part of a set of four patches to make more extensive use of the cpuset 'mem_exclusive' attribute to manage kernel GFP_KERNEL memory allocations and to constrain the out-of-memory (oom) killer. A task that is changing cpusets in particular ways on a system when it is very short of free memory could double trip over the global cpuset_sem semaphore (get the lock and then deadlock trying to get it again). The second attempt to get cpuset_sem would be in the routine cpuset_zone_allowed(). This was discovered by code inspection. I cannot reproduce the problem except with an artificially hacked kernel and a specialized stress test. In real life you cannot hit this unless you are manipulating cpusets, and are very unlikely to hit it unless you are rapidly modifying cpusets on a memory-tight system. Even then it would be a rare occurrence. If you did hit it, the task double tripping over cpuset_sem would deadlock in the kernel, and any other task also trying to manipulate cpusets would deadlock there too, on cpuset_sem. Your batch manager would be wedged solid (if it was cpuset savvy), but classic Unix shells and utilities would work well enough to reboot the system. The unusual condition that led to this bug is that unlike most semaphores, cpuset_sem _can_ be acquired while in the page allocation code, when __alloc_pages() calls cpuset_zone_allowed(). So it is easy to mistakenly perform the following sequence: 1) task makes system call to alter a cpuset 2) take cpuset_sem 3) try to allocate memory 4) memory allocator, via cpuset_zone_allowed(), tries to take cpuset_sem 5) deadlock The reason that this is not a serious bug for most users is that almost all calls to allocate memory don't require taking cpuset_sem. Only some code paths off the beaten track require taking cpuset_sem -- which is good. 
Taking a global semaphore on the main code path for allocating memory would not scale well. This patch fixes this deadlock by wrapping the up() and down() calls on cpuset_sem in kernel/cpuset.c with code that tracks the nesting depth of the current task on that semaphore, and only does the real down() if the task doesn't hold the lock already, and only does the real up() if the nesting depth (number of unmatched downs) is exactly one. The previous required use of refresh_mems(), anytime that the cpuset_sem semaphore was acquired and the code executed while holding that semaphore might try to allocate memory, is no longer required. Two refresh_mems() calls were removed thanks to this. This is a good change, as failing to get all the necessary refresh_mems() calls placed was a primary source of bugs in this cpuset code. The only remaining call to refresh_mems() is made while doing a memory allocation, if certain task memory placement data needs to be updated from its cpuset, due to the cpuset having been changed behind the task's back. Signed-off-by: Paul Jackson <pj@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r-- include/linux/sched.h 1
-rw-r--r-- kernel/cpuset.c 100
2 files changed, 61 insertions, 40 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c551e6a1447e..8a1fcfe80fc7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -782,6 +782,7 @@ struct task_struct {
782 short il_next; 782 short il_next;
783#endif 783#endif
784#ifdef CONFIG_CPUSETS 784#ifdef CONFIG_CPUSETS
785 short cpuset_sem_nest_depth;
785 struct cpuset *cpuset; 786 struct cpuset *cpuset;
786 nodemask_t mems_allowed; 787 nodemask_t mems_allowed;
787 int cpuset_mems_generation; 788 int cpuset_mems_generation;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 712d02029971..407b5f0a8c8e 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -182,6 +182,37 @@ static struct super_block *cpuset_sb = NULL;
182static DECLARE_MUTEX(cpuset_sem); 182static DECLARE_MUTEX(cpuset_sem);
183 183
184/* 184/*
185 * The global cpuset semaphore cpuset_sem can be needed by the
186 * memory allocator to update a tasks mems_allowed (see the calls
187 * to cpuset_update_current_mems_allowed()) or to walk up the
188 * cpuset hierarchy to find a mem_exclusive cpuset see the calls
189 * to cpuset_excl_nodes_overlap()).
190 *
191 * But if the memory allocation is being done by cpuset.c code, it
192 * usually already holds cpuset_sem. Double tripping on a kernel
193 * semaphore deadlocks the current task, and any other task that
194 * subsequently tries to obtain the lock.
195 *
196 * Run all up's and down's on cpuset_sem through the following
197 * wrappers, which will detect this nested locking, and avoid
198 * deadlocking.
199 */
200
201static inline void cpuset_down(struct semaphore *psem)
202{
203 if (current->cpuset_sem_nest_depth == 0)
204 down(psem);
205 current->cpuset_sem_nest_depth++;
206}
207
208static inline void cpuset_up(struct semaphore *psem)
209{
210 current->cpuset_sem_nest_depth--;
211 if (current->cpuset_sem_nest_depth == 0)
212 up(psem);
213}
214
215/*
185 * A couple of forward declarations required, due to cyclic reference loop: 216 * A couple of forward declarations required, due to cyclic reference loop:
186 * cpuset_mkdir -> cpuset_create -> cpuset_populate_dir -> cpuset_add_file 217 * cpuset_mkdir -> cpuset_create -> cpuset_populate_dir -> cpuset_add_file
187 * -> cpuset_create_file -> cpuset_dir_inode_operations -> cpuset_mkdir. 218 * -> cpuset_create_file -> cpuset_dir_inode_operations -> cpuset_mkdir.
@@ -522,19 +553,10 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
522 * Refresh current tasks mems_allowed and mems_generation from 553 * Refresh current tasks mems_allowed and mems_generation from
523 * current tasks cpuset. Call with cpuset_sem held. 554 * current tasks cpuset. Call with cpuset_sem held.
524 * 555 *
525 * Be sure to call refresh_mems() on any cpuset operation which 556 * This routine is needed to update the per-task mems_allowed
526 * (1) holds cpuset_sem, and (2) might possibly alloc memory. 557 * data, within the tasks context, when it is trying to allocate
527 * Call after obtaining cpuset_sem lock, before any possible 558 * memory (in various mm/mempolicy.c routines) and notices
528 * allocation. Otherwise one risks trying to allocate memory 559 * that some other task has been modifying its cpuset.
529 * while the task cpuset_mems_generation is not the same as
530 * the mems_generation in its cpuset, which would deadlock on
531 * cpuset_sem in cpuset_update_current_mems_allowed().
532 *
533 * Since we hold cpuset_sem, once refresh_mems() is called, the
534 * test (current->cpuset_mems_generation != cs->mems_generation)
535 * in cpuset_update_current_mems_allowed() will remain false,
536 * until we drop cpuset_sem. Anyone else who would change our
537 * cpusets mems_generation needs to lock cpuset_sem first.
538 */ 560 */
539 561
540static void refresh_mems(void) 562static void refresh_mems(void)
@@ -840,7 +862,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
840 } 862 }
841 buffer[nbytes] = 0; /* nul-terminate */ 863 buffer[nbytes] = 0; /* nul-terminate */
842 864
843 down(&cpuset_sem); 865 cpuset_down(&cpuset_sem);
844 866
845 if (is_removed(cs)) { 867 if (is_removed(cs)) {
846 retval = -ENODEV; 868 retval = -ENODEV;
@@ -874,7 +896,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
874 if (retval == 0) 896 if (retval == 0)
875 retval = nbytes; 897 retval = nbytes;
876out2: 898out2:
877 up(&cpuset_sem); 899 cpuset_up(&cpuset_sem);
878 cpuset_release_agent(pathbuf); 900 cpuset_release_agent(pathbuf);
879out1: 901out1:
880 kfree(buffer); 902 kfree(buffer);
@@ -914,9 +936,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
914{ 936{
915 cpumask_t mask; 937 cpumask_t mask;
916 938
917 down(&cpuset_sem); 939 cpuset_down(&cpuset_sem);
918 mask = cs->cpus_allowed; 940 mask = cs->cpus_allowed;
919 up(&cpuset_sem); 941 cpuset_up(&cpuset_sem);
920 942
921 return cpulist_scnprintf(page, PAGE_SIZE, mask); 943 return cpulist_scnprintf(page, PAGE_SIZE, mask);
922} 944}
@@ -925,9 +947,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
925{ 947{
926 nodemask_t mask; 948 nodemask_t mask;
927 949
928 down(&cpuset_sem); 950 cpuset_down(&cpuset_sem);
929 mask = cs->mems_allowed; 951 mask = cs->mems_allowed;
930 up(&cpuset_sem); 952 cpuset_up(&cpuset_sem);
931 953
932 return nodelist_scnprintf(page, PAGE_SIZE, mask); 954 return nodelist_scnprintf(page, PAGE_SIZE, mask);
933} 955}
@@ -1334,8 +1356,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1334 if (!cs) 1356 if (!cs)
1335 return -ENOMEM; 1357 return -ENOMEM;
1336 1358
1337 down(&cpuset_sem); 1359 cpuset_down(&cpuset_sem);
1338 refresh_mems();
1339 cs->flags = 0; 1360 cs->flags = 0;
1340 if (notify_on_release(parent)) 1361 if (notify_on_release(parent))
1341 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); 1362 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
@@ -1360,14 +1381,14 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1360 * will down() this new directory's i_sem and if we race with 1381 * will down() this new directory's i_sem and if we race with
1361 * another mkdir, we might deadlock. 1382 * another mkdir, we might deadlock.
1362 */ 1383 */
1363 up(&cpuset_sem); 1384 cpuset_up(&cpuset_sem);
1364 1385
1365 err = cpuset_populate_dir(cs->dentry); 1386 err = cpuset_populate_dir(cs->dentry);
1366 /* If err < 0, we have a half-filled directory - oh well ;) */ 1387 /* If err < 0, we have a half-filled directory - oh well ;) */
1367 return 0; 1388 return 0;
1368err: 1389err:
1369 list_del(&cs->sibling); 1390 list_del(&cs->sibling);
1370 up(&cpuset_sem); 1391 cpuset_up(&cpuset_sem);
1371 kfree(cs); 1392 kfree(cs);
1372 return err; 1393 return err;
1373} 1394}
@@ -1389,14 +1410,13 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1389 1410
1390 /* the vfs holds both inode->i_sem already */ 1411 /* the vfs holds both inode->i_sem already */
1391 1412
1392 down(&cpuset_sem); 1413 cpuset_down(&cpuset_sem);
1393 refresh_mems();
1394 if (atomic_read(&cs->count) > 0) { 1414 if (atomic_read(&cs->count) > 0) {
1395 up(&cpuset_sem); 1415 cpuset_up(&cpuset_sem);
1396 return -EBUSY; 1416 return -EBUSY;
1397 } 1417 }
1398 if (!list_empty(&cs->children)) { 1418 if (!list_empty(&cs->children)) {
1399 up(&cpuset_sem); 1419 cpuset_up(&cpuset_sem);
1400 return -EBUSY; 1420 return -EBUSY;
1401 } 1421 }
1402 parent = cs->parent; 1422 parent = cs->parent;
@@ -1412,7 +1432,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1412 spin_unlock(&d->d_lock); 1432 spin_unlock(&d->d_lock);
1413 cpuset_d_remove_dir(d); 1433 cpuset_d_remove_dir(d);
1414 dput(d); 1434 dput(d);
1415 up(&cpuset_sem); 1435 cpuset_up(&cpuset_sem);
1416 cpuset_release_agent(pathbuf); 1436 cpuset_release_agent(pathbuf);
1417 return 0; 1437 return 0;
1418} 1438}
@@ -1515,10 +1535,10 @@ void cpuset_exit(struct task_struct *tsk)
1515 if (notify_on_release(cs)) { 1535 if (notify_on_release(cs)) {
1516 char *pathbuf = NULL; 1536 char *pathbuf = NULL;
1517 1537
1518 down(&cpuset_sem); 1538 cpuset_down(&cpuset_sem);
1519 if (atomic_dec_and_test(&cs->count)) 1539 if (atomic_dec_and_test(&cs->count))
1520 check_for_release(cs, &pathbuf); 1540 check_for_release(cs, &pathbuf);
1521 up(&cpuset_sem); 1541 cpuset_up(&cpuset_sem);
1522 cpuset_release_agent(pathbuf); 1542 cpuset_release_agent(pathbuf);
1523 } else { 1543 } else {
1524 atomic_dec(&cs->count); 1544 atomic_dec(&cs->count);
@@ -1539,11 +1559,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk)
1539{ 1559{
1540 cpumask_t mask; 1560 cpumask_t mask;
1541 1561
1542 down(&cpuset_sem); 1562 cpuset_down(&cpuset_sem);
1543 task_lock((struct task_struct *)tsk); 1563 task_lock((struct task_struct *)tsk);
1544 guarantee_online_cpus(tsk->cpuset, &mask); 1564 guarantee_online_cpus(tsk->cpuset, &mask);
1545 task_unlock((struct task_struct *)tsk); 1565 task_unlock((struct task_struct *)tsk);
1546 up(&cpuset_sem); 1566 cpuset_up(&cpuset_sem);
1547 1567
1548 return mask; 1568 return mask;
1549} 1569}
@@ -1568,9 +1588,9 @@ void cpuset_update_current_mems_allowed(void)
1568 if (!cs) 1588 if (!cs)
1569 return; /* task is exiting */ 1589 return; /* task is exiting */
1570 if (current->cpuset_mems_generation != cs->mems_generation) { 1590 if (current->cpuset_mems_generation != cs->mems_generation) {
1571 down(&cpuset_sem); 1591 cpuset_down(&cpuset_sem);
1572 refresh_mems(); 1592 refresh_mems();
1573 up(&cpuset_sem); 1593 cpuset_up(&cpuset_sem);
1574 } 1594 }
1575} 1595}
1576 1596
@@ -1669,14 +1689,14 @@ int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask)
1669 return 0; 1689 return 0;
1670 1690
1671 /* Not hardwall and node outside mems_allowed: scan up cpusets */ 1691 /* Not hardwall and node outside mems_allowed: scan up cpusets */
1672 down(&cpuset_sem); 1692 cpuset_down(&cpuset_sem);
1673 cs = current->cpuset; 1693 cs = current->cpuset;
1674 if (!cs) 1694 if (!cs)
1675 goto done; /* current task exiting */ 1695 goto done; /* current task exiting */
1676 cs = nearest_exclusive_ancestor(cs); 1696 cs = nearest_exclusive_ancestor(cs);
1677 allowed = node_isset(node, cs->mems_allowed); 1697 allowed = node_isset(node, cs->mems_allowed);
1678done: 1698done:
1679 up(&cpuset_sem); 1699 cpuset_up(&cpuset_sem);
1680 return allowed; 1700 return allowed;
1681} 1701}
1682 1702
@@ -1697,7 +1717,7 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
1697 const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ 1717 const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */
1698 int overlap = 0; /* do cpusets overlap? */ 1718 int overlap = 0; /* do cpusets overlap? */
1699 1719
1700 down(&cpuset_sem); 1720 cpuset_down(&cpuset_sem);
1701 cs1 = current->cpuset; 1721 cs1 = current->cpuset;
1702 if (!cs1) 1722 if (!cs1)
1703 goto done; /* current task exiting */ 1723 goto done; /* current task exiting */
@@ -1708,7 +1728,7 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
1708 cs2 = nearest_exclusive_ancestor(cs2); 1728 cs2 = nearest_exclusive_ancestor(cs2);
1709 overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); 1729 overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
1710done: 1730done:
1711 up(&cpuset_sem); 1731 cpuset_up(&cpuset_sem);
1712 1732
1713 return overlap; 1733 return overlap;
1714} 1734}
@@ -1731,7 +1751,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
1731 return -ENOMEM; 1751 return -ENOMEM;
1732 1752
1733 tsk = m->private; 1753 tsk = m->private;
1734 down(&cpuset_sem); 1754 cpuset_down(&cpuset_sem);
1735 task_lock(tsk); 1755 task_lock(tsk);
1736 cs = tsk->cpuset; 1756 cs = tsk->cpuset;
1737 task_unlock(tsk); 1757 task_unlock(tsk);
@@ -1746,7 +1766,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
1746 seq_puts(m, buf); 1766 seq_puts(m, buf);
1747 seq_putc(m, '\n'); 1767 seq_putc(m, '\n');
1748out: 1768out:
1749 up(&cpuset_sem); 1769 cpuset_up(&cpuset_sem);
1750 kfree(buf); 1770 kfree(buf);
1751 return retval; 1771 return retval;
1752} 1772}