aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorPaul Jackson <pj@sgi.com>2005-10-30 18:02:28 -0500
committerLinus Torvalds <torvalds@g5.osdl.org>2005-10-30 20:37:21 -0500
commit5aa15b5f27fc2c404530c6c8eabdb8437deb3163 (patch)
tree914f0b33f5190bd0183cde2e9f6da552d3d1d7aa /kernel
parentf35f31d7ed0150f9865619f21b5050c91b46c03f (diff)
[PATCH] cpusets: remove depth counted locking hack
Remove a rather hackish depth counter on cpuset locking. The depth counter was avoiding a possible double trip on the global cpuset_sem semaphore. It worked, but now an improved version of cpuset locking is available, to come in the next patch, using two global semaphores. This patch reverses "cpuset semaphore depth check deadlock fix". The kernel still works, even after this patch, except for some rare and difficult to reproduce race conditions when aggressively creating and destroying cpusets marked with the notify_on_release option, on very large systems. Signed-off-by: Paul Jackson <pj@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cpuset.c105
1 files changed, 40 insertions, 65 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b9342f90d28f..cd54dba2be18 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -180,42 +180,6 @@ static struct super_block *cpuset_sb = NULL;
180 */ 180 */
181 181
182static DECLARE_MUTEX(cpuset_sem); 182static DECLARE_MUTEX(cpuset_sem);
183static struct task_struct *cpuset_sem_owner;
184static int cpuset_sem_depth;
185
186/*
187 * The global cpuset semaphore cpuset_sem can be needed by the
188 * memory allocator to update a tasks mems_allowed (see the calls
189 * to cpuset_update_current_mems_allowed()) or to walk up the
190 * cpuset hierarchy to find a mem_exclusive cpuset see the calls
191 * to cpuset_excl_nodes_overlap()).
192 *
193 * But if the memory allocation is being done by cpuset.c code, it
194 * usually already holds cpuset_sem. Double tripping on a kernel
195 * semaphore deadlocks the current task, and any other task that
196 * subsequently tries to obtain the lock.
197 *
198 * Run all up's and down's on cpuset_sem through the following
199 * wrappers, which will detect this nested locking, and avoid
200 * deadlocking.
201 */
202
203static inline void cpuset_down(struct semaphore *psem)
204{
205 if (cpuset_sem_owner != current) {
206 down(psem);
207 cpuset_sem_owner = current;
208 }
209 cpuset_sem_depth++;
210}
211
212static inline void cpuset_up(struct semaphore *psem)
213{
214 if (--cpuset_sem_depth == 0) {
215 cpuset_sem_owner = NULL;
216 up(psem);
217 }
218}
219 183
220/* 184/*
221 * A couple of forward declarations required, due to cyclic reference loop: 185 * A couple of forward declarations required, due to cyclic reference loop:
@@ -558,10 +522,19 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
558 * Refresh current tasks mems_allowed and mems_generation from 522 * Refresh current tasks mems_allowed and mems_generation from
559 * current tasks cpuset. Call with cpuset_sem held. 523 * current tasks cpuset. Call with cpuset_sem held.
560 * 524 *
561 * This routine is needed to update the per-task mems_allowed 525 * Be sure to call refresh_mems() on any cpuset operation which
562 * data, within the tasks context, when it is trying to allocate 526 * (1) holds cpuset_sem, and (2) might possibly alloc memory.
563 * memory (in various mm/mempolicy.c routines) and notices 527 * Call after obtaining cpuset_sem lock, before any possible
564 * that some other task has been modifying its cpuset. 528 * allocation. Otherwise one risks trying to allocate memory
529 * while the task cpuset_mems_generation is not the same as
530 * the mems_generation in its cpuset, which would deadlock on
531 * cpuset_sem in cpuset_update_current_mems_allowed().
532 *
533 * Since we hold cpuset_sem, once refresh_mems() is called, the
534 * test (current->cpuset_mems_generation != cs->mems_generation)
535 * in cpuset_update_current_mems_allowed() will remain false,
536 * until we drop cpuset_sem. Anyone else who would change our
537 * cpusets mems_generation needs to lock cpuset_sem first.
565 */ 538 */
566 539
567static void refresh_mems(void) 540static void refresh_mems(void)
@@ -867,7 +840,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
867 } 840 }
868 buffer[nbytes] = 0; /* nul-terminate */ 841 buffer[nbytes] = 0; /* nul-terminate */
869 842
870 cpuset_down(&cpuset_sem); 843 down(&cpuset_sem);
871 844
872 if (is_removed(cs)) { 845 if (is_removed(cs)) {
873 retval = -ENODEV; 846 retval = -ENODEV;
@@ -901,7 +874,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
901 if (retval == 0) 874 if (retval == 0)
902 retval = nbytes; 875 retval = nbytes;
903out2: 876out2:
904 cpuset_up(&cpuset_sem); 877 up(&cpuset_sem);
905 cpuset_release_agent(pathbuf); 878 cpuset_release_agent(pathbuf);
906out1: 879out1:
907 kfree(buffer); 880 kfree(buffer);
@@ -941,9 +914,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
941{ 914{
942 cpumask_t mask; 915 cpumask_t mask;
943 916
944 cpuset_down(&cpuset_sem); 917 down(&cpuset_sem);
945 mask = cs->cpus_allowed; 918 mask = cs->cpus_allowed;
946 cpuset_up(&cpuset_sem); 919 up(&cpuset_sem);
947 920
948 return cpulist_scnprintf(page, PAGE_SIZE, mask); 921 return cpulist_scnprintf(page, PAGE_SIZE, mask);
949} 922}
@@ -952,9 +925,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
952{ 925{
953 nodemask_t mask; 926 nodemask_t mask;
954 927
955 cpuset_down(&cpuset_sem); 928 down(&cpuset_sem);
956 mask = cs->mems_allowed; 929 mask = cs->mems_allowed;
957 cpuset_up(&cpuset_sem); 930 up(&cpuset_sem);
958 931
959 return nodelist_scnprintf(page, PAGE_SIZE, mask); 932 return nodelist_scnprintf(page, PAGE_SIZE, mask);
960} 933}
@@ -1351,7 +1324,8 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1351 if (!cs) 1324 if (!cs)
1352 return -ENOMEM; 1325 return -ENOMEM;
1353 1326
1354 cpuset_down(&cpuset_sem); 1327 down(&cpuset_sem);
1328 refresh_mems();
1355 cs->flags = 0; 1329 cs->flags = 0;
1356 if (notify_on_release(parent)) 1330 if (notify_on_release(parent))
1357 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); 1331 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
@@ -1376,14 +1350,14 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1376 * will down() this new directory's i_sem and if we race with 1350 * will down() this new directory's i_sem and if we race with
1377 * another mkdir, we might deadlock. 1351 * another mkdir, we might deadlock.
1378 */ 1352 */
1379 cpuset_up(&cpuset_sem); 1353 up(&cpuset_sem);
1380 1354
1381 err = cpuset_populate_dir(cs->dentry); 1355 err = cpuset_populate_dir(cs->dentry);
1382 /* If err < 0, we have a half-filled directory - oh well ;) */ 1356 /* If err < 0, we have a half-filled directory - oh well ;) */
1383 return 0; 1357 return 0;
1384err: 1358err:
1385 list_del(&cs->sibling); 1359 list_del(&cs->sibling);
1386 cpuset_up(&cpuset_sem); 1360 up(&cpuset_sem);
1387 kfree(cs); 1361 kfree(cs);
1388 return err; 1362 return err;
1389} 1363}
@@ -1405,13 +1379,14 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1405 1379
1406 /* the vfs holds both inode->i_sem already */ 1380 /* the vfs holds both inode->i_sem already */
1407 1381
1408 cpuset_down(&cpuset_sem); 1382 down(&cpuset_sem);
1383 refresh_mems();
1409 if (atomic_read(&cs->count) > 0) { 1384 if (atomic_read(&cs->count) > 0) {
1410 cpuset_up(&cpuset_sem); 1385 up(&cpuset_sem);
1411 return -EBUSY; 1386 return -EBUSY;
1412 } 1387 }
1413 if (!list_empty(&cs->children)) { 1388 if (!list_empty(&cs->children)) {
1414 cpuset_up(&cpuset_sem); 1389 up(&cpuset_sem);
1415 return -EBUSY; 1390 return -EBUSY;
1416 } 1391 }
1417 parent = cs->parent; 1392 parent = cs->parent;
@@ -1427,7 +1402,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1427 spin_unlock(&d->d_lock); 1402 spin_unlock(&d->d_lock);
1428 cpuset_d_remove_dir(d); 1403 cpuset_d_remove_dir(d);
1429 dput(d); 1404 dput(d);
1430 cpuset_up(&cpuset_sem); 1405 up(&cpuset_sem);
1431 cpuset_release_agent(pathbuf); 1406 cpuset_release_agent(pathbuf);
1432 return 0; 1407 return 0;
1433} 1408}
@@ -1530,10 +1505,10 @@ void cpuset_exit(struct task_struct *tsk)
1530 if (notify_on_release(cs)) { 1505 if (notify_on_release(cs)) {
1531 char *pathbuf = NULL; 1506 char *pathbuf = NULL;
1532 1507
1533 cpuset_down(&cpuset_sem); 1508 down(&cpuset_sem);
1534 if (atomic_dec_and_test(&cs->count)) 1509 if (atomic_dec_and_test(&cs->count))
1535 check_for_release(cs, &pathbuf); 1510 check_for_release(cs, &pathbuf);
1536 cpuset_up(&cpuset_sem); 1511 up(&cpuset_sem);
1537 cpuset_release_agent(pathbuf); 1512 cpuset_release_agent(pathbuf);
1538 } else { 1513 } else {
1539 atomic_dec(&cs->count); 1514 atomic_dec(&cs->count);
@@ -1554,11 +1529,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk)
1554{ 1529{
1555 cpumask_t mask; 1530 cpumask_t mask;
1556 1531
1557 cpuset_down(&cpuset_sem); 1532 down(&cpuset_sem);
1558 task_lock((struct task_struct *)tsk); 1533 task_lock((struct task_struct *)tsk);
1559 guarantee_online_cpus(tsk->cpuset, &mask); 1534 guarantee_online_cpus(tsk->cpuset, &mask);
1560 task_unlock((struct task_struct *)tsk); 1535 task_unlock((struct task_struct *)tsk);
1561 cpuset_up(&cpuset_sem); 1536 up(&cpuset_sem);
1562 1537
1563 return mask; 1538 return mask;
1564} 1539}
@@ -1583,9 +1558,9 @@ void cpuset_update_current_mems_allowed(void)
1583 if (!cs) 1558 if (!cs)
1584 return; /* task is exiting */ 1559 return; /* task is exiting */
1585 if (current->cpuset_mems_generation != cs->mems_generation) { 1560 if (current->cpuset_mems_generation != cs->mems_generation) {
1586 cpuset_down(&cpuset_sem); 1561 down(&cpuset_sem);
1587 refresh_mems(); 1562 refresh_mems();
1588 cpuset_up(&cpuset_sem); 1563 up(&cpuset_sem);
1589 } 1564 }
1590} 1565}
1591 1566
@@ -1684,14 +1659,14 @@ int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
1684 return 0; 1659 return 0;
1685 1660
1686 /* Not hardwall and node outside mems_allowed: scan up cpusets */ 1661 /* Not hardwall and node outside mems_allowed: scan up cpusets */
1687 cpuset_down(&cpuset_sem); 1662 down(&cpuset_sem);
1688 cs = current->cpuset; 1663 cs = current->cpuset;
1689 if (!cs) 1664 if (!cs)
1690 goto done; /* current task exiting */ 1665 goto done; /* current task exiting */
1691 cs = nearest_exclusive_ancestor(cs); 1666 cs = nearest_exclusive_ancestor(cs);
1692 allowed = node_isset(node, cs->mems_allowed); 1667 allowed = node_isset(node, cs->mems_allowed);
1693done: 1668done:
1694 cpuset_up(&cpuset_sem); 1669 up(&cpuset_sem);
1695 return allowed; 1670 return allowed;
1696} 1671}
1697 1672
@@ -1712,7 +1687,7 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
1712 const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ 1687 const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */
1713 int overlap = 0; /* do cpusets overlap? */ 1688 int overlap = 0; /* do cpusets overlap? */
1714 1689
1715 cpuset_down(&cpuset_sem); 1690 down(&cpuset_sem);
1716 cs1 = current->cpuset; 1691 cs1 = current->cpuset;
1717 if (!cs1) 1692 if (!cs1)
1718 goto done; /* current task exiting */ 1693 goto done; /* current task exiting */
@@ -1723,7 +1698,7 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
1723 cs2 = nearest_exclusive_ancestor(cs2); 1698 cs2 = nearest_exclusive_ancestor(cs2);
1724 overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); 1699 overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
1725done: 1700done:
1726 cpuset_up(&cpuset_sem); 1701 up(&cpuset_sem);
1727 1702
1728 return overlap; 1703 return overlap;
1729} 1704}
@@ -1746,7 +1721,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
1746 return -ENOMEM; 1721 return -ENOMEM;
1747 1722
1748 tsk = m->private; 1723 tsk = m->private;
1749 cpuset_down(&cpuset_sem); 1724 down(&cpuset_sem);
1750 task_lock(tsk); 1725 task_lock(tsk);
1751 cs = tsk->cpuset; 1726 cs = tsk->cpuset;
1752 task_unlock(tsk); 1727 task_unlock(tsk);
@@ -1761,7 +1736,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
1761 seq_puts(m, buf); 1736 seq_puts(m, buf);
1762 seq_putc(m, '\n'); 1737 seq_putc(m, '\n');
1763out: 1738out:
1764 cpuset_up(&cpuset_sem); 1739 up(&cpuset_sem);
1765 kfree(buf); 1740 kfree(buf);
1766 return retval; 1741 return retval;
1767} 1742}