aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/linux/sched.h1
-rw-r--r--kernel/cpuset.c100
2 files changed, 61 insertions, 40 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c551e6a1447e..8a1fcfe80fc7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -782,6 +782,7 @@ struct task_struct {
782 short il_next; 782 short il_next;
783#endif 783#endif
784#ifdef CONFIG_CPUSETS 784#ifdef CONFIG_CPUSETS
785 short cpuset_sem_nest_depth;
785 struct cpuset *cpuset; 786 struct cpuset *cpuset;
786 nodemask_t mems_allowed; 787 nodemask_t mems_allowed;
787 int cpuset_mems_generation; 788 int cpuset_mems_generation;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 712d02029971..407b5f0a8c8e 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -182,6 +182,37 @@ static struct super_block *cpuset_sb = NULL;
182static DECLARE_MUTEX(cpuset_sem); 182static DECLARE_MUTEX(cpuset_sem);
183 183
184/* 184/*
185 * The global cpuset semaphore cpuset_sem can be needed by the
186 * memory allocator to update a task's mems_allowed (see the calls
187 * to cpuset_update_current_mems_allowed()) or to walk up the
188 * cpuset hierarchy to find a mem_exclusive cpuset (see the calls
189 * to cpuset_excl_nodes_overlap()).
190 *
191 * But if the memory allocation is being done by cpuset.c code, it
192 * usually already holds cpuset_sem. Double tripping on a kernel
193 * semaphore deadlocks the current task, and any other task that
194 * subsequently tries to obtain the lock.
195 *
196 * Run all up() and down() calls on cpuset_sem through the following
197 * wrappers, which will detect this nested locking, and avoid
198 * deadlocking.
199 */
200
/*
 * cpuset_down - nesting-aware wrapper around down() for psem.
 *
 * Calls down(psem) only when the current task's cpuset_sem_nest_depth
 * is zero, then increments that per-task counter unconditionally.
 * When the counter is already non-zero, only the counter is bumped
 * and down() is not called.
 */
201static inline void cpuset_down(struct semaphore *psem)
202{
203 if (current->cpuset_sem_nest_depth == 0)
204 down(psem); /* outermost level: actually take the semaphore */
205 current->cpuset_sem_nest_depth++; /* count every level, nested or not */
206}
207
/*
 * cpuset_up - nesting-aware wrapper around up() for psem.
 *
 * Decrements the current task's cpuset_sem_nest_depth and calls
 * up(psem) only when the counter has dropped back to zero.
 *
 * NOTE(review): nothing here guards against a call without a matching
 * cpuset_down(); such a call would leave the counter negative and skip
 * up() - confirm all callers are balanced.
 */
208static inline void cpuset_up(struct semaphore *psem)
209{
210 current->cpuset_sem_nest_depth--; /* leave one nesting level */
211 if (current->cpuset_sem_nest_depth == 0)
212 up(psem); /* outermost level left: actually release the semaphore */
213}
214
215/*
185 * A couple of forward declarations required, due to cyclic reference loop: 216 * A couple of forward declarations required, due to cyclic reference loop:
186 * cpuset_mkdir -> cpuset_create -> cpuset_populate_dir -> cpuset_add_file 217 * cpuset_mkdir -> cpuset_create -> cpuset_populate_dir -> cpuset_add_file
187 * -> cpuset_create_file -> cpuset_dir_inode_operations -> cpuset_mkdir. 218 * -> cpuset_create_file -> cpuset_dir_inode_operations -> cpuset_mkdir.
@@ -522,19 +553,10 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
522 * Refresh current tasks mems_allowed and mems_generation from 553 * Refresh current tasks mems_allowed and mems_generation from
523 * current tasks cpuset. Call with cpuset_sem held. 554 * current tasks cpuset. Call with cpuset_sem held.
524 * 555 *
525 * Be sure to call refresh_mems() on any cpuset operation which 556 * This routine is needed to update the per-task mems_allowed
526 * (1) holds cpuset_sem, and (2) might possibly alloc memory. 557 * data, within the tasks context, when it is trying to allocate
527 * Call after obtaining cpuset_sem lock, before any possible 558 * memory (in various mm/mempolicy.c routines) and notices
528 * allocation. Otherwise one risks trying to allocate memory 559 * that some other task has been modifying its cpuset.
529 * while the task cpuset_mems_generation is not the same as
530 * the mems_generation in its cpuset, which would deadlock on
531 * cpuset_sem in cpuset_update_current_mems_allowed().
532 *
533 * Since we hold cpuset_sem, once refresh_mems() is called, the
534 * test (current->cpuset_mems_generation != cs->mems_generation)
535 * in cpuset_update_current_mems_allowed() will remain false,
536 * until we drop cpuset_sem. Anyone else who would change our
537 * cpusets mems_generation needs to lock cpuset_sem first.
538 */ 560 */
539 561
540static void refresh_mems(void) 562static void refresh_mems(void)
@@ -840,7 +862,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
840 } 862 }
841 buffer[nbytes] = 0; /* nul-terminate */ 863 buffer[nbytes] = 0; /* nul-terminate */
842 864
843 down(&cpuset_sem); 865 cpuset_down(&cpuset_sem);
844 866
845 if (is_removed(cs)) { 867 if (is_removed(cs)) {
846 retval = -ENODEV; 868 retval = -ENODEV;
@@ -874,7 +896,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
874 if (retval == 0) 896 if (retval == 0)
875 retval = nbytes; 897 retval = nbytes;
876out2: 898out2:
877 up(&cpuset_sem); 899 cpuset_up(&cpuset_sem);
878 cpuset_release_agent(pathbuf); 900 cpuset_release_agent(pathbuf);
879out1: 901out1:
880 kfree(buffer); 902 kfree(buffer);
@@ -914,9 +936,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
914{ 936{
915 cpumask_t mask; 937 cpumask_t mask;
916 938
917 down(&cpuset_sem); 939 cpuset_down(&cpuset_sem);
918 mask = cs->cpus_allowed; 940 mask = cs->cpus_allowed;
919 up(&cpuset_sem); 941 cpuset_up(&cpuset_sem);
920 942
921 return cpulist_scnprintf(page, PAGE_SIZE, mask); 943 return cpulist_scnprintf(page, PAGE_SIZE, mask);
922} 944}
@@ -925,9 +947,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
925{ 947{
926 nodemask_t mask; 948 nodemask_t mask;
927 949
928 down(&cpuset_sem); 950 cpuset_down(&cpuset_sem);
929 mask = cs->mems_allowed; 951 mask = cs->mems_allowed;
930 up(&cpuset_sem); 952 cpuset_up(&cpuset_sem);
931 953
932 return nodelist_scnprintf(page, PAGE_SIZE, mask); 954 return nodelist_scnprintf(page, PAGE_SIZE, mask);
933} 955}
@@ -1334,8 +1356,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1334 if (!cs) 1356 if (!cs)
1335 return -ENOMEM; 1357 return -ENOMEM;
1336 1358
1337 down(&cpuset_sem); 1359 cpuset_down(&cpuset_sem);
1338 refresh_mems();
1339 cs->flags = 0; 1360 cs->flags = 0;
1340 if (notify_on_release(parent)) 1361 if (notify_on_release(parent))
1341 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); 1362 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
@@ -1360,14 +1381,14 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1360 * will down() this new directory's i_sem and if we race with 1381 * will down() this new directory's i_sem and if we race with
1361 * another mkdir, we might deadlock. 1382 * another mkdir, we might deadlock.
1362 */ 1383 */
1363 up(&cpuset_sem); 1384 cpuset_up(&cpuset_sem);
1364 1385
1365 err = cpuset_populate_dir(cs->dentry); 1386 err = cpuset_populate_dir(cs->dentry);
1366 /* If err < 0, we have a half-filled directory - oh well ;) */ 1387 /* If err < 0, we have a half-filled directory - oh well ;) */
1367 return 0; 1388 return 0;
1368err: 1389err:
1369 list_del(&cs->sibling); 1390 list_del(&cs->sibling);
1370 up(&cpuset_sem); 1391 cpuset_up(&cpuset_sem);
1371 kfree(cs); 1392 kfree(cs);
1372 return err; 1393 return err;
1373} 1394}
@@ -1389,14 +1410,13 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1389 1410
1390 /* the vfs holds both inode->i_sem already */ 1411 /* the vfs holds both inode->i_sem already */
1391 1412
1392 down(&cpuset_sem); 1413 cpuset_down(&cpuset_sem);
1393 refresh_mems();
1394 if (atomic_read(&cs->count) > 0) { 1414 if (atomic_read(&cs->count) > 0) {
1395 up(&cpuset_sem); 1415 cpuset_up(&cpuset_sem);
1396 return -EBUSY; 1416 return -EBUSY;
1397 } 1417 }
1398 if (!list_empty(&cs->children)) { 1418 if (!list_empty(&cs->children)) {
1399 up(&cpuset_sem); 1419 cpuset_up(&cpuset_sem);
1400 return -EBUSY; 1420 return -EBUSY;
1401 } 1421 }
1402 parent = cs->parent; 1422 parent = cs->parent;
@@ -1412,7 +1432,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1412 spin_unlock(&d->d_lock); 1432 spin_unlock(&d->d_lock);
1413 cpuset_d_remove_dir(d); 1433 cpuset_d_remove_dir(d);
1414 dput(d); 1434 dput(d);
1415 up(&cpuset_sem); 1435 cpuset_up(&cpuset_sem);
1416 cpuset_release_agent(pathbuf); 1436 cpuset_release_agent(pathbuf);
1417 return 0; 1437 return 0;
1418} 1438}
@@ -1515,10 +1535,10 @@ void cpuset_exit(struct task_struct *tsk)
1515 if (notify_on_release(cs)) { 1535 if (notify_on_release(cs)) {
1516 char *pathbuf = NULL; 1536 char *pathbuf = NULL;
1517 1537
1518 down(&cpuset_sem); 1538 cpuset_down(&cpuset_sem);
1519 if (atomic_dec_and_test(&cs->count)) 1539 if (atomic_dec_and_test(&cs->count))
1520 check_for_release(cs, &pathbuf); 1540 check_for_release(cs, &pathbuf);
1521 up(&cpuset_sem); 1541 cpuset_up(&cpuset_sem);
1522 cpuset_release_agent(pathbuf); 1542 cpuset_release_agent(pathbuf);
1523 } else { 1543 } else {
1524 atomic_dec(&cs->count); 1544 atomic_dec(&cs->count);
@@ -1539,11 +1559,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk)
1539{ 1559{
1540 cpumask_t mask; 1560 cpumask_t mask;
1541 1561
1542 down(&cpuset_sem); 1562 cpuset_down(&cpuset_sem);
1543 task_lock((struct task_struct *)tsk); 1563 task_lock((struct task_struct *)tsk);
1544 guarantee_online_cpus(tsk->cpuset, &mask); 1564 guarantee_online_cpus(tsk->cpuset, &mask);
1545 task_unlock((struct task_struct *)tsk); 1565 task_unlock((struct task_struct *)tsk);
1546 up(&cpuset_sem); 1566 cpuset_up(&cpuset_sem);
1547 1567
1548 return mask; 1568 return mask;
1549} 1569}
@@ -1568,9 +1588,9 @@ void cpuset_update_current_mems_allowed(void)
1568 if (!cs) 1588 if (!cs)
1569 return; /* task is exiting */ 1589 return; /* task is exiting */
1570 if (current->cpuset_mems_generation != cs->mems_generation) { 1590 if (current->cpuset_mems_generation != cs->mems_generation) {
1571 down(&cpuset_sem); 1591 cpuset_down(&cpuset_sem);
1572 refresh_mems(); 1592 refresh_mems();
1573 up(&cpuset_sem); 1593 cpuset_up(&cpuset_sem);
1574 } 1594 }
1575} 1595}
1576 1596
@@ -1669,14 +1689,14 @@ int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask)
1669 return 0; 1689 return 0;
1670 1690
1671 /* Not hardwall and node outside mems_allowed: scan up cpusets */ 1691 /* Not hardwall and node outside mems_allowed: scan up cpusets */
1672 down(&cpuset_sem); 1692 cpuset_down(&cpuset_sem);
1673 cs = current->cpuset; 1693 cs = current->cpuset;
1674 if (!cs) 1694 if (!cs)
1675 goto done; /* current task exiting */ 1695 goto done; /* current task exiting */
1676 cs = nearest_exclusive_ancestor(cs); 1696 cs = nearest_exclusive_ancestor(cs);
1677 allowed = node_isset(node, cs->mems_allowed); 1697 allowed = node_isset(node, cs->mems_allowed);
1678done: 1698done:
1679 up(&cpuset_sem); 1699 cpuset_up(&cpuset_sem);
1680 return allowed; 1700 return allowed;
1681} 1701}
1682 1702
@@ -1697,7 +1717,7 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
1697 const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ 1717 const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */
1698 int overlap = 0; /* do cpusets overlap? */ 1718 int overlap = 0; /* do cpusets overlap? */
1699 1719
1700 down(&cpuset_sem); 1720 cpuset_down(&cpuset_sem);
1701 cs1 = current->cpuset; 1721 cs1 = current->cpuset;
1702 if (!cs1) 1722 if (!cs1)
1703 goto done; /* current task exiting */ 1723 goto done; /* current task exiting */
@@ -1708,7 +1728,7 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
1708 cs2 = nearest_exclusive_ancestor(cs2); 1728 cs2 = nearest_exclusive_ancestor(cs2);
1709 overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); 1729 overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
1710done: 1730done:
1711 up(&cpuset_sem); 1731 cpuset_up(&cpuset_sem);
1712 1732
1713 return overlap; 1733 return overlap;
1714} 1734}
@@ -1731,7 +1751,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
1731 return -ENOMEM; 1751 return -ENOMEM;
1732 1752
1733 tsk = m->private; 1753 tsk = m->private;
1734 down(&cpuset_sem); 1754 cpuset_down(&cpuset_sem);
1735 task_lock(tsk); 1755 task_lock(tsk);
1736 cs = tsk->cpuset; 1756 cs = tsk->cpuset;
1737 task_unlock(tsk); 1757 task_unlock(tsk);
@@ -1746,7 +1766,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
1746 seq_puts(m, buf); 1766 seq_puts(m, buf);
1747 seq_putc(m, '\n'); 1767 seq_putc(m, '\n');
1748out: 1768out:
1749 up(&cpuset_sem); 1769 cpuset_up(&cpuset_sem);
1750 kfree(buf); 1770 kfree(buf);
1751 return retval; 1771 return retval;
1752} 1772}