-rw-r--r--  include/linux/sched.h |   1
-rw-r--r--  kernel/cpuset.c       | 100
2 files changed, 61 insertions, 40 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c551e6a1447e..8a1fcfe80fc7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -782,6 +782,7 @@ struct task_struct {
 	short il_next;
 #endif
 #ifdef CONFIG_CPUSETS
+	short cpuset_sem_nest_depth;
 	struct cpuset *cpuset;
 	nodemask_t mems_allowed;
 	int cpuset_mems_generation;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 712d02029971..407b5f0a8c8e 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -182,6 +182,37 @@ static struct super_block *cpuset_sb = NULL;
 static DECLARE_MUTEX(cpuset_sem);
 
 /*
+ * The global cpuset semaphore cpuset_sem can be needed by the
+ * memory allocator to update a tasks mems_allowed (see the calls
+ * to cpuset_update_current_mems_allowed()) or to walk up the
+ * cpuset hierarchy to find a mem_exclusive cpuset (see the calls
+ * to cpuset_excl_nodes_overlap()).
+ *
+ * But if the memory allocation is being done by cpuset.c code, it
+ * usually already holds cpuset_sem. Double tripping on a kernel
+ * semaphore deadlocks the current task, and any other task that
+ * subsequently tries to obtain the lock.
+ *
+ * Run all up's and down's on cpuset_sem through the following
+ * wrappers, which will detect this nested locking, and avoid
+ * deadlocking.
+ */
+
+static inline void cpuset_down(struct semaphore *psem)
+{
+	if (current->cpuset_sem_nest_depth == 0)
+		down(psem);
+	current->cpuset_sem_nest_depth++;
+}
+
+static inline void cpuset_up(struct semaphore *psem)
+{
+	current->cpuset_sem_nest_depth--;
+	if (current->cpuset_sem_nest_depth == 0)
+		up(psem);
+}
+
+/*
  * A couple of forward declarations required, due to cyclic reference loop:
  * cpuset_mkdir -> cpuset_create -> cpuset_populate_dir -> cpuset_add_file
  * -> cpuset_create_file -> cpuset_dir_inode_operations -> cpuset_mkdir.
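
The nest depth lives in the task, so a task that already holds cpuset_sem simply bumps its counter instead of blocking on itself; only the outermost cpuset_up() actually releases the semaphore. A minimal illustration of the intended behaviour (not part of the patch; the wrapper-caller below is made up):

	/* Illustrative sketch only: how the depth counter avoids the
	 * self-deadlock described in the comment above. */
	static void nested_locking_example(void)
	{
		cpuset_down(&cpuset_sem);	/* depth 0 -> 1: really down()s          */
		/* ... cpuset.c code allocates memory here; the allocator may
		 * call back into cpuset code, which takes the lock again ... */
		cpuset_down(&cpuset_sem);	/* depth 1 -> 2: down() skipped, no deadlock */
		cpuset_up(&cpuset_sem);		/* depth 2 -> 1: up() skipped             */
		cpuset_up(&cpuset_sem);		/* depth 1 -> 0: really up()s             */
	}
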
@@ -522,19 +553,10 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
  * Refresh current tasks mems_allowed and mems_generation from
  * current tasks cpuset. Call with cpuset_sem held.
  *
- * Be sure to call refresh_mems() on any cpuset operation which
- * (1) holds cpuset_sem, and (2) might possibly alloc memory.
- * Call after obtaining cpuset_sem lock, before any possible
- * allocation. Otherwise one risks trying to allocate memory
- * while the task cpuset_mems_generation is not the same as
- * the mems_generation in its cpuset, which would deadlock on
- * cpuset_sem in cpuset_update_current_mems_allowed().
- *
- * Since we hold cpuset_sem, once refresh_mems() is called, the
- * test (current->cpuset_mems_generation != cs->mems_generation)
- * in cpuset_update_current_mems_allowed() will remain false,
- * until we drop cpuset_sem. Anyone else who would change our
- * cpusets mems_generation needs to lock cpuset_sem first.
+ * This routine is needed to update the per-task mems_allowed
+ * data, within the tasks context, when it is trying to allocate
+ * memory (in various mm/mempolicy.c routines) and notices
+ * that some other task has been modifying its cpuset.
  */
 
 static void refresh_mems(void)
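
The diff does not show the body of refresh_mems(). Based on the comment above and on the generation check visible later in cpuset_update_current_mems_allowed(), it presumably looks roughly like the following sketch (an assumption for orientation, not text from this patch):

	/* Hedged sketch: copy the cpuset's allowed nodes into the task and
	 * record the generation so the allocator's fast-path check passes. */
	static void refresh_mems(void)
	{
		struct cpuset *cs = current->cpuset;

		if (current->cpuset_mems_generation != cs->mems_generation) {
			guarantee_online_mems(cs, &current->mems_allowed);
			current->cpuset_mems_generation = cs->mems_generation;
		}
	}
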
@@ -840,7 +862,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	}
 	buffer[nbytes] = 0;	/* nul-terminate */
 
-	down(&cpuset_sem);
+	cpuset_down(&cpuset_sem);
 
 	if (is_removed(cs)) {
 		retval = -ENODEV;
@@ -874,7 +896,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	if (retval == 0)
 		retval = nbytes;
 out2:
-	up(&cpuset_sem);
+	cpuset_up(&cpuset_sem);
 	cpuset_release_agent(pathbuf);
 out1:
 	kfree(buffer);
@@ -914,9 +936,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
 {
 	cpumask_t mask;
 
-	down(&cpuset_sem);
+	cpuset_down(&cpuset_sem);
 	mask = cs->cpus_allowed;
-	up(&cpuset_sem);
+	cpuset_up(&cpuset_sem);
 
 	return cpulist_scnprintf(page, PAGE_SIZE, mask);
 }
@@ -925,9 +947,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
 {
 	nodemask_t mask;
 
-	down(&cpuset_sem);
+	cpuset_down(&cpuset_sem);
 	mask = cs->mems_allowed;
-	up(&cpuset_sem);
+	cpuset_up(&cpuset_sem);
 
 	return nodelist_scnprintf(page, PAGE_SIZE, mask);
 }
@@ -1334,8 +1356,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 	if (!cs)
 		return -ENOMEM;
 
-	down(&cpuset_sem);
-	refresh_mems();
+	cpuset_down(&cpuset_sem);
 	cs->flags = 0;
 	if (notify_on_release(parent))
 		set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
@@ -1360,14 +1381,14 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 	 * will down() this new directory's i_sem and if we race with
 	 * another mkdir, we might deadlock.
 	 */
-	up(&cpuset_sem);
+	cpuset_up(&cpuset_sem);
 
 	err = cpuset_populate_dir(cs->dentry);
 	/* If err < 0, we have a half-filled directory - oh well ;) */
 	return 0;
 err:
 	list_del(&cs->sibling);
-	up(&cpuset_sem);
+	cpuset_up(&cpuset_sem);
 	kfree(cs);
 	return err;
 }
@@ -1389,14 +1410,13 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 
 	/* the vfs holds both inode->i_sem already */
 
-	down(&cpuset_sem);
-	refresh_mems();
+	cpuset_down(&cpuset_sem);
 	if (atomic_read(&cs->count) > 0) {
-		up(&cpuset_sem);
+		cpuset_up(&cpuset_sem);
 		return -EBUSY;
 	}
 	if (!list_empty(&cs->children)) {
-		up(&cpuset_sem);
+		cpuset_up(&cpuset_sem);
 		return -EBUSY;
 	}
 	parent = cs->parent;
@@ -1412,7 +1432,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	spin_unlock(&d->d_lock);
 	cpuset_d_remove_dir(d);
 	dput(d);
-	up(&cpuset_sem);
+	cpuset_up(&cpuset_sem);
 	cpuset_release_agent(pathbuf);
 	return 0;
 }
@@ -1515,10 +1535,10 @@ void cpuset_exit(struct task_struct *tsk)
 	if (notify_on_release(cs)) {
 		char *pathbuf = NULL;
 
-		down(&cpuset_sem);
+		cpuset_down(&cpuset_sem);
 		if (atomic_dec_and_test(&cs->count))
 			check_for_release(cs, &pathbuf);
-		up(&cpuset_sem);
+		cpuset_up(&cpuset_sem);
 		cpuset_release_agent(pathbuf);
 	} else {
 		atomic_dec(&cs->count);
@@ -1539,11 +1559,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk)
 {
 	cpumask_t mask;
 
-	down(&cpuset_sem);
+	cpuset_down(&cpuset_sem);
 	task_lock((struct task_struct *)tsk);
 	guarantee_online_cpus(tsk->cpuset, &mask);
 	task_unlock((struct task_struct *)tsk);
-	up(&cpuset_sem);
+	cpuset_up(&cpuset_sem);
 
 	return mask;
 }
@@ -1568,9 +1588,9 @@ void cpuset_update_current_mems_allowed(void)
 	if (!cs)
 		return;		/* task is exiting */
 	if (current->cpuset_mems_generation != cs->mems_generation) {
-		down(&cpuset_sem);
+		cpuset_down(&cpuset_sem);
 		refresh_mems();
-		up(&cpuset_sem);
+		cpuset_up(&cpuset_sem);
 	}
 }
 
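With the wrappers, this check-and-refresh path is safe to enter even when the allocating task already holds cpuset_sem (for instance, when cpuset.c itself allocates memory). A hedged sketch of the allocator-side usage the earlier comment refers to; the caller below is a made-up stand-in, since the actual mm/mempolicy.c call sites are not shown in this diff:

	/* Illustrative sketch only (not from this patch): an allocation
	 * path syncs its view of mems_allowed before picking nodes. */
	static int example_node_choice(void)
	{
		cpuset_update_current_mems_allowed();	/* may nest cpuset_down()/cpuset_up() */
		return first_node(current->mems_allowed);	/* then consult mems_allowed */
	}
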
@@ -1669,14 +1689,14 @@ int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask)
 		return 0;
 
 	/* Not hardwall and node outside mems_allowed: scan up cpusets */
-	down(&cpuset_sem);
+	cpuset_down(&cpuset_sem);
 	cs = current->cpuset;
 	if (!cs)
 		goto done;	/* current task exiting */
 	cs = nearest_exclusive_ancestor(cs);
 	allowed = node_isset(node, cs->mems_allowed);
 done:
-	up(&cpuset_sem);
+	cpuset_up(&cpuset_sem);
 	return allowed;
 }
 
@@ -1697,7 +1717,7 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
 	const struct cpuset *cs1, *cs2;	/* my and p's cpuset ancestors */
 	int overlap = 0;		/* do cpusets overlap? */
 
-	down(&cpuset_sem);
+	cpuset_down(&cpuset_sem);
 	cs1 = current->cpuset;
 	if (!cs1)
 		goto done;	/* current task exiting */
@@ -1708,7 +1728,7 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
 	cs2 = nearest_exclusive_ancestor(cs2);
 	overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
 done:
-	up(&cpuset_sem);
+	cpuset_up(&cpuset_sem);
 
 	return overlap;
 }
@@ -1731,7 +1751,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
 		return -ENOMEM;
 
 	tsk = m->private;
-	down(&cpuset_sem);
+	cpuset_down(&cpuset_sem);
 	task_lock(tsk);
 	cs = tsk->cpuset;
 	task_unlock(tsk);
@@ -1746,7 +1766,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
 	seq_puts(m, buf);
 	seq_putc(m, '\n');
 out:
-	up(&cpuset_sem);
+	cpuset_up(&cpuset_sem);
 	kfree(buf);
 	return retval;
 }