diff options
-rw-r--r-- | kernel/cpuset.c | 105 |
1 files changed, 40 insertions, 65 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b9342f90d28f..cd54dba2be18 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -180,42 +180,6 @@ static struct super_block *cpuset_sb = NULL; | |||
180 | */ | 180 | */ |
181 | 181 | ||
182 | static DECLARE_MUTEX(cpuset_sem); | 182 | static DECLARE_MUTEX(cpuset_sem); |
183 | static struct task_struct *cpuset_sem_owner; | ||
184 | static int cpuset_sem_depth; | ||
185 | |||
186 | /* | ||
187 | * The global cpuset semaphore cpuset_sem can be needed by the | ||
188 | * memory allocator to update a tasks mems_allowed (see the calls | ||
189 | * to cpuset_update_current_mems_allowed()) or to walk up the | ||
190 | * cpuset hierarchy to find a mem_exclusive cpuset see the calls | ||
191 | * to cpuset_excl_nodes_overlap()). | ||
192 | * | ||
193 | * But if the memory allocation is being done by cpuset.c code, it | ||
194 | * usually already holds cpuset_sem. Double tripping on a kernel | ||
195 | * semaphore deadlocks the current task, and any other task that | ||
196 | * subsequently tries to obtain the lock. | ||
197 | * | ||
198 | * Run all up's and down's on cpuset_sem through the following | ||
199 | * wrappers, which will detect this nested locking, and avoid | ||
200 | * deadlocking. | ||
201 | */ | ||
202 | |||
203 | static inline void cpuset_down(struct semaphore *psem) | ||
204 | { | ||
205 | if (cpuset_sem_owner != current) { | ||
206 | down(psem); | ||
207 | cpuset_sem_owner = current; | ||
208 | } | ||
209 | cpuset_sem_depth++; | ||
210 | } | ||
211 | |||
212 | static inline void cpuset_up(struct semaphore *psem) | ||
213 | { | ||
214 | if (--cpuset_sem_depth == 0) { | ||
215 | cpuset_sem_owner = NULL; | ||
216 | up(psem); | ||
217 | } | ||
218 | } | ||
219 | 183 | ||
220 | /* | 184 | /* |
221 | * A couple of forward declarations required, due to cyclic reference loop: | 185 | * A couple of forward declarations required, due to cyclic reference loop: |
@@ -558,10 +522,19 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | |||
558 | * Refresh current tasks mems_allowed and mems_generation from | 522 | * Refresh current tasks mems_allowed and mems_generation from |
559 | * current tasks cpuset. Call with cpuset_sem held. | 523 | * current tasks cpuset. Call with cpuset_sem held. |
560 | * | 524 | * |
561 | * This routine is needed to update the per-task mems_allowed | 525 | * Be sure to call refresh_mems() on any cpuset operation which |
562 | * data, within the tasks context, when it is trying to allocate | 526 | * (1) holds cpuset_sem, and (2) might possibly alloc memory. |
563 | * memory (in various mm/mempolicy.c routines) and notices | 527 | * Call after obtaining cpuset_sem lock, before any possible |
564 | * that some other task has been modifying its cpuset. | 528 | * allocation. Otherwise one risks trying to allocate memory |
529 | * while the task cpuset_mems_generation is not the same as | ||
530 | * the mems_generation in its cpuset, which would deadlock on | ||
531 | * cpuset_sem in cpuset_update_current_mems_allowed(). | ||
532 | * | ||
533 | * Since we hold cpuset_sem, once refresh_mems() is called, the | ||
534 | * test (current->cpuset_mems_generation != cs->mems_generation) | ||
535 | * in cpuset_update_current_mems_allowed() will remain false, | ||
536 | * until we drop cpuset_sem. Anyone else who would change our | ||
537 | * cpusets mems_generation needs to lock cpuset_sem first. | ||
565 | */ | 538 | */ |
566 | 539 | ||
567 | static void refresh_mems(void) | 540 | static void refresh_mems(void) |
@@ -867,7 +840,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
867 | } | 840 | } |
868 | buffer[nbytes] = 0; /* nul-terminate */ | 841 | buffer[nbytes] = 0; /* nul-terminate */ |
869 | 842 | ||
870 | cpuset_down(&cpuset_sem); | 843 | down(&cpuset_sem); |
871 | 844 | ||
872 | if (is_removed(cs)) { | 845 | if (is_removed(cs)) { |
873 | retval = -ENODEV; | 846 | retval = -ENODEV; |
@@ -901,7 +874,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
901 | if (retval == 0) | 874 | if (retval == 0) |
902 | retval = nbytes; | 875 | retval = nbytes; |
903 | out2: | 876 | out2: |
904 | cpuset_up(&cpuset_sem); | 877 | up(&cpuset_sem); |
905 | cpuset_release_agent(pathbuf); | 878 | cpuset_release_agent(pathbuf); |
906 | out1: | 879 | out1: |
907 | kfree(buffer); | 880 | kfree(buffer); |
@@ -941,9 +914,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) | |||
941 | { | 914 | { |
942 | cpumask_t mask; | 915 | cpumask_t mask; |
943 | 916 | ||
944 | cpuset_down(&cpuset_sem); | 917 | down(&cpuset_sem); |
945 | mask = cs->cpus_allowed; | 918 | mask = cs->cpus_allowed; |
946 | cpuset_up(&cpuset_sem); | 919 | up(&cpuset_sem); |
947 | 920 | ||
948 | return cpulist_scnprintf(page, PAGE_SIZE, mask); | 921 | return cpulist_scnprintf(page, PAGE_SIZE, mask); |
949 | } | 922 | } |
@@ -952,9 +925,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) | |||
952 | { | 925 | { |
953 | nodemask_t mask; | 926 | nodemask_t mask; |
954 | 927 | ||
955 | cpuset_down(&cpuset_sem); | 928 | down(&cpuset_sem); |
956 | mask = cs->mems_allowed; | 929 | mask = cs->mems_allowed; |
957 | cpuset_up(&cpuset_sem); | 930 | up(&cpuset_sem); |
958 | 931 | ||
959 | return nodelist_scnprintf(page, PAGE_SIZE, mask); | 932 | return nodelist_scnprintf(page, PAGE_SIZE, mask); |
960 | } | 933 | } |
@@ -1351,7 +1324,8 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
1351 | if (!cs) | 1324 | if (!cs) |
1352 | return -ENOMEM; | 1325 | return -ENOMEM; |
1353 | 1326 | ||
1354 | cpuset_down(&cpuset_sem); | 1327 | down(&cpuset_sem); |
1328 | refresh_mems(); | ||
1355 | cs->flags = 0; | 1329 | cs->flags = 0; |
1356 | if (notify_on_release(parent)) | 1330 | if (notify_on_release(parent)) |
1357 | set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); | 1331 | set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); |
@@ -1376,14 +1350,14 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
1376 | * will down() this new directory's i_sem and if we race with | 1350 | * will down() this new directory's i_sem and if we race with |
1377 | * another mkdir, we might deadlock. | 1351 | * another mkdir, we might deadlock. |
1378 | */ | 1352 | */ |
1379 | cpuset_up(&cpuset_sem); | 1353 | up(&cpuset_sem); |
1380 | 1354 | ||
1381 | err = cpuset_populate_dir(cs->dentry); | 1355 | err = cpuset_populate_dir(cs->dentry); |
1382 | /* If err < 0, we have a half-filled directory - oh well ;) */ | 1356 | /* If err < 0, we have a half-filled directory - oh well ;) */ |
1383 | return 0; | 1357 | return 0; |
1384 | err: | 1358 | err: |
1385 | list_del(&cs->sibling); | 1359 | list_del(&cs->sibling); |
1386 | cpuset_up(&cpuset_sem); | 1360 | up(&cpuset_sem); |
1387 | kfree(cs); | 1361 | kfree(cs); |
1388 | return err; | 1362 | return err; |
1389 | } | 1363 | } |
@@ -1405,13 +1379,14 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
1405 | 1379 | ||
1406 | /* the vfs holds both inode->i_sem already */ | 1380 | /* the vfs holds both inode->i_sem already */ |
1407 | 1381 | ||
1408 | cpuset_down(&cpuset_sem); | 1382 | down(&cpuset_sem); |
1383 | refresh_mems(); | ||
1409 | if (atomic_read(&cs->count) > 0) { | 1384 | if (atomic_read(&cs->count) > 0) { |
1410 | cpuset_up(&cpuset_sem); | 1385 | up(&cpuset_sem); |
1411 | return -EBUSY; | 1386 | return -EBUSY; |
1412 | } | 1387 | } |
1413 | if (!list_empty(&cs->children)) { | 1388 | if (!list_empty(&cs->children)) { |
1414 | cpuset_up(&cpuset_sem); | 1389 | up(&cpuset_sem); |
1415 | return -EBUSY; | 1390 | return -EBUSY; |
1416 | } | 1391 | } |
1417 | parent = cs->parent; | 1392 | parent = cs->parent; |
@@ -1427,7 +1402,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
1427 | spin_unlock(&d->d_lock); | 1402 | spin_unlock(&d->d_lock); |
1428 | cpuset_d_remove_dir(d); | 1403 | cpuset_d_remove_dir(d); |
1429 | dput(d); | 1404 | dput(d); |
1430 | cpuset_up(&cpuset_sem); | 1405 | up(&cpuset_sem); |
1431 | cpuset_release_agent(pathbuf); | 1406 | cpuset_release_agent(pathbuf); |
1432 | return 0; | 1407 | return 0; |
1433 | } | 1408 | } |
@@ -1530,10 +1505,10 @@ void cpuset_exit(struct task_struct *tsk) | |||
1530 | if (notify_on_release(cs)) { | 1505 | if (notify_on_release(cs)) { |
1531 | char *pathbuf = NULL; | 1506 | char *pathbuf = NULL; |
1532 | 1507 | ||
1533 | cpuset_down(&cpuset_sem); | 1508 | down(&cpuset_sem); |
1534 | if (atomic_dec_and_test(&cs->count)) | 1509 | if (atomic_dec_and_test(&cs->count)) |
1535 | check_for_release(cs, &pathbuf); | 1510 | check_for_release(cs, &pathbuf); |
1536 | cpuset_up(&cpuset_sem); | 1511 | up(&cpuset_sem); |
1537 | cpuset_release_agent(pathbuf); | 1512 | cpuset_release_agent(pathbuf); |
1538 | } else { | 1513 | } else { |
1539 | atomic_dec(&cs->count); | 1514 | atomic_dec(&cs->count); |
@@ -1554,11 +1529,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) | |||
1554 | { | 1529 | { |
1555 | cpumask_t mask; | 1530 | cpumask_t mask; |
1556 | 1531 | ||
1557 | cpuset_down(&cpuset_sem); | 1532 | down(&cpuset_sem); |
1558 | task_lock((struct task_struct *)tsk); | 1533 | task_lock((struct task_struct *)tsk); |
1559 | guarantee_online_cpus(tsk->cpuset, &mask); | 1534 | guarantee_online_cpus(tsk->cpuset, &mask); |
1560 | task_unlock((struct task_struct *)tsk); | 1535 | task_unlock((struct task_struct *)tsk); |
1561 | cpuset_up(&cpuset_sem); | 1536 | up(&cpuset_sem); |
1562 | 1537 | ||
1563 | return mask; | 1538 | return mask; |
1564 | } | 1539 | } |
@@ -1583,9 +1558,9 @@ void cpuset_update_current_mems_allowed(void) | |||
1583 | if (!cs) | 1558 | if (!cs) |
1584 | return; /* task is exiting */ | 1559 | return; /* task is exiting */ |
1585 | if (current->cpuset_mems_generation != cs->mems_generation) { | 1560 | if (current->cpuset_mems_generation != cs->mems_generation) { |
1586 | cpuset_down(&cpuset_sem); | 1561 | down(&cpuset_sem); |
1587 | refresh_mems(); | 1562 | refresh_mems(); |
1588 | cpuset_up(&cpuset_sem); | 1563 | up(&cpuset_sem); |
1589 | } | 1564 | } |
1590 | } | 1565 | } |
1591 | 1566 | ||
@@ -1684,14 +1659,14 @@ int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) | |||
1684 | return 0; | 1659 | return 0; |
1685 | 1660 | ||
1686 | /* Not hardwall and node outside mems_allowed: scan up cpusets */ | 1661 | /* Not hardwall and node outside mems_allowed: scan up cpusets */ |
1687 | cpuset_down(&cpuset_sem); | 1662 | down(&cpuset_sem); |
1688 | cs = current->cpuset; | 1663 | cs = current->cpuset; |
1689 | if (!cs) | 1664 | if (!cs) |
1690 | goto done; /* current task exiting */ | 1665 | goto done; /* current task exiting */ |
1691 | cs = nearest_exclusive_ancestor(cs); | 1666 | cs = nearest_exclusive_ancestor(cs); |
1692 | allowed = node_isset(node, cs->mems_allowed); | 1667 | allowed = node_isset(node, cs->mems_allowed); |
1693 | done: | 1668 | done: |
1694 | cpuset_up(&cpuset_sem); | 1669 | up(&cpuset_sem); |
1695 | return allowed; | 1670 | return allowed; |
1696 | } | 1671 | } |
1697 | 1672 | ||
@@ -1712,7 +1687,7 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p) | |||
1712 | const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ | 1687 | const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ |
1713 | int overlap = 0; /* do cpusets overlap? */ | 1688 | int overlap = 0; /* do cpusets overlap? */ |
1714 | 1689 | ||
1715 | cpuset_down(&cpuset_sem); | 1690 | down(&cpuset_sem); |
1716 | cs1 = current->cpuset; | 1691 | cs1 = current->cpuset; |
1717 | if (!cs1) | 1692 | if (!cs1) |
1718 | goto done; /* current task exiting */ | 1693 | goto done; /* current task exiting */ |
@@ -1723,7 +1698,7 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p) | |||
1723 | cs2 = nearest_exclusive_ancestor(cs2); | 1698 | cs2 = nearest_exclusive_ancestor(cs2); |
1724 | overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); | 1699 | overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); |
1725 | done: | 1700 | done: |
1726 | cpuset_up(&cpuset_sem); | 1701 | up(&cpuset_sem); |
1727 | 1702 | ||
1728 | return overlap; | 1703 | return overlap; |
1729 | } | 1704 | } |
@@ -1746,7 +1721,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) | |||
1746 | return -ENOMEM; | 1721 | return -ENOMEM; |
1747 | 1722 | ||
1748 | tsk = m->private; | 1723 | tsk = m->private; |
1749 | cpuset_down(&cpuset_sem); | 1724 | down(&cpuset_sem); |
1750 | task_lock(tsk); | 1725 | task_lock(tsk); |
1751 | cs = tsk->cpuset; | 1726 | cs = tsk->cpuset; |
1752 | task_unlock(tsk); | 1727 | task_unlock(tsk); |
@@ -1761,7 +1736,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) | |||
1761 | seq_puts(m, buf); | 1736 | seq_puts(m, buf); |
1762 | seq_putc(m, '\n'); | 1737 | seq_putc(m, '\n'); |
1763 | out: | 1738 | out: |
1764 | cpuset_up(&cpuset_sem); | 1739 | up(&cpuset_sem); |
1765 | kfree(buf); | 1740 | kfree(buf); |
1766 | return retval; | 1741 | return retval; |
1767 | } | 1742 | } |