Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r--	kernel/cpuset.c	109
1 file changed, 69 insertions(+), 40 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1f06e7690106..79866bc6b3a1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -180,6 +180,42 @@ static struct super_block *cpuset_sb = NULL;
  */
 
 static DECLARE_MUTEX(cpuset_sem);
+static struct task_struct *cpuset_sem_owner;
+static int cpuset_sem_depth;
+
+/*
+ * The global cpuset semaphore cpuset_sem can be needed by the
+ * memory allocator to update a task's mems_allowed (see the calls
+ * to cpuset_update_current_mems_allowed()) or to walk up the
+ * cpuset hierarchy to find a mem_exclusive cpuset (see the calls
+ * to cpuset_excl_nodes_overlap()).
+ *
+ * But if the memory allocation is being done by cpuset.c code, it
+ * usually already holds cpuset_sem. Double tripping on a kernel
+ * semaphore deadlocks the current task, and any other task that
+ * subsequently tries to obtain the lock.
+ *
+ * Run all up's and down's on cpuset_sem through the following
+ * wrappers, which will detect this nested locking, and avoid
+ * deadlocking.
+ */
+
+static inline void cpuset_down(struct semaphore *psem)
+{
+	if (cpuset_sem_owner != current) {
+		down(psem);
+		cpuset_sem_owner = current;
+	}
+	cpuset_sem_depth++;
+}
+
+static inline void cpuset_up(struct semaphore *psem)
+{
+	if (--cpuset_sem_depth == 0) {
+		cpuset_sem_owner = NULL;
+		up(psem);
+	}
+}
 
 /*
  * A couple of forward declarations required, due to cyclic reference loop:
@@ -522,19 +558,10 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
  * Refresh the current task's mems_allowed and mems_generation from
  * the current task's cpuset. Call with cpuset_sem held.
  *
- * Be sure to call refresh_mems() on any cpuset operation which
- * (1) holds cpuset_sem, and (2) might possibly alloc memory.
- * Call after obtaining cpuset_sem lock, before any possible
- * allocation.  Otherwise one risks trying to allocate memory
- * while the task cpuset_mems_generation is not the same as
- * the mems_generation in its cpuset, which would deadlock on
- * cpuset_sem in cpuset_update_current_mems_allowed().
- *
- * Since we hold cpuset_sem, once refresh_mems() is called, the
- * test (current->cpuset_mems_generation != cs->mems_generation)
- * in cpuset_update_current_mems_allowed() will remain false,
- * until we drop cpuset_sem.  Anyone else who would change our
- * cpusets mems_generation needs to lock cpuset_sem first.
+ * This routine is needed to update the per-task mems_allowed
+ * data, within the task's context, when it is trying to allocate
+ * memory (in various mm/mempolicy.c routines) and notices
+ * that some other task has been modifying its cpuset.
  */
 
 static void refresh_mems(void)
@@ -840,7 +867,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	}
 	buffer[nbytes] = 0;	/* nul-terminate */
 
-	down(&cpuset_sem);
+	cpuset_down(&cpuset_sem);
 
 	if (is_removed(cs)) {
 		retval = -ENODEV;
@@ -874,7 +901,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	if (retval == 0)
 		retval = nbytes;
 out2:
-	up(&cpuset_sem);
+	cpuset_up(&cpuset_sem);
 	cpuset_release_agent(pathbuf);
 out1:
 	kfree(buffer);
@@ -914,9 +941,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
 {
 	cpumask_t mask;
 
-	down(&cpuset_sem);
+	cpuset_down(&cpuset_sem);
 	mask = cs->cpus_allowed;
-	up(&cpuset_sem);
+	cpuset_up(&cpuset_sem);
 
 	return cpulist_scnprintf(page, PAGE_SIZE, mask);
 }
@@ -925,9 +952,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
 {
 	nodemask_t mask;
 
-	down(&cpuset_sem);
+	cpuset_down(&cpuset_sem);
 	mask = cs->mems_allowed;
-	up(&cpuset_sem);
+	cpuset_up(&cpuset_sem);
 
 	return nodelist_scnprintf(page, PAGE_SIZE, mask);
 }
@@ -972,6 +999,10 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
 	*s++ = '\n';
 	*s = '\0';
 
+	/* Do nothing if *ppos is at or past eof. */
+	if (s - page <= *ppos)
+		return 0;
+
 	start = page + *ppos;
 	n = s - start;
 	retval = n - copy_to_user(buf, start, min(n, nbytes));
@@ -1330,8 +1361,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 	if (!cs)
 		return -ENOMEM;
 
-	down(&cpuset_sem);
-	refresh_mems();
+	cpuset_down(&cpuset_sem);
 	cs->flags = 0;
 	if (notify_on_release(parent))
 		set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
@@ -1356,14 +1386,14 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 	 * will down() this new directory's i_sem and if we race with
 	 * another mkdir, we might deadlock.
 	 */
-	up(&cpuset_sem);
+	cpuset_up(&cpuset_sem);
 
 	err = cpuset_populate_dir(cs->dentry);
 	/* If err < 0, we have a half-filled directory - oh well ;) */
 	return 0;
 err:
 	list_del(&cs->sibling);
-	up(&cpuset_sem);
+	cpuset_up(&cpuset_sem);
 	kfree(cs);
 	return err;
 }
@@ -1385,14 +1415,13 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 
 	/* the vfs holds both inode->i_sem already */
 
-	down(&cpuset_sem);
-	refresh_mems();
+	cpuset_down(&cpuset_sem);
 	if (atomic_read(&cs->count) > 0) {
-		up(&cpuset_sem);
+		cpuset_up(&cpuset_sem);
 		return -EBUSY;
 	}
 	if (!list_empty(&cs->children)) {
-		up(&cpuset_sem);
+		cpuset_up(&cpuset_sem);
 		return -EBUSY;
 	}
 	parent = cs->parent;
@@ -1408,7 +1437,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	spin_unlock(&d->d_lock);
 	cpuset_d_remove_dir(d);
 	dput(d);
-	up(&cpuset_sem);
+	cpuset_up(&cpuset_sem);
 	cpuset_release_agent(pathbuf);
 	return 0;
 }
@@ -1511,10 +1540,10 @@ void cpuset_exit(struct task_struct *tsk)
 	if (notify_on_release(cs)) {
 		char *pathbuf = NULL;
 
-		down(&cpuset_sem);
+		cpuset_down(&cpuset_sem);
 		if (atomic_dec_and_test(&cs->count))
 			check_for_release(cs, &pathbuf);
-		up(&cpuset_sem);
+		cpuset_up(&cpuset_sem);
 		cpuset_release_agent(pathbuf);
 	} else {
 		atomic_dec(&cs->count);
@@ -1535,11 +1564,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk)
 {
 	cpumask_t mask;
 
-	down(&cpuset_sem);
+	cpuset_down(&cpuset_sem);
 	task_lock((struct task_struct *)tsk);
 	guarantee_online_cpus(tsk->cpuset, &mask);
 	task_unlock((struct task_struct *)tsk);
-	up(&cpuset_sem);
+	cpuset_up(&cpuset_sem);
 
 	return mask;
 }
@@ -1564,9 +1593,9 @@ void cpuset_update_current_mems_allowed(void)
 	if (!cs)
 		return;		/* task is exiting */
 	if (current->cpuset_mems_generation != cs->mems_generation) {
-		down(&cpuset_sem);
+		cpuset_down(&cpuset_sem);
 		refresh_mems();
-		up(&cpuset_sem);
+		cpuset_up(&cpuset_sem);
 	}
 }
 
@@ -1665,14 +1694,14 @@ int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask)
 		return 0;
 
 	/* Not hardwall and node outside mems_allowed: scan up cpusets */
-	down(&cpuset_sem);
+	cpuset_down(&cpuset_sem);
 	cs = current->cpuset;
 	if (!cs)
 		goto done;	/* current task exiting */
 	cs = nearest_exclusive_ancestor(cs);
 	allowed = node_isset(node, cs->mems_allowed);
 done:
-	up(&cpuset_sem);
+	cpuset_up(&cpuset_sem);
 	return allowed;
 }
 
@@ -1693,7 +1722,7 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
 	const struct cpuset *cs1, *cs2;	/* my and p's cpuset ancestors */
 	int overlap = 0;		/* do cpusets overlap? */
 
-	down(&cpuset_sem);
+	cpuset_down(&cpuset_sem);
 	cs1 = current->cpuset;
 	if (!cs1)
 		goto done;	/* current task exiting */
@@ -1704,7 +1733,7 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
 	cs2 = nearest_exclusive_ancestor(cs2);
 	overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
 done:
-	up(&cpuset_sem);
+	cpuset_up(&cpuset_sem);
 
 	return overlap;
 }
@@ -1727,7 +1756,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
 		return -ENOMEM;
 
 	tsk = m->private;
-	down(&cpuset_sem);
+	cpuset_down(&cpuset_sem);
 	task_lock(tsk);
 	cs = tsk->cpuset;
 	task_unlock(tsk);
@@ -1742,7 +1771,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
 	seq_puts(m, buf);
 	seq_putc(m, '\n');
 out:
-	up(&cpuset_sem);
+	cpuset_up(&cpuset_sem);
 	kfree(buf);
 	return retval;
 }
