Diffstat (limited to 'kernel/cpuset.c')

 -rw-r--r--  kernel/cpuset.c | 109
 1 file changed, 69 insertions(+), 40 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1f06e7690106..79866bc6b3a1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -180,6 +180,42 @@ static struct super_block *cpuset_sb = NULL;
  */
 
 static DECLARE_MUTEX(cpuset_sem);
+static struct task_struct *cpuset_sem_owner;
+static int cpuset_sem_depth;
+
+/*
+ * The global cpuset semaphore cpuset_sem can be needed by the
+ * memory allocator to update a task's mems_allowed (see the calls
+ * to cpuset_update_current_mems_allowed()) or to walk up the
+ * cpuset hierarchy to find a mem_exclusive cpuset (see the calls
+ * to cpuset_excl_nodes_overlap()).
+ *
+ * But if the memory allocation is being done by cpuset.c code, it
+ * usually already holds cpuset_sem.  Double tripping on a kernel
+ * semaphore deadlocks the current task, and any other task that
+ * subsequently tries to obtain the lock.
+ *
+ * Run all up's and down's on cpuset_sem through the following
+ * wrappers, which will detect this nested locking, and avoid
+ * deadlocking.
+ */
+
+static inline void cpuset_down(struct semaphore *psem)
+{
+	if (cpuset_sem_owner != current) {
+		down(psem);
+		cpuset_sem_owner = current;
+	}
+	cpuset_sem_depth++;
+}
+
+static inline void cpuset_up(struct semaphore *psem)
+{
+	if (--cpuset_sem_depth == 0) {
+		cpuset_sem_owner = NULL;
+		up(psem);
+	}
+}
 
 /*
  * A couple of forward declarations required, due to cyclic reference loop:
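The wrappers above implement a classic owner/depth recursive lock: the first down() by a task records it as owner, nested down()s only bump a depth counter, and the final up() clears the owner and releases the semaphore. Purely for illustration, and not part of the patch, here is a hypothetical user-space analogue built on pthreads (all names invented; pthread_t has no NULL, so the depth counter doubles as the "unowned" test):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t big_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_t big_lock_owner;	/* meaningful only while depth > 0 */
static int big_lock_depth;		/* nesting depth of the owning thread */

static void big_lock_down(void)
{
	/*
	 * As in the kernel version, a non-owner may read a stale
	 * owner/depth here; at worst it takes the slow path and
	 * blocks on the mutex, which is the correct behavior.
	 */
	if (big_lock_depth == 0 ||
	    !pthread_equal(big_lock_owner, pthread_self())) {
		pthread_mutex_lock(&big_lock);
		big_lock_owner = pthread_self();
	}
	big_lock_depth++;
}

static void big_lock_up(void)
{
	if (--big_lock_depth == 0)
		pthread_mutex_unlock(&big_lock);
}

static void nested(void)
{
	big_lock_down();	/* second acquire by the owner: no deadlock */
	puts("depth 2, still running");
	big_lock_up();
}

int main(void)
{
	big_lock_down();
	nested();
	big_lock_up();
	return 0;
}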
@@ -522,19 +558,10 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
  * Refresh current tasks mems_allowed and mems_generation from
  * current tasks cpuset.  Call with cpuset_sem held.
  *
- * Be sure to call refresh_mems() on any cpuset operation which
- * (1) holds cpuset_sem, and (2) might possibly alloc memory.
- * Call after obtaining cpuset_sem lock, before any possible
- * allocation.  Otherwise one risks trying to allocate memory
- * while the task cpuset_mems_generation is not the same as
- * the mems_generation in its cpuset, which would deadlock on
- * cpuset_sem in cpuset_update_current_mems_allowed().
- *
- * Since we hold cpuset_sem, once refresh_mems() is called, the
- * test (current->cpuset_mems_generation != cs->mems_generation)
- * in cpuset_update_current_mems_allowed() will remain false,
- * until we drop cpuset_sem.  Anyone else who would change our
- * cpusets mems_generation needs to lock cpuset_sem first.
+ * This routine is needed to update the per-task mems_allowed
+ * data, within the task's context, when it is trying to allocate
+ * memory (in various mm/mempolicy.c routines) and notices
+ * that some other task has been modifying its cpuset.
  */
 
 static void refresh_mems(void)
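The generation scheme this comment refers to keeps the allocator's fast path cheap: each task caches the generation its mems_allowed copy was computed from, and only takes the lock to refresh when its cpuset's generation has moved on. A minimal sketch of the pattern, using invented toy types rather than the kernel's:

#include <pthread.h>

struct toy_cpuset {
	pthread_mutex_t lock;
	unsigned long mems_allowed;	/* stand-in for a nodemask */
	int mems_generation;		/* bumped on every modification */
};

struct toy_task {
	unsigned long mems_allowed;	/* per-task cached copy */
	int mems_generation;		/* generation the cache reflects */
};

/* Hot path: one integer compare; the lock is taken only on mismatch. */
static void toy_update_mems_allowed(struct toy_task *t, struct toy_cpuset *cs)
{
	if (t->mems_generation != cs->mems_generation) {
		pthread_mutex_lock(&cs->lock);
		t->mems_allowed = cs->mems_allowed;
		t->mems_generation = cs->mems_generation;
		pthread_mutex_unlock(&cs->lock);
	}
}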
@@ -840,7 +867,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	}
 	buffer[nbytes] = 0;	/* nul-terminate */
 
-	down(&cpuset_sem);
+	cpuset_down(&cpuset_sem);
 
 	if (is_removed(cs)) {
 		retval = -ENODEV;
@@ -874,7 +901,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	if (retval == 0)
 		retval = nbytes;
 out2:
-	up(&cpuset_sem);
+	cpuset_up(&cpuset_sem);
 	cpuset_release_agent(pathbuf);
 out1:
 	kfree(buffer);
@@ -914,9 +941,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
 {
 	cpumask_t mask;
 
-	down(&cpuset_sem);
+	cpuset_down(&cpuset_sem);
 	mask = cs->cpus_allowed;
-	up(&cpuset_sem);
+	cpuset_up(&cpuset_sem);
 
 	return cpulist_scnprintf(page, PAGE_SIZE, mask);
 }
@@ -925,9 +952,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
 {
 	nodemask_t mask;
 
-	down(&cpuset_sem);
+	cpuset_down(&cpuset_sem);
 	mask = cs->mems_allowed;
-	up(&cpuset_sem);
+	cpuset_up(&cpuset_sem);
 
 	return nodelist_scnprintf(page, PAGE_SIZE, mask);
 }
@@ -972,6 +999,10 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
 	*s++ = '\n';
 	*s = '\0';
 
+	/* Do nothing if *ppos is at the eof or beyond the eof. */
+	if (s - page <= *ppos)
+		return 0;
+
 	start = page + *ppos;
 	n = s - start;
 	retval = n - copy_to_user(buf, start, min(n, nbytes));
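The four added lines guard cpuset_common_file_read() against a read whose *ppos already sits at or beyond the end of the rendered buffer; without them, n = s - start goes non-positive and the copy_to_user() length is bogus. The same offset handling in a self-contained user-space sketch (names invented):

#include <stddef.h>
#include <string.h>

/* Read up to nbytes from an in-memory buffer of length total,
 * starting at *ppos and advancing it; returns bytes copied. */
static size_t buffer_read(char *dst, size_t nbytes,
			  const char *page, size_t total, size_t *ppos)
{
	size_t n;

	if (total <= *ppos)		/* at or beyond EOF: nothing to do */
		return 0;
	n = total - *ppos;		/* bytes remaining after the offset */
	if (n > nbytes)
		n = nbytes;
	memcpy(dst, page + *ppos, n);
	*ppos += n;
	return n;
}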
@@ -1330,8 +1361,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 	if (!cs)
 		return -ENOMEM;
 
-	down(&cpuset_sem);
-	refresh_mems();
+	cpuset_down(&cpuset_sem);
 	cs->flags = 0;
 	if (notify_on_release(parent))
 		set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
@@ -1356,14 +1386,14 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 	 * will down() this new directory's i_sem and if we race with
 	 * another mkdir, we might deadlock.
 	 */
-	up(&cpuset_sem);
+	cpuset_up(&cpuset_sem);
 
 	err = cpuset_populate_dir(cs->dentry);
 	/* If err < 0, we have a half-filled directory - oh well ;) */
 	return 0;
 err:
 	list_del(&cs->sibling);
-	up(&cpuset_sem);
+	cpuset_up(&cpuset_sem);
 	kfree(cs);
 	return err;
 }
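The comment retained in this hunk records the lock-ordering rule at work: cpuset_sem is released before cpuset_populate_dir() takes the new directory's i_sem, since a racing mkdir can hold i_sem while waiting for cpuset_sem. A hypothetical shape of the safe ordering, with stand-in locks rather than kernel code:

#include <pthread.h>

static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER; /* cpuset_sem stand-in */

/* Deadlock-prone shape: take lock_a, then call into code that takes a
 * second lock, while another thread holds that second lock and waits
 * for lock_a (ABBA).  The patch's shape drops lock_a first: */
static void create_dir(void (*populate)(void))
{
	pthread_mutex_lock(&lock_a);
	/* ... link the new object into lists under lock_a ... */
	pthread_mutex_unlock(&lock_a);	/* drop before the directory lock */
	populate();			/* may take the directory lock */
}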
@@ -1385,14 +1415,13 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 
 	/* the vfs holds both inode->i_sem already */
 
-	down(&cpuset_sem);
-	refresh_mems();
+	cpuset_down(&cpuset_sem);
 	if (atomic_read(&cs->count) > 0) {
-		up(&cpuset_sem);
+		cpuset_up(&cpuset_sem);
 		return -EBUSY;
 	}
 	if (!list_empty(&cs->children)) {
-		up(&cpuset_sem);
+		cpuset_up(&cpuset_sem);
 		return -EBUSY;
 	}
 	parent = cs->parent;
@@ -1408,7 +1437,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	spin_unlock(&d->d_lock);
 	cpuset_d_remove_dir(d);
 	dput(d);
-	up(&cpuset_sem);
+	cpuset_up(&cpuset_sem);
 	cpuset_release_agent(pathbuf);
 	return 0;
 }
@@ -1511,10 +1540,10 @@ void cpuset_exit(struct task_struct *tsk)
 	if (notify_on_release(cs)) {
 		char *pathbuf = NULL;
 
-		down(&cpuset_sem);
+		cpuset_down(&cpuset_sem);
 		if (atomic_dec_and_test(&cs->count))
 			check_for_release(cs, &pathbuf);
-		up(&cpuset_sem);
+		cpuset_up(&cpuset_sem);
 		cpuset_release_agent(pathbuf);
 	} else {
 		atomic_dec(&cs->count);
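cpuset_exit() keeps the usual drop-to-zero convention: only the caller whose decrement brings the count to zero performs the release work, and it does so with cpuset_sem held so it cannot race against rmdir. The same convention in a small C11-atomics sketch (invented names; the kernel uses atomic_dec_and_test() instead):

#include <stdatomic.h>

struct toy_ref {
	atomic_int count;
};

/* Returns 1 iff the caller dropped the last reference and therefore
 * owns the release work. */
static int toy_put(struct toy_ref *r)
{
	return atomic_fetch_sub(&r->count, 1) == 1;
}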
@@ -1535,11 +1564,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk)
 {
 	cpumask_t mask;
 
-	down(&cpuset_sem);
+	cpuset_down(&cpuset_sem);
 	task_lock((struct task_struct *)tsk);
 	guarantee_online_cpus(tsk->cpuset, &mask);
 	task_unlock((struct task_struct *)tsk);
-	up(&cpuset_sem);
+	cpuset_up(&cpuset_sem);
 
 	return mask;
 }
@@ -1564,9 +1593,9 @@ void cpuset_update_current_mems_allowed(void)
 	if (!cs)
 		return;		/* task is exiting */
 	if (current->cpuset_mems_generation != cs->mems_generation) {
-		down(&cpuset_sem);
+		cpuset_down(&cpuset_sem);
 		refresh_mems();
-		up(&cpuset_sem);
+		cpuset_up(&cpuset_sem);
 	}
 }
 
@@ -1665,14 +1694,14 @@ int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask)
 		return 0;
 
 	/* Not hardwall and node outside mems_allowed: scan up cpusets */
-	down(&cpuset_sem);
+	cpuset_down(&cpuset_sem);
 	cs = current->cpuset;
 	if (!cs)
 		goto done;	/* current task exiting */
 	cs = nearest_exclusive_ancestor(cs);
 	allowed = node_isset(node, cs->mems_allowed);
 done:
-	up(&cpuset_sem);
+	cpuset_up(&cpuset_sem);
 	return allowed;
 }
 
@@ -1693,7 +1722,7 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
 	const struct cpuset *cs1, *cs2;	/* my and p's cpuset ancestors */
 	int overlap = 0;		/* do cpusets overlap? */
 
-	down(&cpuset_sem);
+	cpuset_down(&cpuset_sem);
 	cs1 = current->cpuset;
 	if (!cs1)
 		goto done;	/* current task exiting */
@@ -1704,7 +1733,7 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
 	cs2 = nearest_exclusive_ancestor(cs2);
 	overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
 done:
-	up(&cpuset_sem);
+	cpuset_up(&cpuset_sem);
 
 	return overlap;
 }
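cpuset_excl_nodes_overlap() is one of the allocator-side callers the new wrappers exist for: it may run while cpuset.c itself already holds cpuset_sem. Its core logic, walking each task up to its nearest mem_exclusive ancestor and intersecting the two node masks, might be sketched like this (toy types, a plain bitmask standing in for nodemask_t):

struct toy_cpuset {
	struct toy_cpuset *parent;	/* NULL at the top cpuset */
	unsigned long mems_allowed;	/* bitmask standing in for nodemask_t */
	int mem_exclusive;
};

static const struct toy_cpuset *nearest_exclusive(const struct toy_cpuset *cs)
{
	while (cs->parent && !cs->mem_exclusive)
		cs = cs->parent;	/* the top cpuset acts as exclusive */
	return cs;
}

static int toy_excl_nodes_overlap(const struct toy_cpuset *a,
				  const struct toy_cpuset *b)
{
	a = nearest_exclusive(a);
	b = nearest_exclusive(b);
	return (a->mems_allowed & b->mems_allowed) != 0;
}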
@@ -1727,7 +1756,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
 		return -ENOMEM;
 
 	tsk = m->private;
-	down(&cpuset_sem);
+	cpuset_down(&cpuset_sem);
 	task_lock(tsk);
 	cs = tsk->cpuset;
 	task_unlock(tsk);
@@ -1742,7 +1771,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
 	seq_puts(m, buf);
 	seq_putc(m, '\n');
 out:
-	up(&cpuset_sem);
+	cpuset_up(&cpuset_sem);
 	kfree(buf);
 	return retval;
 }