Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile       |   1
-rw-r--r--  kernel/cgroup.c       | 318
-rw-r--r--  kernel/cpuset.c       | 394
-rw-r--r--  kernel/fork.c         |  11
-rw-r--r--  kernel/kexec.c        |  18
-rw-r--r--  kernel/panic.c        |   5
-rw-r--r--  kernel/pid.c          |   1
-rw-r--r--  kernel/power/Kconfig  |   9
-rw-r--r--  kernel/res_counter.c  | 134
-rw-r--r--  kernel/sysctl.c       |   9
10 files changed, 640 insertions, 260 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 135a1b943446..685697c0a181 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
| @@ -43,6 +43,7 @@ obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o | |||
| 43 | obj-$(CONFIG_CPUSETS) += cpuset.o | 43 | obj-$(CONFIG_CPUSETS) += cpuset.o |
| 44 | obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o | 44 | obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o |
| 45 | obj-$(CONFIG_IKCONFIG) += configs.o | 45 | obj-$(CONFIG_IKCONFIG) += configs.o |
| 46 | obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o | ||
| 46 | obj-$(CONFIG_STOP_MACHINE) += stop_machine.o | 47 | obj-$(CONFIG_STOP_MACHINE) += stop_machine.o |
| 47 | obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o | 48 | obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o |
| 48 | obj-$(CONFIG_AUDIT) += audit.o auditfilter.o | 49 | obj-$(CONFIG_AUDIT) += audit.o auditfilter.o |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1a3c23936d43..4766bb65e4d9 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
| @@ -141,7 +141,7 @@ enum { | |||
| 141 | ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ | 141 | ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ |
| 142 | }; | 142 | }; |
| 143 | 143 | ||
| 144 | inline int cgroup_is_releasable(const struct cgroup *cgrp) | 144 | static int cgroup_is_releasable(const struct cgroup *cgrp) |
| 145 | { | 145 | { |
| 146 | const int bits = | 146 | const int bits = |
| 147 | (1 << CGRP_RELEASABLE) | | 147 | (1 << CGRP_RELEASABLE) | |
| @@ -149,7 +149,7 @@ inline int cgroup_is_releasable(const struct cgroup *cgrp) | |||
| 149 | return (cgrp->flags & bits) == bits; | 149 | return (cgrp->flags & bits) == bits; |
| 150 | } | 150 | } |
| 151 | 151 | ||
| 152 | inline int notify_on_release(const struct cgroup *cgrp) | 152 | static int notify_on_release(const struct cgroup *cgrp) |
| 153 | { | 153 | { |
| 154 | return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 154 | return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); |
| 155 | } | 155 | } |
| @@ -489,7 +489,7 @@ static struct css_set *find_css_set( | |||
| 489 | * Any task can increment and decrement the count field without lock. | 489 | * Any task can increment and decrement the count field without lock. |
| 490 | * So in general, code holding cgroup_mutex can't rely on the count | 490 | * So in general, code holding cgroup_mutex can't rely on the count |
| 491 | * field not changing. However, if the count goes to zero, then only | 491 | * field not changing. However, if the count goes to zero, then only |
| 492 | * attach_task() can increment it again. Because a count of zero | 492 | * cgroup_attach_task() can increment it again. Because a count of zero |
| 493 | * means that no tasks are currently attached, therefore there is no | 493 | * means that no tasks are currently attached, therefore there is no |
| 494 | * way a task attached to that cgroup can fork (the other way to | 494 | * way a task attached to that cgroup can fork (the other way to |
| 495 | * increment the count). So code holding cgroup_mutex can safely | 495 | * increment the count). So code holding cgroup_mutex can safely |
| @@ -520,17 +520,17 @@ static struct css_set *find_css_set( | |||
| 520 | * The task_lock() exception | 520 | * The task_lock() exception |
| 521 | * | 521 | * |
| 522 | * The need for this exception arises from the action of | 522 | * The need for this exception arises from the action of |
| 523 | * attach_task(), which overwrites one tasks cgroup pointer with | 523 | * cgroup_attach_task(), which overwrites one tasks cgroup pointer with |
| 524 | * another. It does so using cgroup_mutex, however there are | 524 | * another. It does so using cgroup_mutex, however there are |
| 525 | * several performance critical places that need to reference | 525 | * several performance critical places that need to reference |
| 526 | * task->cgroup without the expense of grabbing a system global | 526 | * task->cgroup without the expense of grabbing a system global |
| 527 | * mutex. Therefore except as noted below, when dereferencing or, as | 527 | * mutex. Therefore except as noted below, when dereferencing or, as |
| 528 | * in attach_task(), modifying a task's cgroup pointer we use | 528 | * in cgroup_attach_task(), modifying a task's cgroup pointer we use |
| 529 | * task_lock(), which acts on a spinlock (task->alloc_lock) already in | 529 | * task_lock(), which acts on a spinlock (task->alloc_lock) already in |
| 530 | * the task_struct routinely used for such matters. | 530 | * the task_struct routinely used for such matters. |
| 531 | * | 531 | * |
| 532 | * P.S. One more locking exception. RCU is used to guard the | 532 | * P.S. One more locking exception. RCU is used to guard the |
| 533 | * update of a tasks cgroup pointer by attach_task() | 533 | * update of a tasks cgroup pointer by cgroup_attach_task() |
| 534 | */ | 534 | */ |
| 535 | 535 | ||
| 536 | /** | 536 | /** |
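As a rough illustration of the two access patterns described in the locking comment above (not part of the patch; tsk and cg are placeholder names, while the tsk->cgroups field appears elsewhere in this diff):

	struct css_set *cg;

	/* pin the pointer the way cgroup_attach_task() itself does */
	task_lock(tsk);
	cg = tsk->cgroups;
	/* ... use cg while task_lock() is held ... */
	task_unlock(tsk);

	/* or, on a performance-critical read-only path, rely on the RCU
	 * guard noted above */
	rcu_read_lock();
	cg = rcu_dereference(tsk->cgroups);
	/* ... finish with cg before rcu_read_unlock() ... */
	rcu_read_unlock();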
| @@ -586,11 +586,27 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) | |||
| 586 | return inode; | 586 | return inode; |
| 587 | } | 587 | } |
| 588 | 588 | ||
| 589 | /* | ||
| 590 | * Call subsys's pre_destroy handler. | ||
| 591 | * This is called before css refcnt check. | ||
| 592 | */ | ||
| 593 | |||
| 594 | static void cgroup_call_pre_destroy(struct cgroup *cgrp) | ||
| 595 | { | ||
| 596 | struct cgroup_subsys *ss; | ||
| 597 | for_each_subsys(cgrp->root, ss) | ||
| 598 | if (ss->pre_destroy && cgrp->subsys[ss->subsys_id]) | ||
| 599 | ss->pre_destroy(ss, cgrp); | ||
| 600 | return; | ||
| 601 | } | ||
| 602 | |||
| 603 | |||
| 589 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) | 604 | static void cgroup_diput(struct dentry *dentry, struct inode *inode) |
| 590 | { | 605 | { |
| 591 | /* is dentry a directory ? if so, kfree() associated cgroup */ | 606 | /* is dentry a directory ? if so, kfree() associated cgroup */ |
| 592 | if (S_ISDIR(inode->i_mode)) { | 607 | if (S_ISDIR(inode->i_mode)) { |
| 593 | struct cgroup *cgrp = dentry->d_fsdata; | 608 | struct cgroup *cgrp = dentry->d_fsdata; |
| 609 | struct cgroup_subsys *ss; | ||
| 594 | BUG_ON(!(cgroup_is_removed(cgrp))); | 610 | BUG_ON(!(cgroup_is_removed(cgrp))); |
| 595 | /* It's possible for external users to be holding css | 611 | /* It's possible for external users to be holding css |
| 596 | * reference counts on a cgroup; css_put() needs to | 612 | * reference counts on a cgroup; css_put() needs to |
| @@ -599,6 +615,23 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) | |||
| 599 | * queue the cgroup to be handled by the release | 615 | * queue the cgroup to be handled by the release |
| 600 | * agent */ | 616 | * agent */ |
| 601 | synchronize_rcu(); | 617 | synchronize_rcu(); |
| 618 | |||
| 619 | mutex_lock(&cgroup_mutex); | ||
| 620 | /* | ||
| 621 | * Release the subsystem state objects. | ||
| 622 | */ | ||
| 623 | for_each_subsys(cgrp->root, ss) { | ||
| 624 | if (cgrp->subsys[ss->subsys_id]) | ||
| 625 | ss->destroy(ss, cgrp); | ||
| 626 | } | ||
| 627 | |||
| 628 | cgrp->root->number_of_cgroups--; | ||
| 629 | mutex_unlock(&cgroup_mutex); | ||
| 630 | |||
| 631 | /* Drop the active superblock reference that we took when we | ||
| 632 | * created the cgroup */ | ||
| 633 | deactivate_super(cgrp->root->sb); | ||
| 634 | |||
| 602 | kfree(cgrp); | 635 | kfree(cgrp); |
| 603 | } | 636 | } |
| 604 | iput(inode); | 637 | iput(inode); |
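To make the new pre_destroy/destroy split concrete, a minimal hypothetical subsystem could look roughly like this; the callback signatures match the calls shown above, but the subsystem itself is invented for illustration:

	static void example_pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
	{
		/* called from rmdir, before the css refcount check: drop any
		 * charges or references that keep the refcount elevated */
	}

	static void example_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
	{
		/* free the state allocated in ->create(); with this patch the
		 * call comes from cgroup_diput(), after the dentry is finally
		 * released, instead of directly from cgroup_rmdir() */
	}

	struct cgroup_subsys example_subsys = {
		.name		= "example",
		.pre_destroy	= example_pre_destroy,
		.destroy	= example_destroy,
		/* .create and the other fields are omitted from this sketch */
	};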
| @@ -1161,7 +1194,7 @@ static void get_first_subsys(const struct cgroup *cgrp, | |||
| 1161 | * Call holding cgroup_mutex. May take task_lock of | 1194 | * Call holding cgroup_mutex. May take task_lock of |
| 1162 | * the task 'pid' during call. | 1195 | * the task 'pid' during call. |
| 1163 | */ | 1196 | */ |
| 1164 | static int attach_task(struct cgroup *cgrp, struct task_struct *tsk) | 1197 | int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) |
| 1165 | { | 1198 | { |
| 1166 | int retval = 0; | 1199 | int retval = 0; |
| 1167 | struct cgroup_subsys *ss; | 1200 | struct cgroup_subsys *ss; |
| @@ -1181,9 +1214,8 @@ static int attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
| 1181 | for_each_subsys(root, ss) { | 1214 | for_each_subsys(root, ss) { |
| 1182 | if (ss->can_attach) { | 1215 | if (ss->can_attach) { |
| 1183 | retval = ss->can_attach(ss, cgrp, tsk); | 1216 | retval = ss->can_attach(ss, cgrp, tsk); |
| 1184 | if (retval) { | 1217 | if (retval) |
| 1185 | return retval; | 1218 | return retval; |
| 1186 | } | ||
| 1187 | } | 1219 | } |
| 1188 | } | 1220 | } |
| 1189 | 1221 | ||
| @@ -1192,9 +1224,8 @@ static int attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
| 1192 | * based on its final set of cgroups | 1224 | * based on its final set of cgroups |
| 1193 | */ | 1225 | */ |
| 1194 | newcg = find_css_set(cg, cgrp); | 1226 | newcg = find_css_set(cg, cgrp); |
| 1195 | if (!newcg) { | 1227 | if (!newcg) |
| 1196 | return -ENOMEM; | 1228 | return -ENOMEM; |
| 1197 | } | ||
| 1198 | 1229 | ||
| 1199 | task_lock(tsk); | 1230 | task_lock(tsk); |
| 1200 | if (tsk->flags & PF_EXITING) { | 1231 | if (tsk->flags & PF_EXITING) { |
| @@ -1214,9 +1245,8 @@ static int attach_task(struct cgroup *cgrp, struct task_struct *tsk) | |||
| 1214 | write_unlock(&css_set_lock); | 1245 | write_unlock(&css_set_lock); |
| 1215 | 1246 | ||
| 1216 | for_each_subsys(root, ss) { | 1247 | for_each_subsys(root, ss) { |
| 1217 | if (ss->attach) { | 1248 | if (ss->attach) |
| 1218 | ss->attach(ss, cgrp, oldcgrp, tsk); | 1249 | ss->attach(ss, cgrp, oldcgrp, tsk); |
| 1219 | } | ||
| 1220 | } | 1250 | } |
| 1221 | set_bit(CGRP_RELEASABLE, &oldcgrp->flags); | 1251 | set_bit(CGRP_RELEASABLE, &oldcgrp->flags); |
| 1222 | synchronize_rcu(); | 1252 | synchronize_rcu(); |
| @@ -1239,7 +1269,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf) | |||
| 1239 | 1269 | ||
| 1240 | if (pid) { | 1270 | if (pid) { |
| 1241 | rcu_read_lock(); | 1271 | rcu_read_lock(); |
| 1242 | tsk = find_task_by_pid(pid); | 1272 | tsk = find_task_by_vpid(pid); |
| 1243 | if (!tsk || tsk->flags & PF_EXITING) { | 1273 | if (!tsk || tsk->flags & PF_EXITING) { |
| 1244 | rcu_read_unlock(); | 1274 | rcu_read_unlock(); |
| 1245 | return -ESRCH; | 1275 | return -ESRCH; |
| @@ -1257,7 +1287,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf) | |||
| 1257 | get_task_struct(tsk); | 1287 | get_task_struct(tsk); |
| 1258 | } | 1288 | } |
| 1259 | 1289 | ||
| 1260 | ret = attach_task(cgrp, tsk); | 1290 | ret = cgroup_attach_task(cgrp, tsk); |
| 1261 | put_task_struct(tsk); | 1291 | put_task_struct(tsk); |
| 1262 | return ret; | 1292 | return ret; |
| 1263 | } | 1293 | } |
| @@ -1329,9 +1359,14 @@ static ssize_t cgroup_common_file_write(struct cgroup *cgrp, | |||
| 1329 | goto out1; | 1359 | goto out1; |
| 1330 | } | 1360 | } |
| 1331 | buffer[nbytes] = 0; /* nul-terminate */ | 1361 | buffer[nbytes] = 0; /* nul-terminate */ |
| 1362 | strstrip(buffer); /* strip -just- trailing whitespace */ | ||
| 1332 | 1363 | ||
| 1333 | mutex_lock(&cgroup_mutex); | 1364 | mutex_lock(&cgroup_mutex); |
| 1334 | 1365 | ||
| 1366 | /* | ||
| 1367 | * This was already checked for in cgroup_file_write(), but | ||
| 1368 | * check again now we're holding cgroup_mutex. | ||
| 1369 | */ | ||
| 1335 | if (cgroup_is_removed(cgrp)) { | 1370 | if (cgroup_is_removed(cgrp)) { |
| 1336 | retval = -ENODEV; | 1371 | retval = -ENODEV; |
| 1337 | goto out2; | 1372 | goto out2; |
| @@ -1349,24 +1384,9 @@ static ssize_t cgroup_common_file_write(struct cgroup *cgrp, | |||
| 1349 | clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); | 1384 | clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); |
| 1350 | break; | 1385 | break; |
| 1351 | case FILE_RELEASE_AGENT: | 1386 | case FILE_RELEASE_AGENT: |
| 1352 | { | 1387 | BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); |
| 1353 | struct cgroupfs_root *root = cgrp->root; | 1388 | strcpy(cgrp->root->release_agent_path, buffer); |
| 1354 | /* Strip trailing newline */ | ||
| 1355 | if (nbytes && (buffer[nbytes-1] == '\n')) { | ||
| 1356 | buffer[nbytes-1] = 0; | ||
| 1357 | } | ||
| 1358 | if (nbytes < sizeof(root->release_agent_path)) { | ||
| 1359 | /* We never write anything other than '\0' | ||
| 1360 | * into the last char of release_agent_path, | ||
| 1361 | * so it always remains a NUL-terminated | ||
| 1362 | * string */ | ||
| 1363 | strncpy(root->release_agent_path, buffer, nbytes); | ||
| 1364 | root->release_agent_path[nbytes] = 0; | ||
| 1365 | } else { | ||
| 1366 | retval = -ENOSPC; | ||
| 1367 | } | ||
| 1368 | break; | 1389 | break; |
| 1369 | } | ||
| 1370 | default: | 1390 | default: |
| 1371 | retval = -EINVAL; | 1391 | retval = -EINVAL; |
| 1372 | goto out2; | 1392 | goto out2; |
| @@ -1387,7 +1407,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf, | |||
| 1387 | struct cftype *cft = __d_cft(file->f_dentry); | 1407 | struct cftype *cft = __d_cft(file->f_dentry); |
| 1388 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); | 1408 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); |
| 1389 | 1409 | ||
| 1390 | if (!cft) | 1410 | if (!cft || cgroup_is_removed(cgrp)) |
| 1391 | return -ENODEV; | 1411 | return -ENODEV; |
| 1392 | if (cft->write) | 1412 | if (cft->write) |
| 1393 | return cft->write(cgrp, cft, file, buf, nbytes, ppos); | 1413 | return cft->write(cgrp, cft, file, buf, nbytes, ppos); |
| @@ -1457,7 +1477,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, | |||
| 1457 | struct cftype *cft = __d_cft(file->f_dentry); | 1477 | struct cftype *cft = __d_cft(file->f_dentry); |
| 1458 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); | 1478 | struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); |
| 1459 | 1479 | ||
| 1460 | if (!cft) | 1480 | if (!cft || cgroup_is_removed(cgrp)) |
| 1461 | return -ENODEV; | 1481 | return -ENODEV; |
| 1462 | 1482 | ||
| 1463 | if (cft->read) | 1483 | if (cft->read) |
| @@ -1675,6 +1695,29 @@ static void cgroup_advance_iter(struct cgroup *cgrp, | |||
| 1675 | it->task = cg->tasks.next; | 1695 | it->task = cg->tasks.next; |
| 1676 | } | 1696 | } |
| 1677 | 1697 | ||
| 1698 | /* | ||
| 1699 | * To reduce the fork() overhead for systems that are not actually | ||
| 1700 | * using their cgroups capability, we don't maintain the lists running | ||
| 1701 | * through each css_set to its tasks until we see the list actually | ||
| 1702 | * used - in other words after the first call to cgroup_iter_start(). | ||
| 1703 | * | ||
| 1704 | * The tasklist_lock is not held here, as do_each_thread() and | ||
| 1705 | * while_each_thread() are protected by RCU. | ||
| 1706 | */ | ||
| 1707 | void cgroup_enable_task_cg_lists(void) | ||
| 1708 | { | ||
| 1709 | struct task_struct *p, *g; | ||
| 1710 | write_lock(&css_set_lock); | ||
| 1711 | use_task_css_set_links = 1; | ||
| 1712 | do_each_thread(g, p) { | ||
| 1713 | task_lock(p); | ||
| 1714 | if (list_empty(&p->cg_list)) | ||
| 1715 | list_add(&p->cg_list, &p->cgroups->tasks); | ||
| 1716 | task_unlock(p); | ||
| 1717 | } while_each_thread(g, p); | ||
| 1718 | write_unlock(&css_set_lock); | ||
| 1719 | } | ||
| 1720 | |||
| 1678 | void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) | 1721 | void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) |
| 1679 | { | 1722 | { |
| 1680 | /* | 1723 | /* |
| @@ -1682,18 +1725,9 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) | |||
| 1682 | * we need to enable the list linking each css_set to its | 1725 | * we need to enable the list linking each css_set to its |
| 1683 | * tasks, and fix up all existing tasks. | 1726 | * tasks, and fix up all existing tasks. |
| 1684 | */ | 1727 | */ |
| 1685 | if (!use_task_css_set_links) { | 1728 | if (!use_task_css_set_links) |
| 1686 | struct task_struct *p, *g; | 1729 | cgroup_enable_task_cg_lists(); |
| 1687 | write_lock(&css_set_lock); | 1730 | |
| 1688 | use_task_css_set_links = 1; | ||
| 1689 | do_each_thread(g, p) { | ||
| 1690 | task_lock(p); | ||
| 1691 | if (list_empty(&p->cg_list)) | ||
| 1692 | list_add(&p->cg_list, &p->cgroups->tasks); | ||
| 1693 | task_unlock(p); | ||
| 1694 | } while_each_thread(g, p); | ||
| 1695 | write_unlock(&css_set_lock); | ||
| 1696 | } | ||
| 1697 | read_lock(&css_set_lock); | 1731 | read_lock(&css_set_lock); |
| 1698 | it->cg_link = &cgrp->css_sets; | 1732 | it->cg_link = &cgrp->css_sets; |
| 1699 | cgroup_advance_iter(cgrp, it); | 1733 | cgroup_advance_iter(cgrp, it); |
| @@ -1726,6 +1760,166 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) | |||
| 1726 | read_unlock(&css_set_lock); | 1760 | read_unlock(&css_set_lock); |
| 1727 | } | 1761 | } |
| 1728 | 1762 | ||
| 1763 | static inline int started_after_time(struct task_struct *t1, | ||
| 1764 | struct timespec *time, | ||
| 1765 | struct task_struct *t2) | ||
| 1766 | { | ||
| 1767 | int start_diff = timespec_compare(&t1->start_time, time); | ||
| 1768 | if (start_diff > 0) { | ||
| 1769 | return 1; | ||
| 1770 | } else if (start_diff < 0) { | ||
| 1771 | return 0; | ||
| 1772 | } else { | ||
| 1773 | /* | ||
| 1774 | * Arbitrarily, if two processes started at the same | ||
| 1775 | * time, we'll say that the lower pointer value | ||
| 1776 | * started first. Note that t2 may have exited by now | ||
| 1777 | * so this may not be a valid pointer any longer, but | ||
| 1778 | * that's fine - it still serves to distinguish | ||
| 1779 | * between two tasks started (effectively) simultaneously. | ||
| 1780 | */ | ||
| 1781 | return t1 > t2; | ||
| 1782 | } | ||
| 1783 | } | ||
| 1784 | |||
| 1785 | /* | ||
| 1786 | * This function is a callback from heap_insert() and is used to order | ||
| 1787 | * the heap. | ||
| 1788 | * In this case we order the heap in descending task start time. | ||
| 1789 | */ | ||
| 1790 | static inline int started_after(void *p1, void *p2) | ||
| 1791 | { | ||
| 1792 | struct task_struct *t1 = p1; | ||
| 1793 | struct task_struct *t2 = p2; | ||
| 1794 | return started_after_time(t1, &t2->start_time, t2); | ||
| 1795 | } | ||
| 1796 | |||
| 1797 | /** | ||
| 1798 | * cgroup_scan_tasks - iterate though all the tasks in a cgroup | ||
| 1799 | * @scan: struct cgroup_scanner containing arguments for the scan | ||
| 1800 | * | ||
| 1801 | * Arguments include pointers to callback functions test_task() and | ||
| 1802 | * process_task(). | ||
| 1803 | * Iterate through all the tasks in a cgroup, calling test_task() for each, | ||
| 1804 | * and if it returns true, call process_task() for it also. | ||
| 1805 | * The test_task pointer may be NULL, meaning always true (select all tasks). | ||
| 1806 | * Effectively duplicates cgroup_iter_{start,next,end}() | ||
| 1807 | * but does not lock css_set_lock for the call to process_task(). | ||
| 1808 | * The struct cgroup_scanner may be embedded in any structure of the caller's | ||
| 1809 | * creation. | ||
| 1810 | * It is guaranteed that process_task() will act on every task that | ||
| 1811 | * is a member of the cgroup for the duration of this call. This | ||
| 1812 | * function may or may not call process_task() for tasks that exit | ||
| 1813 | * or move to a different cgroup during the call, or are forked or | ||
| 1814 | * move into the cgroup during the call. | ||
| 1815 | * | ||
| 1816 | * Note that test_task() may be called with locks held, and may in some | ||
| 1817 | * situations be called multiple times for the same task, so it should | ||
| 1818 | * be cheap. | ||
| 1819 | * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been | ||
| 1820 | * pre-allocated and will be used for heap operations (and its "gt" member will | ||
| 1821 | * be overwritten), else a temporary heap will be used (allocation of which | ||
| 1822 | * may cause this function to fail). | ||
| 1823 | */ | ||
| 1824 | int cgroup_scan_tasks(struct cgroup_scanner *scan) | ||
| 1825 | { | ||
| 1826 | int retval, i; | ||
| 1827 | struct cgroup_iter it; | ||
| 1828 | struct task_struct *p, *dropped; | ||
| 1829 | /* Never dereference latest_task, since it's not refcounted */ | ||
| 1830 | struct task_struct *latest_task = NULL; | ||
| 1831 | struct ptr_heap tmp_heap; | ||
| 1832 | struct ptr_heap *heap; | ||
| 1833 | struct timespec latest_time = { 0, 0 }; | ||
| 1834 | |||
| 1835 | if (scan->heap) { | ||
| 1836 | /* The caller supplied our heap and pre-allocated its memory */ | ||
| 1837 | heap = scan->heap; | ||
| 1838 | heap->gt = &started_after; | ||
| 1839 | } else { | ||
| 1840 | /* We need to allocate our own heap memory */ | ||
| 1841 | heap = &tmp_heap; | ||
| 1842 | retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after); | ||
| 1843 | if (retval) | ||
| 1844 | /* cannot allocate the heap */ | ||
| 1845 | return retval; | ||
| 1846 | } | ||
| 1847 | |||
| 1848 | again: | ||
| 1849 | /* | ||
| 1850 | * Scan tasks in the cgroup, using the scanner's "test_task" callback | ||
| 1851 | * to determine which are of interest, and using the scanner's | ||
| 1852 | * "process_task" callback to process any of them that need an update. | ||
| 1853 | * Since we don't want to hold any locks during the task updates, | ||
| 1854 | * gather tasks to be processed in a heap structure. | ||
| 1855 | * The heap is sorted by descending task start time. | ||
| 1856 | * If the statically-sized heap fills up, we overflow tasks that | ||
| 1857 | * started later, and in future iterations only consider tasks that | ||
| 1858 | * started after the latest task in the previous pass. This | ||
| 1859 | * guarantees forward progress and that we don't miss any tasks. | ||
| 1860 | */ | ||
| 1861 | heap->size = 0; | ||
| 1862 | cgroup_iter_start(scan->cg, &it); | ||
| 1863 | while ((p = cgroup_iter_next(scan->cg, &it))) { | ||
| 1864 | /* | ||
| 1865 | * Only affect tasks that qualify per the caller's callback, | ||
| 1866 | * if he provided one | ||
| 1867 | */ | ||
| 1868 | if (scan->test_task && !scan->test_task(p, scan)) | ||
| 1869 | continue; | ||
| 1870 | /* | ||
| 1871 | * Only process tasks that started after the last task | ||
| 1872 | * we processed | ||
| 1873 | */ | ||
| 1874 | if (!started_after_time(p, &latest_time, latest_task)) | ||
| 1875 | continue; | ||
| 1876 | dropped = heap_insert(heap, p); | ||
| 1877 | if (dropped == NULL) { | ||
| 1878 | /* | ||
| 1879 | * The new task was inserted; the heap wasn't | ||
| 1880 | * previously full | ||
| 1881 | */ | ||
| 1882 | get_task_struct(p); | ||
| 1883 | } else if (dropped != p) { | ||
| 1884 | /* | ||
| 1885 | * The new task was inserted, and pushed out a | ||
| 1886 | * different task | ||
| 1887 | */ | ||
| 1888 | get_task_struct(p); | ||
| 1889 | put_task_struct(dropped); | ||
| 1890 | } | ||
| 1891 | /* | ||
| 1892 | * Else the new task was newer than anything already in | ||
| 1893 | * the heap and wasn't inserted | ||
| 1894 | */ | ||
| 1895 | } | ||
| 1896 | cgroup_iter_end(scan->cg, &it); | ||
| 1897 | |||
| 1898 | if (heap->size) { | ||
| 1899 | for (i = 0; i < heap->size; i++) { | ||
| 1900 | struct task_struct *p = heap->ptrs[i]; | ||
| 1901 | if (i == 0) { | ||
| 1902 | latest_time = p->start_time; | ||
| 1903 | latest_task = p; | ||
| 1904 | } | ||
| 1905 | /* Process the task per the caller's callback */ | ||
| 1906 | scan->process_task(p, scan); | ||
| 1907 | put_task_struct(p); | ||
| 1908 | } | ||
| 1909 | /* | ||
| 1910 | * If we had to process any tasks at all, scan again | ||
| 1911 | * in case some of them were in the middle of forking | ||
| 1912 | * children that didn't get processed. | ||
| 1913 | * Not the most efficient way to do it, but it avoids | ||
| 1914 | * having to take callback_mutex in the fork path | ||
| 1915 | */ | ||
| 1916 | goto again; | ||
| 1917 | } | ||
| 1918 | if (heap == &tmp_heap) | ||
| 1919 | heap_free(&tmp_heap); | ||
| 1920 | return 0; | ||
| 1921 | } | ||
| 1922 | |||
| 1729 | /* | 1923 | /* |
| 1730 | * Stuff for reading the 'tasks' file. | 1924 | * Stuff for reading the 'tasks' file. |
| 1731 | * | 1925 | * |
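Typical use of the new cgroup_scan_tasks() interface, mirroring the cpuset conversion later in this patch (the callback names here are placeholders):

	static int example_test(struct task_struct *p, struct cgroup_scanner *scan)
	{
		/* return nonzero for tasks that process_task() should handle */
		return 1;
	}

	static void example_process(struct task_struct *p, struct cgroup_scanner *scan)
	{
		/* act on one task; css_set_lock is not held here */
	}

	static void example_scan(struct cgroup *cgrp)
	{
		struct cgroup_scanner scan;

		scan.cg = cgrp;			/* cgroup whose tasks are walked */
		scan.test_task = example_test;	/* may be NULL to select all tasks */
		scan.process_task = example_process;
		scan.heap = NULL;		/* NULL: use a temporary heap */
		cgroup_scan_tasks(&scan);
	}

Passing a pre-allocated heap instead (as update_cpumask() does further down) avoids the GFP_KERNEL allocation on every call; the heap's "gt" member is overwritten either way.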
| @@ -1761,7 +1955,7 @@ static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) | |||
| 1761 | while ((tsk = cgroup_iter_next(cgrp, &it))) { | 1955 | while ((tsk = cgroup_iter_next(cgrp, &it))) { |
| 1762 | if (unlikely(n == npids)) | 1956 | if (unlikely(n == npids)) |
| 1763 | break; | 1957 | break; |
| 1764 | pidarray[n++] = task_pid_nr(tsk); | 1958 | pidarray[n++] = task_pid_vnr(tsk); |
| 1765 | } | 1959 | } |
| 1766 | cgroup_iter_end(cgrp, &it); | 1960 | cgroup_iter_end(cgrp, &it); |
| 1767 | return n; | 1961 | return n; |
| @@ -2126,9 +2320,8 @@ static inline int cgroup_has_css_refs(struct cgroup *cgrp) | |||
| 2126 | * matter, since it can only happen if the cgroup | 2320 | * matter, since it can only happen if the cgroup |
| 2127 | * has been deleted and hence no longer needs the | 2321 | * has been deleted and hence no longer needs the |
| 2128 | * release agent to be called anyway. */ | 2322 | * release agent to be called anyway. */ |
| 2129 | if (css && atomic_read(&css->refcnt)) { | 2323 | if (css && atomic_read(&css->refcnt)) |
| 2130 | return 1; | 2324 | return 1; |
| 2131 | } | ||
| 2132 | } | 2325 | } |
| 2133 | return 0; | 2326 | return 0; |
| 2134 | } | 2327 | } |
| @@ -2138,7 +2331,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
| 2138 | struct cgroup *cgrp = dentry->d_fsdata; | 2331 | struct cgroup *cgrp = dentry->d_fsdata; |
| 2139 | struct dentry *d; | 2332 | struct dentry *d; |
| 2140 | struct cgroup *parent; | 2333 | struct cgroup *parent; |
| 2141 | struct cgroup_subsys *ss; | ||
| 2142 | struct super_block *sb; | 2334 | struct super_block *sb; |
| 2143 | struct cgroupfs_root *root; | 2335 | struct cgroupfs_root *root; |
| 2144 | 2336 | ||
| @@ -2157,17 +2349,19 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
| 2157 | parent = cgrp->parent; | 2349 | parent = cgrp->parent; |
| 2158 | root = cgrp->root; | 2350 | root = cgrp->root; |
| 2159 | sb = root->sb; | 2351 | sb = root->sb; |
| 2352 | /* | ||
| 2353 | * Call pre_destroy handlers of subsys | ||
| 2354 | */ | ||
| 2355 | cgroup_call_pre_destroy(cgrp); | ||
| 2356 | /* | ||
| 2357 | * Notify subsystems that the rmdir() request has arrived. | ||
| 2358 | */ | ||
| 2160 | 2359 | ||
| 2161 | if (cgroup_has_css_refs(cgrp)) { | 2360 | if (cgroup_has_css_refs(cgrp)) { |
| 2162 | mutex_unlock(&cgroup_mutex); | 2361 | mutex_unlock(&cgroup_mutex); |
| 2163 | return -EBUSY; | 2362 | return -EBUSY; |
| 2164 | } | 2363 | } |
| 2165 | 2364 | ||
| 2166 | for_each_subsys(root, ss) { | ||
| 2167 | if (cgrp->subsys[ss->subsys_id]) | ||
| 2168 | ss->destroy(ss, cgrp); | ||
| 2169 | } | ||
| 2170 | |||
| 2171 | spin_lock(&release_list_lock); | 2365 | spin_lock(&release_list_lock); |
| 2172 | set_bit(CGRP_REMOVED, &cgrp->flags); | 2366 | set_bit(CGRP_REMOVED, &cgrp->flags); |
| 2173 | if (!list_empty(&cgrp->release_list)) | 2367 | if (!list_empty(&cgrp->release_list)) |
| @@ -2182,15 +2376,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
| 2182 | 2376 | ||
| 2183 | cgroup_d_remove_dir(d); | 2377 | cgroup_d_remove_dir(d); |
| 2184 | dput(d); | 2378 | dput(d); |
| 2185 | root->number_of_cgroups--; | ||
| 2186 | 2379 | ||
| 2187 | set_bit(CGRP_RELEASABLE, &parent->flags); | 2380 | set_bit(CGRP_RELEASABLE, &parent->flags); |
| 2188 | check_for_release(parent); | 2381 | check_for_release(parent); |
| 2189 | 2382 | ||
| 2190 | mutex_unlock(&cgroup_mutex); | 2383 | mutex_unlock(&cgroup_mutex); |
| 2191 | /* Drop the active superblock reference that we took when we | ||
| 2192 | * created the cgroup */ | ||
| 2193 | deactivate_super(sb); | ||
| 2194 | return 0; | 2384 | return 0; |
| 2195 | } | 2385 | } |
| 2196 | 2386 | ||
| @@ -2324,7 +2514,7 @@ out: | |||
| 2324 | * - Used for /proc/<pid>/cgroup. | 2514 | * - Used for /proc/<pid>/cgroup. |
| 2325 | * - No need to task_lock(tsk) on this tsk->cgroup reference, as it | 2515 | * - No need to task_lock(tsk) on this tsk->cgroup reference, as it |
| 2326 | * doesn't really matter if tsk->cgroup changes after we read it, | 2516 | * doesn't really matter if tsk->cgroup changes after we read it, |
| 2327 | * and we take cgroup_mutex, keeping attach_task() from changing it | 2517 | * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it |
| 2328 | * anyway. No need to check that tsk->cgroup != NULL, thanks to | 2518 | * anyway. No need to check that tsk->cgroup != NULL, thanks to |
| 2329 | * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks | 2519 | * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks |
| 2330 | * cgroup to top_cgroup. | 2520 | * cgroup to top_cgroup. |
| @@ -2435,7 +2625,7 @@ static struct file_operations proc_cgroupstats_operations = { | |||
| 2435 | * A pointer to the shared css_set was automatically copied in | 2625 | * A pointer to the shared css_set was automatically copied in |
| 2436 | * fork.c by dup_task_struct(). However, we ignore that copy, since | 2626 | * fork.c by dup_task_struct(). However, we ignore that copy, since |
| 2437 | * it was not made under the protection of RCU or cgroup_mutex, so | 2627 | * it was not made under the protection of RCU or cgroup_mutex, so |
| 2438 | * might no longer be a valid cgroup pointer. attach_task() might | 2628 | * might no longer be a valid cgroup pointer. cgroup_attach_task() might |
| 2439 | * have already changed current->cgroups, allowing the previously | 2629 | * have already changed current->cgroups, allowing the previously |
| 2440 | * referenced cgroup group to be removed and freed. | 2630 | * referenced cgroup group to be removed and freed. |
| 2441 | * | 2631 | * |
| @@ -2514,8 +2704,8 @@ void cgroup_post_fork(struct task_struct *child) | |||
| 2514 | * attach us to a different cgroup, decrementing the count on | 2704 | * attach us to a different cgroup, decrementing the count on |
| 2515 | * the first cgroup that we never incremented. But in this case, | 2705 | * the first cgroup that we never incremented. But in this case, |
| 2516 | * top_cgroup isn't going away, and either task has PF_EXITING set, | 2706 | * top_cgroup isn't going away, and either task has PF_EXITING set, |
| 2517 | * which wards off any attach_task() attempts, or task is a failed | 2707 | * which wards off any cgroup_attach_task() attempts, or task is a failed |
| 2518 | * fork, never visible to attach_task. | 2708 | * fork, never visible to cgroup_attach_task. |
| 2519 | * | 2709 | * |
| 2520 | */ | 2710 | */ |
| 2521 | void cgroup_exit(struct task_struct *tsk, int run_callbacks) | 2711 | void cgroup_exit(struct task_struct *tsk, int run_callbacks) |
| @@ -2655,7 +2845,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) | |||
| 2655 | } | 2845 | } |
| 2656 | 2846 | ||
| 2657 | /* All seems fine. Finish by moving the task into the new cgroup */ | 2847 | /* All seems fine. Finish by moving the task into the new cgroup */ |
| 2658 | ret = attach_task(child, tsk); | 2848 | ret = cgroup_attach_task(child, tsk); |
| 2659 | mutex_unlock(&cgroup_mutex); | 2849 | mutex_unlock(&cgroup_mutex); |
| 2660 | 2850 | ||
| 2661 | out_release: | 2851 | out_release: |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index cfaf6419d817..67b2bfe27814 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
| @@ -38,7 +38,6 @@ | |||
| 38 | #include <linux/mount.h> | 38 | #include <linux/mount.h> |
| 39 | #include <linux/namei.h> | 39 | #include <linux/namei.h> |
| 40 | #include <linux/pagemap.h> | 40 | #include <linux/pagemap.h> |
| 41 | #include <linux/prio_heap.h> | ||
| 42 | #include <linux/proc_fs.h> | 41 | #include <linux/proc_fs.h> |
| 43 | #include <linux/rcupdate.h> | 42 | #include <linux/rcupdate.h> |
| 44 | #include <linux/sched.h> | 43 | #include <linux/sched.h> |
| @@ -56,6 +55,8 @@ | |||
| 56 | #include <asm/atomic.h> | 55 | #include <asm/atomic.h> |
| 57 | #include <linux/mutex.h> | 56 | #include <linux/mutex.h> |
| 58 | #include <linux/kfifo.h> | 57 | #include <linux/kfifo.h> |
| 58 | #include <linux/workqueue.h> | ||
| 59 | #include <linux/cgroup.h> | ||
| 59 | 60 | ||
| 60 | /* | 61 | /* |
| 61 | * Tracks how many cpusets are currently defined in system. | 62 | * Tracks how many cpusets are currently defined in system. |
| @@ -64,7 +65,7 @@ | |||
| 64 | */ | 65 | */ |
| 65 | int number_of_cpusets __read_mostly; | 66 | int number_of_cpusets __read_mostly; |
| 66 | 67 | ||
| 67 | /* Retrieve the cpuset from a cgroup */ | 68 | /* Forward declare cgroup structures */ |
| 68 | struct cgroup_subsys cpuset_subsys; | 69 | struct cgroup_subsys cpuset_subsys; |
| 69 | struct cpuset; | 70 | struct cpuset; |
| 70 | 71 | ||
| @@ -96,6 +97,9 @@ struct cpuset { | |||
| 96 | 97 | ||
| 97 | /* partition number for rebuild_sched_domains() */ | 98 | /* partition number for rebuild_sched_domains() */ |
| 98 | int pn; | 99 | int pn; |
| 100 | |||
| 101 | /* used for walking a cpuset hierarchy */ | ||
| 102 | struct list_head stack_list; | ||
| 99 | }; | 103 | }; |
| 100 | 104 | ||
| 101 | /* Retrieve the cpuset for a cgroup */ | 105 | /* Retrieve the cpuset for a cgroup */ |
| @@ -111,7 +115,10 @@ static inline struct cpuset *task_cs(struct task_struct *task) | |||
| 111 | return container_of(task_subsys_state(task, cpuset_subsys_id), | 115 | return container_of(task_subsys_state(task, cpuset_subsys_id), |
| 112 | struct cpuset, css); | 116 | struct cpuset, css); |
| 113 | } | 117 | } |
| 114 | 118 | struct cpuset_hotplug_scanner { | |
| 119 | struct cgroup_scanner scan; | ||
| 120 | struct cgroup *to; | ||
| 121 | }; | ||
| 115 | 122 | ||
| 116 | /* bits in struct cpuset flags field */ | 123 | /* bits in struct cpuset flags field */ |
| 117 | typedef enum { | 124 | typedef enum { |
| @@ -160,17 +167,17 @@ static inline int is_spread_slab(const struct cpuset *cs) | |||
| 160 | * number, and avoid having to lock and reload mems_allowed unless | 167 | * number, and avoid having to lock and reload mems_allowed unless |
| 161 | * the cpuset they're using changes generation. | 168 | * the cpuset they're using changes generation. |
| 162 | * | 169 | * |
| 163 | * A single, global generation is needed because attach_task() could | 170 | * A single, global generation is needed because cpuset_attach_task() could |
| 164 | * reattach a task to a different cpuset, which must not have its | 171 | * reattach a task to a different cpuset, which must not have its |
| 165 | * generation numbers aliased with those of that tasks previous cpuset. | 172 | * generation numbers aliased with those of that tasks previous cpuset. |
| 166 | * | 173 | * |
| 167 | * Generations are needed for mems_allowed because one task cannot | 174 | * Generations are needed for mems_allowed because one task cannot |
| 168 | * modify anothers memory placement. So we must enable every task, | 175 | * modify another's memory placement. So we must enable every task, |
| 169 | * on every visit to __alloc_pages(), to efficiently check whether | 176 | * on every visit to __alloc_pages(), to efficiently check whether |
| 170 | * its current->cpuset->mems_allowed has changed, requiring an update | 177 | * its current->cpuset->mems_allowed has changed, requiring an update |
| 171 | * of its current->mems_allowed. | 178 | * of its current->mems_allowed. |
| 172 | * | 179 | * |
| 173 | * Since cpuset_mems_generation is guarded by manage_mutex, | 180 | * Since writes to cpuset_mems_generation are guarded by the cgroup lock |
| 174 | * there is no need to mark it atomic. | 181 | * there is no need to mark it atomic. |
| 175 | */ | 182 | */ |
| 176 | static int cpuset_mems_generation; | 183 | static int cpuset_mems_generation; |
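A condensed sketch of the fast-path check this generation counter enables, as described in the comment above (abbreviated; the real slow path lives in cpuset_update_task_memory_state(), which is not shown in this hunk, and the field names cpuset_mems_generation/mems_generation are taken from the kernel of this era):

	int my_gen;

	rcu_read_lock();
	my_gen = task_cs(current)->mems_generation;
	rcu_read_unlock();

	if (my_gen != current->cpuset_mems_generation) {
		/* slow path only when the cpuset's generation has moved on:
		 * take callback_mutex and task_lock(), reload
		 * current->mems_allowed, then record the new generation so
		 * the next allocation takes the cheap path */
	}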
| @@ -182,17 +189,20 @@ static struct cpuset top_cpuset = { | |||
| 182 | }; | 189 | }; |
| 183 | 190 | ||
| 184 | /* | 191 | /* |
| 185 | * We have two global cpuset mutexes below. They can nest. | 192 | * There are two global mutexes guarding cpuset structures. The first |
| 186 | * It is ok to first take manage_mutex, then nest callback_mutex. We also | 193 | * is the main control groups cgroup_mutex, accessed via |
| 187 | * require taking task_lock() when dereferencing a tasks cpuset pointer. | 194 | * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific |
| 188 | * See "The task_lock() exception", at the end of this comment. | 195 | * callback_mutex, below. They can nest. It is ok to first take |
| 196 | * cgroup_mutex, then nest callback_mutex. We also require taking | ||
| 197 | * task_lock() when dereferencing a task's cpuset pointer. See "The | ||
| 198 | * task_lock() exception", at the end of this comment. | ||
| 189 | * | 199 | * |
| 190 | * A task must hold both mutexes to modify cpusets. If a task | 200 | * A task must hold both mutexes to modify cpusets. If a task |
| 191 | * holds manage_mutex, then it blocks others wanting that mutex, | 201 | * holds cgroup_mutex, then it blocks others wanting that mutex, |
| 192 | * ensuring that it is the only task able to also acquire callback_mutex | 202 | * ensuring that it is the only task able to also acquire callback_mutex |
| 193 | * and be able to modify cpusets. It can perform various checks on | 203 | * and be able to modify cpusets. It can perform various checks on |
| 194 | * the cpuset structure first, knowing nothing will change. It can | 204 | * the cpuset structure first, knowing nothing will change. It can |
| 195 | * also allocate memory while just holding manage_mutex. While it is | 205 | * also allocate memory while just holding cgroup_mutex. While it is |
| 196 | * performing these checks, various callback routines can briefly | 206 | * performing these checks, various callback routines can briefly |
| 197 | * acquire callback_mutex to query cpusets. Once it is ready to make | 207 | * acquire callback_mutex to query cpusets. Once it is ready to make |
| 198 | * the changes, it takes callback_mutex, blocking everyone else. | 208 | * the changes, it takes callback_mutex, blocking everyone else. |
| @@ -208,60 +218,16 @@ static struct cpuset top_cpuset = { | |||
| 208 | * The task_struct fields mems_allowed and mems_generation may only | 218 | * The task_struct fields mems_allowed and mems_generation may only |
| 209 | * be accessed in the context of that task, so require no locks. | 219 | * be accessed in the context of that task, so require no locks. |
| 210 | * | 220 | * |
| 211 | * Any task can increment and decrement the count field without lock. | ||
| 212 | * So in general, code holding manage_mutex or callback_mutex can't rely | ||
| 213 | * on the count field not changing. However, if the count goes to | ||
| 214 | * zero, then only attach_task(), which holds both mutexes, can | ||
| 215 | * increment it again. Because a count of zero means that no tasks | ||
| 216 | * are currently attached, therefore there is no way a task attached | ||
| 217 | * to that cpuset can fork (the other way to increment the count). | ||
| 218 | * So code holding manage_mutex or callback_mutex can safely assume that | ||
| 219 | * if the count is zero, it will stay zero. Similarly, if a task | ||
| 220 | * holds manage_mutex or callback_mutex on a cpuset with zero count, it | ||
| 221 | * knows that the cpuset won't be removed, as cpuset_rmdir() needs | ||
| 222 | * both of those mutexes. | ||
| 223 | * | ||
| 224 | * The cpuset_common_file_write handler for operations that modify | 221 | * The cpuset_common_file_write handler for operations that modify |
| 225 | * the cpuset hierarchy holds manage_mutex across the entire operation, | 222 | * the cpuset hierarchy holds cgroup_mutex across the entire operation, |
| 226 | * single threading all such cpuset modifications across the system. | 223 | * single threading all such cpuset modifications across the system. |
| 227 | * | 224 | * |
| 228 | * The cpuset_common_file_read() handlers only hold callback_mutex across | 225 | * The cpuset_common_file_read() handlers only hold callback_mutex across |
| 229 | * small pieces of code, such as when reading out possibly multi-word | 226 | * small pieces of code, such as when reading out possibly multi-word |
| 230 | * cpumasks and nodemasks. | 227 | * cpumasks and nodemasks. |
| 231 | * | 228 | * |
| 232 | * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't | 229 | * Accessing a task's cpuset should be done in accordance with the |
| 233 | * (usually) take either mutex. These are the two most performance | 230 | * guidelines for accessing subsystem state in kernel/cgroup.c |
| 234 | * critical pieces of code here. The exception occurs on cpuset_exit(), | ||
| 235 | * when a task in a notify_on_release cpuset exits. Then manage_mutex | ||
| 236 | * is taken, and if the cpuset count is zero, a usermode call made | ||
| 237 | * to /sbin/cpuset_release_agent with the name of the cpuset (path | ||
| 238 | * relative to the root of cpuset file system) as the argument. | ||
| 239 | * | ||
| 240 | * A cpuset can only be deleted if both its 'count' of using tasks | ||
| 241 | * is zero, and its list of 'children' cpusets is empty. Since all | ||
| 242 | * tasks in the system use _some_ cpuset, and since there is always at | ||
| 243 | * least one task in the system (init), therefore, top_cpuset | ||
| 244 | * always has either children cpusets and/or using tasks. So we don't | ||
| 245 | * need a special hack to ensure that top_cpuset cannot be deleted. | ||
| 246 | * | ||
| 247 | * The above "Tale of Two Semaphores" would be complete, but for: | ||
| 248 | * | ||
| 249 | * The task_lock() exception | ||
| 250 | * | ||
| 251 | * The need for this exception arises from the action of attach_task(), | ||
| 252 | * which overwrites one tasks cpuset pointer with another. It does | ||
| 253 | * so using both mutexes, however there are several performance | ||
| 254 | * critical places that need to reference task->cpuset without the | ||
| 255 | * expense of grabbing a system global mutex. Therefore except as | ||
| 256 | * noted below, when dereferencing or, as in attach_task(), modifying | ||
| 257 | * a tasks cpuset pointer we use task_lock(), which acts on a spinlock | ||
| 258 | * (task->alloc_lock) already in the task_struct routinely used for | ||
| 259 | * such matters. | ||
| 260 | * | ||
| 261 | * P.S. One more locking exception. RCU is used to guard the | ||
| 262 | * update of a tasks cpuset pointer by attach_task() and the | ||
| 263 | * access of task->cpuset->mems_generation via that pointer in | ||
| 264 | * the routine cpuset_update_task_memory_state(). | ||
| 265 | */ | 231 | */ |
| 266 | 232 | ||
| 267 | static DEFINE_MUTEX(callback_mutex); | 233 | static DEFINE_MUTEX(callback_mutex); |
| @@ -354,15 +320,14 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | |||
| 354 | * Do not call this routine if in_interrupt(). | 320 | * Do not call this routine if in_interrupt(). |
| 355 | * | 321 | * |
| 356 | * Call without callback_mutex or task_lock() held. May be | 322 | * Call without callback_mutex or task_lock() held. May be |
| 357 | * called with or without manage_mutex held. Thanks in part to | 323 | * called with or without cgroup_mutex held. Thanks in part to |
| 358 | * 'the_top_cpuset_hack', the tasks cpuset pointer will never | 324 | * 'the_top_cpuset_hack', the task's cpuset pointer will never |
| 359 | * be NULL. This routine also might acquire callback_mutex and | 325 | * be NULL. This routine also might acquire callback_mutex and |
| 360 | * current->mm->mmap_sem during call. | 326 | * current->mm->mmap_sem during call. |
| 361 | * | 327 | * |
| 362 | * Reading current->cpuset->mems_generation doesn't need task_lock | 328 | * Reading current->cpuset->mems_generation doesn't need task_lock |
| 363 | * to guard the current->cpuset derefence, because it is guarded | 329 | * to guard the current->cpuset derefence, because it is guarded |
| 364 | * from concurrent freeing of current->cpuset by attach_task(), | 330 | * from concurrent freeing of current->cpuset using RCU. |
| 365 | * using RCU. | ||
| 366 | * | 331 | * |
| 367 | * The rcu_dereference() is technically probably not needed, | 332 | * The rcu_dereference() is technically probably not needed, |
| 368 | * as I don't actually mind if I see a new cpuset pointer but | 333 | * as I don't actually mind if I see a new cpuset pointer but |
| @@ -424,7 +389,7 @@ void cpuset_update_task_memory_state(void) | |||
| 424 | * | 389 | * |
| 425 | * One cpuset is a subset of another if all its allowed CPUs and | 390 | * One cpuset is a subset of another if all its allowed CPUs and |
| 426 | * Memory Nodes are a subset of the other, and its exclusive flags | 391 | * Memory Nodes are a subset of the other, and its exclusive flags |
| 427 | * are only set if the other's are set. Call holding manage_mutex. | 392 | * are only set if the other's are set. Call holding cgroup_mutex. |
| 428 | */ | 393 | */ |
| 429 | 394 | ||
| 430 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) | 395 | static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) |
| @@ -442,7 +407,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) | |||
| 442 | * If we replaced the flag and mask values of the current cpuset | 407 | * If we replaced the flag and mask values of the current cpuset |
| 443 | * (cur) with those values in the trial cpuset (trial), would | 408 | * (cur) with those values in the trial cpuset (trial), would |
| 444 | * our various subset and exclusive rules still be valid? Presumes | 409 | * our various subset and exclusive rules still be valid? Presumes |
| 445 | * manage_mutex held. | 410 | * cgroup_mutex held. |
| 446 | * | 411 | * |
| 447 | * 'cur' is the address of an actual, in-use cpuset. Operations | 412 | * 'cur' is the address of an actual, in-use cpuset. Operations |
| 448 | * such as list traversal that depend on the actual address of the | 413 | * such as list traversal that depend on the actual address of the |
| @@ -476,7 +441,10 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
| 476 | if (!is_cpuset_subset(trial, par)) | 441 | if (!is_cpuset_subset(trial, par)) |
| 477 | return -EACCES; | 442 | return -EACCES; |
| 478 | 443 | ||
| 479 | /* If either I or some sibling (!= me) is exclusive, we can't overlap */ | 444 | /* |
| 445 | * If either I or some sibling (!= me) is exclusive, we can't | ||
| 446 | * overlap | ||
| 447 | */ | ||
| 480 | list_for_each_entry(cont, &par->css.cgroup->children, sibling) { | 448 | list_for_each_entry(cont, &par->css.cgroup->children, sibling) { |
| 481 | c = cgroup_cs(cont); | 449 | c = cgroup_cs(cont); |
| 482 | if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && | 450 | if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && |
| @@ -732,22 +700,50 @@ static inline int started_after(void *p1, void *p2) | |||
| 732 | return started_after_time(t1, &t2->start_time, t2); | 700 | return started_after_time(t1, &t2->start_time, t2); |
| 733 | } | 701 | } |
| 734 | 702 | ||
| 735 | /* | 703 | /** |
| 736 | * Call with manage_mutex held. May take callback_mutex during call. | 704 | * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's |
| 705 | * @tsk: task to test | ||
| 706 | * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner | ||
| 707 | * | ||
| 708 | * Call with cgroup_mutex held. May take callback_mutex during call. | ||
| 709 | * Called for each task in a cgroup by cgroup_scan_tasks(). | ||
| 710 | * Return nonzero if this task's cpus_allowed mask should be changed (in other | ||
| 711 | * words, if its mask is not equal to its cpuset's mask). | ||
| 712 | */ | ||
| 713 | int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) | ||
| 714 | { | ||
| 715 | return !cpus_equal(tsk->cpus_allowed, | ||
| 716 | (cgroup_cs(scan->cg))->cpus_allowed); | ||
| 717 | } | ||
| 718 | |||
| 719 | /** | ||
| 720 | * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's | ||
| 721 | * @tsk: task to test | ||
| 722 | * @scan: struct cgroup_scanner containing the cgroup of the task | ||
| 723 | * | ||
| 724 | * Called by cgroup_scan_tasks() for each task in a cgroup whose | ||
| 725 | * cpus_allowed mask needs to be changed. | ||
| 726 | * | ||
| 727 | * We don't need to re-check for the cgroup/cpuset membership, since we're | ||
| 728 | * holding cgroup_lock() at this point. | ||
| 737 | */ | 729 | */ |
| 730 | void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan) | ||
| 731 | { | ||
| 732 | set_cpus_allowed(tsk, (cgroup_cs(scan->cg))->cpus_allowed); | ||
| 733 | } | ||
| 738 | 734 | ||
| 735 | /** | ||
| 736 | * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it | ||
| 737 | * @cs: the cpuset to consider | ||
| 738 | * @buf: buffer of cpu numbers written to this cpuset | ||
| 739 | */ | ||
| 739 | static int update_cpumask(struct cpuset *cs, char *buf) | 740 | static int update_cpumask(struct cpuset *cs, char *buf) |
| 740 | { | 741 | { |
| 741 | struct cpuset trialcs; | 742 | struct cpuset trialcs; |
| 742 | int retval, i; | 743 | struct cgroup_scanner scan; |
| 743 | int is_load_balanced; | ||
| 744 | struct cgroup_iter it; | ||
| 745 | struct cgroup *cgrp = cs->css.cgroup; | ||
| 746 | struct task_struct *p, *dropped; | ||
| 747 | /* Never dereference latest_task, since it's not refcounted */ | ||
| 748 | struct task_struct *latest_task = NULL; | ||
| 749 | struct ptr_heap heap; | 744 | struct ptr_heap heap; |
| 750 | struct timespec latest_time = { 0, 0 }; | 745 | int retval; |
| 746 | int is_load_balanced; | ||
| 751 | 747 | ||
| 752 | /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ | 748 | /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ |
| 753 | if (cs == &top_cpuset) | 749 | if (cs == &top_cpuset) |
| @@ -756,7 +752,7 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
| 756 | trialcs = *cs; | 752 | trialcs = *cs; |
| 757 | 753 | ||
| 758 | /* | 754 | /* |
| 759 | * An empty cpus_allowed is ok iff there are no tasks in the cpuset. | 755 | * An empty cpus_allowed is ok only if the cpuset has no tasks. |
| 760 | * Since cpulist_parse() fails on an empty mask, we special case | 756 | * Since cpulist_parse() fails on an empty mask, we special case |
| 761 | * that parsing. The validate_change() call ensures that cpusets | 757 | * that parsing. The validate_change() call ensures that cpusets |
| 762 | * with tasks have cpus. | 758 | * with tasks have cpus. |
| @@ -777,6 +773,7 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
| 777 | /* Nothing to do if the cpus didn't change */ | 773 | /* Nothing to do if the cpus didn't change */ |
| 778 | if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) | 774 | if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) |
| 779 | return 0; | 775 | return 0; |
| 776 | |||
| 780 | retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after); | 777 | retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after); |
| 781 | if (retval) | 778 | if (retval) |
| 782 | return retval; | 779 | return retval; |
| @@ -787,62 +784,19 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
| 787 | cs->cpus_allowed = trialcs.cpus_allowed; | 784 | cs->cpus_allowed = trialcs.cpus_allowed; |
| 788 | mutex_unlock(&callback_mutex); | 785 | mutex_unlock(&callback_mutex); |
| 789 | 786 | ||
| 790 | again: | ||
| 791 | /* | 787 | /* |
| 792 | * Scan tasks in the cpuset, and update the cpumasks of any | 788 | * Scan tasks in the cpuset, and update the cpumasks of any |
| 793 | * that need an update. Since we can't call set_cpus_allowed() | 789 | * that need an update. |
| 794 | * while holding tasklist_lock, gather tasks to be processed | ||
| 795 | * in a heap structure. If the statically-sized heap fills up, | ||
| 796 | * overflow tasks that started later, and in future iterations | ||
| 797 | * only consider tasks that started after the latest task in | ||
| 798 | * the previous pass. This guarantees forward progress and | ||
| 799 | * that we don't miss any tasks | ||
| 800 | */ | 790 | */ |
| 801 | heap.size = 0; | 791 | scan.cg = cs->css.cgroup; |
| 802 | cgroup_iter_start(cgrp, &it); | 792 | scan.test_task = cpuset_test_cpumask; |
| 803 | while ((p = cgroup_iter_next(cgrp, &it))) { | 793 | scan.process_task = cpuset_change_cpumask; |
| 804 | /* Only affect tasks that don't have the right cpus_allowed */ | 794 | scan.heap = &heap; |
| 805 | if (cpus_equal(p->cpus_allowed, cs->cpus_allowed)) | 795 | cgroup_scan_tasks(&scan); |
| 806 | continue; | ||
| 807 | /* | ||
| 808 | * Only process tasks that started after the last task | ||
| 809 | * we processed | ||
| 810 | */ | ||
| 811 | if (!started_after_time(p, &latest_time, latest_task)) | ||
| 812 | continue; | ||
| 813 | dropped = heap_insert(&heap, p); | ||
| 814 | if (dropped == NULL) { | ||
| 815 | get_task_struct(p); | ||
| 816 | } else if (dropped != p) { | ||
| 817 | get_task_struct(p); | ||
| 818 | put_task_struct(dropped); | ||
| 819 | } | ||
| 820 | } | ||
| 821 | cgroup_iter_end(cgrp, &it); | ||
| 822 | if (heap.size) { | ||
| 823 | for (i = 0; i < heap.size; i++) { | ||
| 824 | struct task_struct *p = heap.ptrs[i]; | ||
| 825 | if (i == 0) { | ||
| 826 | latest_time = p->start_time; | ||
| 827 | latest_task = p; | ||
| 828 | } | ||
| 829 | set_cpus_allowed(p, cs->cpus_allowed); | ||
| 830 | put_task_struct(p); | ||
| 831 | } | ||
| 832 | /* | ||
| 833 | * If we had to process any tasks at all, scan again | ||
| 834 | * in case some of them were in the middle of forking | ||
| 835 | * children that didn't notice the new cpumask | ||
| 836 | * restriction. Not the most efficient way to do it, | ||
| 837 | * but it avoids having to take callback_mutex in the | ||
| 838 | * fork path | ||
| 839 | */ | ||
| 840 | goto again; | ||
| 841 | } | ||
| 842 | heap_free(&heap); | 796 | heap_free(&heap); |
| 797 | |||
| 843 | if (is_load_balanced) | 798 | if (is_load_balanced) |
| 844 | rebuild_sched_domains(); | 799 | rebuild_sched_domains(); |
| 845 | |||
| 846 | return 0; | 800 | return 0; |
| 847 | } | 801 | } |
| 848 | 802 | ||
| @@ -854,11 +808,11 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
| 854 | * Temporarily set the task's mems_allowed to target nodes of migration, | 808 | * Temporarily set the task's mems_allowed to target nodes of migration, |
| 855 | * so that the migration code can allocate pages on these nodes. | 809 | * so that the migration code can allocate pages on these nodes. |
| 856 | * | 810 | * |
| 857 | * Call holding manage_mutex, so our current->cpuset won't change | 811 | * Call holding cgroup_mutex, so current's cpuset won't change |
| 858 | * during this call, as manage_mutex holds off any attach_task() | 812 | * during this call, as manage_mutex holds off any cpuset_attach() |
| 859 | * calls. Therefore we don't need to take task_lock around the | 813 | * calls. Therefore we don't need to take task_lock around the |
| 860 | * call to guarantee_online_mems(), as we know no one is changing | 814 | * call to guarantee_online_mems(), as we know no one is changing |
| 861 | * our tasks cpuset. | 815 | * our task's cpuset. |
| 862 | * | 816 | * |
| 863 | * Hold callback_mutex around the two modifications of our tasks | 817 | * Hold callback_mutex around the two modifications of our tasks |
| 864 | * mems_allowed to synchronize with cpuset_mems_allowed(). | 818 | * mems_allowed to synchronize with cpuset_mems_allowed(). |
| @@ -903,7 +857,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, | |||
| 903 | * the cpuset is marked 'memory_migrate', migrate the tasks | 857 | * the cpuset is marked 'memory_migrate', migrate the tasks |
| 904 | * pages to the new memory. | 858 | * pages to the new memory. |
| 905 | * | 859 | * |
| 906 | * Call with manage_mutex held. May take callback_mutex during call. | 860 | * Call with cgroup_mutex held. May take callback_mutex during call. |
| 907 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, | 861 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, |
| 908 | * lock each such task's mm->mmap_sem, scan its vma's and rebind | 862 | * lock each such task's mm->mmap_sem, scan its vma's and rebind |
| 909 | * their mempolicies to the cpuset's new mems_allowed. | 863 | * their mempolicies to the cpuset's new mems_allowed. |
| @@ -1016,7 +970,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
| 1016 | * tasklist_lock. Forks can happen again now - the mpol_copy() | 970 | * tasklist_lock. Forks can happen again now - the mpol_copy() |
| 1017 | * cpuset_being_rebound check will catch such forks, and rebind | 971 | * cpuset_being_rebound check will catch such forks, and rebind |
| 1018 | * their vma mempolicies too. Because we still hold the global | 972 | * their vma mempolicies too. Because we still hold the global |
| 1019 | * cpuset manage_mutex, we know that no other rebind effort will | 973 | * cgroup_mutex, we know that no other rebind effort will |
| 1020 | * be contending for the global variable cpuset_being_rebound. | 974 | * be contending for the global variable cpuset_being_rebound. |
| 1021 | * It's ok if we rebind the same mm twice; mpol_rebind_mm() | 975 | * It's ok if we rebind the same mm twice; mpol_rebind_mm() |
| 1022 | * is idempotent. Also migrate pages in each mm to new nodes. | 976 | * is idempotent. Also migrate pages in each mm to new nodes. |
| @@ -1031,7 +985,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
| 1031 | mmput(mm); | 985 | mmput(mm); |
| 1032 | } | 986 | } |
| 1033 | 987 | ||
| 1034 | /* We're done rebinding vma's to this cpusets new mems_allowed. */ | 988 | /* We're done rebinding vmas to this cpuset's new mems_allowed. */ |
| 1035 | kfree(mmarray); | 989 | kfree(mmarray); |
| 1036 | cpuset_being_rebound = NULL; | 990 | cpuset_being_rebound = NULL; |
| 1037 | retval = 0; | 991 | retval = 0; |
| @@ -1045,7 +999,7 @@ int current_cpuset_is_being_rebound(void) | |||
| 1045 | } | 999 | } |
| 1046 | 1000 | ||
| 1047 | /* | 1001 | /* |
| 1048 | * Call with manage_mutex held. | 1002 | * Call with cgroup_mutex held. |
| 1049 | */ | 1003 | */ |
| 1050 | 1004 | ||
| 1051 | static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) | 1005 | static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) |
| @@ -1066,7 +1020,7 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) | |||
| 1066 | * cs: the cpuset to update | 1020 | * cs: the cpuset to update |
| 1067 | * buf: the buffer where we read the 0 or 1 | 1021 | * buf: the buffer where we read the 0 or 1 |
| 1068 | * | 1022 | * |
| 1069 | * Call with manage_mutex held. | 1023 | * Call with cgroup_mutex held. |
| 1070 | */ | 1024 | */ |
| 1071 | 1025 | ||
| 1072 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | 1026 | static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) |
| @@ -1200,6 +1154,7 @@ static int fmeter_getrate(struct fmeter *fmp) | |||
| 1200 | return val; | 1154 | return val; |
| 1201 | } | 1155 | } |
| 1202 | 1156 | ||
| 1157 | /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ | ||
| 1203 | static int cpuset_can_attach(struct cgroup_subsys *ss, | 1158 | static int cpuset_can_attach(struct cgroup_subsys *ss, |
| 1204 | struct cgroup *cont, struct task_struct *tsk) | 1159 | struct cgroup *cont, struct task_struct *tsk) |
| 1205 | { | 1160 | { |
| @@ -1547,7 +1502,8 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) | |||
| 1547 | * If this becomes a problem for some users who wish to | 1502 | * If this becomes a problem for some users who wish to |
| 1548 | * allow that scenario, then cpuset_post_clone() could be | 1503 | * allow that scenario, then cpuset_post_clone() could be |
| 1549 | * changed to grant parent->cpus_allowed-sibling_cpus_exclusive | 1504 | * changed to grant parent->cpus_allowed-sibling_cpus_exclusive |
| 1550 | * (and likewise for mems) to the new cgroup. | 1505 | * (and likewise for mems) to the new cgroup. Called with cgroup_mutex |
| 1506 | * held. | ||
| 1551 | */ | 1507 | */ |
| 1552 | static void cpuset_post_clone(struct cgroup_subsys *ss, | 1508 | static void cpuset_post_clone(struct cgroup_subsys *ss, |
| 1553 | struct cgroup *cgroup) | 1509 | struct cgroup *cgroup) |
| @@ -1571,11 +1527,8 @@ static void cpuset_post_clone(struct cgroup_subsys *ss, | |||
| 1571 | 1527 | ||
| 1572 | /* | 1528 | /* |
| 1573 | * cpuset_create - create a cpuset | 1529 | * cpuset_create - create a cpuset |
| 1574 | * parent: cpuset that will be parent of the new cpuset. | 1530 | * ss: cpuset cgroup subsystem |
| 1575 | * name: name of the new cpuset. Will be strcpy'ed. | 1531 | * cont: control group that the new cpuset will be part of |
| 1576 | * mode: mode to set on new inode | ||
| 1577 | * | ||
| 1578 | * Must be called with the mutex on the parent inode held | ||
| 1579 | */ | 1532 | */ |
| 1580 | 1533 | ||
| 1581 | static struct cgroup_subsys_state *cpuset_create( | 1534 | static struct cgroup_subsys_state *cpuset_create( |
| @@ -1687,53 +1640,140 @@ int __init cpuset_init(void) | |||
| 1687 | return 0; | 1640 | return 0; |
| 1688 | } | 1641 | } |
| 1689 | 1642 | ||
| 1643 | /** | ||
| 1644 | * cpuset_do_move_task - move a given task to another cpuset | ||
| 1645 | * @tsk: pointer to the task_struct of the task to move | ||
| 1646 | * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner | ||
| 1647 | * | ||
| 1648 | * Called by cgroup_scan_tasks() for each task in a cgroup. | ||
| 1649 | * Return nonzero to stop the walk through the tasks. | ||
| 1650 | */ | ||
| 1651 | void cpuset_do_move_task(struct task_struct *tsk, struct cgroup_scanner *scan) | ||
| 1652 | { | ||
| 1653 | struct cpuset_hotplug_scanner *chsp; | ||
| 1654 | |||
| 1655 | chsp = container_of(scan, struct cpuset_hotplug_scanner, scan); | ||
| 1656 | cgroup_attach_task(chsp->to, tsk); | ||
| 1657 | } | ||
| 1658 | |||
| 1659 | /** | ||
| 1660 | * move_member_tasks_to_cpuset - move tasks from one cpuset to another | ||
| 1661 | * @from: cpuset in which the tasks currently reside | ||
| 1662 | * @to: cpuset to which the tasks will be moved | ||
| 1663 | * | ||
| 1664 | * Called with cgroup_mutex held | ||
| 1665 | * callback_mutex must not be held, as cpuset_attach() will take it. | ||
| 1666 | * | ||
| 1667 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, | ||
| 1668 | * calling callback functions for each. | ||
| 1669 | */ | ||
| 1670 | static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) | ||
| 1671 | { | ||
| 1672 | struct cpuset_hotplug_scanner scan; | ||
| 1673 | |||
| 1674 | scan.scan.cg = from->css.cgroup; | ||
| 1675 | scan.scan.test_task = NULL; /* select all tasks in cgroup */ | ||
| 1676 | scan.scan.process_task = cpuset_do_move_task; | ||
| 1677 | scan.scan.heap = NULL; | ||
| 1678 | scan.to = to->css.cgroup; | ||
| 1679 | |||
| 1680 | if (cgroup_scan_tasks((struct cgroup_scanner *)&scan)) | ||
| 1681 | printk(KERN_ERR "move_member_tasks_to_cpuset: " | ||
| 1682 | "cgroup_scan_tasks failed\n"); | ||
| 1683 | } | ||
| 1684 | |||
| 1690 | /* | 1685 | /* |
| 1691 | * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs | 1686 | * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs |
| 1692 | * or memory nodes, we need to walk over the cpuset hierarchy, | 1687 | * or memory nodes, we need to walk over the cpuset hierarchy, |
| 1693 | * removing that CPU or node from all cpusets. If this removes the | 1688 | * removing that CPU or node from all cpusets. If this removes the |
| 1694 | * last CPU or node from a cpuset, then the guarantee_online_cpus() | 1689 | * last CPU or node from a cpuset, then move the tasks in the empty |
| 1695 | * or guarantee_online_mems() code will use that emptied cpusets | 1690 | * cpuset to its next-highest non-empty parent. |
| 1696 | * parent online CPUs or nodes. Cpusets that were already empty of | ||
| 1697 | * CPUs or nodes are left empty. | ||
| 1698 | * | 1691 | * |
| 1699 | * This routine is intentionally inefficient in a couple of regards. | 1692 | * Called with cgroup_mutex held |
| 1700 | * It will check all cpusets in a subtree even if the top cpuset of | 1693 | * callback_mutex must not be held, as cpuset_attach() will take it. |
| 1701 | * the subtree has no offline CPUs or nodes. It checks both CPUs and | 1694 | */ |
| 1702 | * nodes, even though the caller could have been coded to know that | 1695 | static void remove_tasks_in_empty_cpuset(struct cpuset *cs) |
| 1703 | * only one of CPUs or nodes needed to be checked on a given call. | 1696 | { |
| 1704 | * This was done to minimize text size rather than cpu cycles. | 1697 | struct cpuset *parent; |
| 1698 | |||
| 1699 | /* | ||
| 1700 | * The cgroup's css_sets list is in use if there are tasks | ||
| 1701 | * in the cpuset; the list is empty if there are none; | ||
| 1702 | * the cs->css.refcnt seems always 0. | ||
| 1703 | */ | ||
| 1704 | if (list_empty(&cs->css.cgroup->css_sets)) | ||
| 1705 | return; | ||
| 1706 | |||
| 1707 | /* | ||
| 1708 | * Find its next-highest non-empty parent, (top cpuset | ||
| 1709 | * has online cpus, so can't be empty). | ||
| 1710 | */ | ||
| 1711 | parent = cs->parent; | ||
| 1712 | while (cpus_empty(parent->cpus_allowed) || | ||
| 1713 | nodes_empty(parent->mems_allowed)) | ||
| 1714 | parent = parent->parent; | ||
| 1715 | |||
| 1716 | move_member_tasks_to_cpuset(cs, parent); | ||
| 1717 | } | ||
| 1718 | |||
| 1719 | /* | ||
| 1720 | * Walk the specified cpuset subtree and look for empty cpusets. | ||
| 1721 | * The tasks of such cpuset must be moved to a parent cpuset. | ||
| 1722 | * | ||
| 1723 | * Called with cgroup_mutex held. We take callback_mutex to modify | ||
| 1724 | * cpus_allowed and mems_allowed. | ||
| 1705 | * | 1725 | * |
| 1706 | * Call with both manage_mutex and callback_mutex held. | 1726 | * This walk processes the tree from top to bottom, completing one layer |
| 1727 | * before dropping down to the next. It always processes a node before | ||
| 1728 | * any of its children. | ||
| 1707 | * | 1729 | * |
| 1708 | * Recursive, on depth of cpuset subtree. | 1730 | * For now, since we lack memory hot unplug, we'll never see a cpuset |
| 1731 | * that has tasks along with an empty 'mems'. But if we did see such | ||
| 1732 | * a cpuset, we'd handle it just like we do if its 'cpus' was empty. | ||
| 1709 | */ | 1733 | */ |
| 1710 | 1734 | static void scan_for_empty_cpusets(const struct cpuset *root) | |
| 1711 | static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur) | ||
| 1712 | { | 1735 | { |
| 1736 | struct cpuset *cp; /* scans cpusets being updated */ | ||
| 1737 | struct cpuset *child; /* scans child cpusets of cp */ | ||
| 1738 | struct list_head queue; | ||
| 1713 | struct cgroup *cont; | 1739 | struct cgroup *cont; |
| 1714 | struct cpuset *c; | ||
| 1715 | 1740 | ||
| 1716 | /* Each of our child cpusets mems must be online */ | 1741 | INIT_LIST_HEAD(&queue); |
| 1717 | list_for_each_entry(cont, &cur->css.cgroup->children, sibling) { | 1742 | |
| 1718 | c = cgroup_cs(cont); | 1743 | list_add_tail((struct list_head *)&root->stack_list, &queue); |
| 1719 | guarantee_online_cpus_mems_in_subtree(c); | 1744 | |
| 1720 | if (!cpus_empty(c->cpus_allowed)) | 1745 | while (!list_empty(&queue)) { |
| 1721 | guarantee_online_cpus(c, &c->cpus_allowed); | 1746 | cp = container_of(queue.next, struct cpuset, stack_list); |
| 1722 | if (!nodes_empty(c->mems_allowed)) | 1747 | list_del(queue.next); |
| 1723 | guarantee_online_mems(c, &c->mems_allowed); | 1748 | list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { |
| 1749 | child = cgroup_cs(cont); | ||
| 1750 | list_add_tail(&child->stack_list, &queue); | ||
| 1751 | } | ||
| 1752 | cont = cp->css.cgroup; | ||
| 1753 | |||
| 1754 | /* Continue past cpusets with all cpus, mems online */ | ||
| 1755 | if (cpus_subset(cp->cpus_allowed, cpu_online_map) && | ||
| 1756 | nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) | ||
| 1757 | continue; | ||
| 1758 | |||
| 1759 | /* Remove offline cpus and mems from this cpuset. */ | ||
| 1760 | mutex_lock(&callback_mutex); | ||
| 1761 | cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map); | ||
| 1762 | nodes_and(cp->mems_allowed, cp->mems_allowed, | ||
| 1763 | node_states[N_HIGH_MEMORY]); | ||
| 1764 | mutex_unlock(&callback_mutex); | ||
| 1765 | |||
| 1766 | /* Move tasks from the empty cpuset to a parent */ | ||
| 1767 | if (cpus_empty(cp->cpus_allowed) || | ||
| 1768 | nodes_empty(cp->mems_allowed)) | ||
| 1769 | remove_tasks_in_empty_cpuset(cp); | ||
| 1724 | } | 1770 | } |
| 1725 | } | 1771 | } |
| 1726 | 1772 | ||
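The function above is a plain breadth-first walk: each cpuset is threaded onto a queue through its embedded stack_list field, its children are appended to the tail, and the node itself is then trimmed. A rough, self-contained userspace sketch of the same top-down pattern (all names below are made up for illustration; none of them are kernel APIs):

    #include <stdio.h>

    /* Hypothetical tree node; "link" plays the role of cpuset->stack_list. */
    struct node {
        const char *name;
        struct node **children;   /* NULL-terminated array, may be NULL */
        struct node *link;        /* next pointer while sitting on the queue */
    };

    /* Visit every node, each parent strictly before any of its children. */
    static void walk_top_down(struct node *root)
    {
        struct node *head = root, *tail = root;

        root->link = NULL;
        while (head) {
            struct node *cur = head;

            head = cur->link;                 /* pop the front of the queue */
            for (struct node **c = cur->children; c && *c; c++) {
                (*c)->link = NULL;            /* append each child at the back */
                if (head)
                    tail->link = *c;
                else
                    head = *c;
                tail = *c;
            }
            printf("visiting %s\n", cur->name);   /* e.g. trim offline cpus/mems */
        }
    }

    int main(void)
    {
        struct node b = { "B", NULL, NULL };
        struct node c = { "C", NULL, NULL };
        struct node *kids[] = { &b, &c, NULL };
        struct node a = { "A", kids, NULL };

        walk_top_down(&a);    /* prints A, then B, then C */
        return 0;
    }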
| 1727 | /* | 1773 | /* |
| 1728 | * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track | 1774 | * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track |
| 1729 | * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to | 1775 | * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to |
| 1730 | * track what's online after any CPU or memory node hotplug or unplug | 1776 | * track what's online after any CPU or memory node hotplug or unplug event. |
| 1731 | * event. | ||
| 1732 | * | ||
| 1733 | * To ensure that we don't remove a CPU or node from the top cpuset | ||
| 1734 | * that is currently in use by a child cpuset (which would violate | ||
| 1735 | * the rule that cpusets must be subsets of their parent), we first | ||
| 1736 | * call the recursive routine guarantee_online_cpus_mems_in_subtree(). | ||
| 1737 | * | 1777 | * |
| 1738 | * Since there are two callers of this routine, one for CPU hotplug | 1778 | * Since there are two callers of this routine, one for CPU hotplug |
| 1739 | * events and one for memory node hotplug events, we could have coded | 1779 | * events and one for memory node hotplug events, we could have coded |
| @@ -1744,13 +1784,11 @@ static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur) | |||
| 1744 | static void common_cpu_mem_hotplug_unplug(void) | 1784 | static void common_cpu_mem_hotplug_unplug(void) |
| 1745 | { | 1785 | { |
| 1746 | cgroup_lock(); | 1786 | cgroup_lock(); |
| 1747 | mutex_lock(&callback_mutex); | ||
| 1748 | 1787 | ||
| 1749 | guarantee_online_cpus_mems_in_subtree(&top_cpuset); | ||
| 1750 | top_cpuset.cpus_allowed = cpu_online_map; | 1788 | top_cpuset.cpus_allowed = cpu_online_map; |
| 1751 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | 1789 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; |
| 1790 | scan_for_empty_cpusets(&top_cpuset); | ||
| 1752 | 1791 | ||
| 1753 | mutex_unlock(&callback_mutex); | ||
| 1754 | cgroup_unlock(); | 1792 | cgroup_unlock(); |
| 1755 | } | 1793 | } |
| 1756 | 1794 | ||
| @@ -1826,7 +1864,7 @@ cpumask_t cpuset_cpus_allowed(struct task_struct *tsk) | |||
| 1826 | 1864 | ||
| 1827 | /** | 1865 | /** |
| 1828 | * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. | 1866 | * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. |
| 1829 | * Must be called with callback_mutex held. | 1867 | * Must be called with callback_mutex held. |
| 1830 | **/ | 1868 | **/ |
| 1831 | cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk) | 1869 | cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk) |
| 1832 | { | 1870 | { |
| @@ -2163,10 +2201,8 @@ void __cpuset_memory_pressure_bump(void) | |||
| 2163 | * - Used for /proc/<pid>/cpuset. | 2201 | * - Used for /proc/<pid>/cpuset. |
| 2164 | * - No need to task_lock(tsk) on this tsk->cpuset reference, as it | 2202 | * - No need to task_lock(tsk) on this tsk->cpuset reference, as it |
| 2165 | * doesn't really matter if tsk->cpuset changes after we read it, | 2203 | * doesn't really matter if tsk->cpuset changes after we read it, |
| 2166 | * and we take manage_mutex, keeping attach_task() from changing it | 2204 | * and we take cgroup_mutex, keeping cpuset_attach() from changing it |
| 2167 | * anyway. No need to check that tsk->cpuset != NULL, thanks to | 2205 | * anyway. |
| 2168 | * the_top_cpuset_hack in cpuset_exit(), which sets an exiting tasks | ||
| 2169 | * cpuset to top_cpuset. | ||
| 2170 | */ | 2206 | */ |
| 2171 | static int proc_cpuset_show(struct seq_file *m, void *unused_v) | 2207 | static int proc_cpuset_show(struct seq_file *m, void *unused_v) |
| 2172 | { | 2208 | { |
diff --git a/kernel/fork.c b/kernel/fork.c index 3995297567a9..b2ef8e4fad70 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -40,6 +40,7 @@ | |||
| 40 | #include <linux/ptrace.h> | 40 | #include <linux/ptrace.h> |
| 41 | #include <linux/mount.h> | 41 | #include <linux/mount.h> |
| 42 | #include <linux/audit.h> | 42 | #include <linux/audit.h> |
| 43 | #include <linux/memcontrol.h> | ||
| 43 | #include <linux/profile.h> | 44 | #include <linux/profile.h> |
| 44 | #include <linux/rmap.h> | 45 | #include <linux/rmap.h> |
| 45 | #include <linux/acct.h> | 46 | #include <linux/acct.h> |
| @@ -340,7 +341,7 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); | |||
| 340 | 341 | ||
| 341 | #include <linux/init_task.h> | 342 | #include <linux/init_task.h> |
| 342 | 343 | ||
| 343 | static struct mm_struct * mm_init(struct mm_struct * mm) | 344 | static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) |
| 344 | { | 345 | { |
| 345 | atomic_set(&mm->mm_users, 1); | 346 | atomic_set(&mm->mm_users, 1); |
| 346 | atomic_set(&mm->mm_count, 1); | 347 | atomic_set(&mm->mm_count, 1); |
| @@ -357,11 +358,14 @@ static struct mm_struct * mm_init(struct mm_struct * mm) | |||
| 357 | mm->ioctx_list = NULL; | 358 | mm->ioctx_list = NULL; |
| 358 | mm->free_area_cache = TASK_UNMAPPED_BASE; | 359 | mm->free_area_cache = TASK_UNMAPPED_BASE; |
| 359 | mm->cached_hole_size = ~0UL; | 360 | mm->cached_hole_size = ~0UL; |
| 361 | mm_init_cgroup(mm, p); | ||
| 360 | 362 | ||
| 361 | if (likely(!mm_alloc_pgd(mm))) { | 363 | if (likely(!mm_alloc_pgd(mm))) { |
| 362 | mm->def_flags = 0; | 364 | mm->def_flags = 0; |
| 363 | return mm; | 365 | return mm; |
| 364 | } | 366 | } |
| 367 | |||
| 368 | mm_free_cgroup(mm); | ||
| 365 | free_mm(mm); | 369 | free_mm(mm); |
| 366 | return NULL; | 370 | return NULL; |
| 367 | } | 371 | } |
| @@ -376,7 +380,7 @@ struct mm_struct * mm_alloc(void) | |||
| 376 | mm = allocate_mm(); | 380 | mm = allocate_mm(); |
| 377 | if (mm) { | 381 | if (mm) { |
| 378 | memset(mm, 0, sizeof(*mm)); | 382 | memset(mm, 0, sizeof(*mm)); |
| 379 | mm = mm_init(mm); | 383 | mm = mm_init(mm, current); |
| 380 | } | 384 | } |
| 381 | return mm; | 385 | return mm; |
| 382 | } | 386 | } |
| @@ -390,6 +394,7 @@ void fastcall __mmdrop(struct mm_struct *mm) | |||
| 390 | { | 394 | { |
| 391 | BUG_ON(mm == &init_mm); | 395 | BUG_ON(mm == &init_mm); |
| 392 | mm_free_pgd(mm); | 396 | mm_free_pgd(mm); |
| 397 | mm_free_cgroup(mm); | ||
| 393 | destroy_context(mm); | 398 | destroy_context(mm); |
| 394 | free_mm(mm); | 399 | free_mm(mm); |
| 395 | } | 400 | } |
| @@ -511,7 +516,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) | |||
| 511 | mm->token_priority = 0; | 516 | mm->token_priority = 0; |
| 512 | mm->last_interval = 0; | 517 | mm->last_interval = 0; |
| 513 | 518 | ||
| 514 | if (!mm_init(mm)) | 519 | if (!mm_init(mm, tsk)) |
| 515 | goto fail_nomem; | 520 | goto fail_nomem; |
| 516 | 521 | ||
| 517 | if (init_new_context(tsk, mm)) | 522 | if (init_new_context(tsk, mm)) |
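With this change every mm_struct gets tied to its owner's memory cgroup in mm_init() and released again on the failure path and in __mmdrop(). For kernels built without the memory controller, <linux/memcontrol.h> presumably supplies no-op inline stubs so these call sites still compile; an assumed sketch of that header shape (the exact config symbol is not shown in this patch):

    /* Assumed shape of the <linux/memcontrol.h> stubs -- illustration only. */
    #ifdef CONFIG_CGROUP_MEM_CONT
    extern void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p);
    extern void mm_free_cgroup(struct mm_struct *mm);
    #else
    static inline void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p)
    {
    }
    static inline void mm_free_cgroup(struct mm_struct *mm)
    {
    }
    #endif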
diff --git a/kernel/kexec.c b/kernel/kexec.c index 9a26eec9eb04..06a0e2775651 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
| @@ -1361,8 +1361,8 @@ unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void) | |||
| 1361 | 1361 | ||
| 1362 | static int __init crash_save_vmcoreinfo_init(void) | 1362 | static int __init crash_save_vmcoreinfo_init(void) |
| 1363 | { | 1363 | { |
| 1364 | vmcoreinfo_append_str("OSRELEASE=%s\n", init_uts_ns.name.release); | 1364 | VMCOREINFO_OSRELEASE(init_uts_ns.name.release); |
| 1365 | vmcoreinfo_append_str("PAGESIZE=%ld\n", PAGE_SIZE); | 1365 | VMCOREINFO_PAGESIZE(PAGE_SIZE); |
| 1366 | 1366 | ||
| 1367 | VMCOREINFO_SYMBOL(init_uts_ns); | 1367 | VMCOREINFO_SYMBOL(init_uts_ns); |
| 1368 | VMCOREINFO_SYMBOL(node_online_map); | 1368 | VMCOREINFO_SYMBOL(node_online_map); |
| @@ -1376,15 +1376,15 @@ static int __init crash_save_vmcoreinfo_init(void) | |||
| 1376 | #ifdef CONFIG_SPARSEMEM | 1376 | #ifdef CONFIG_SPARSEMEM |
| 1377 | VMCOREINFO_SYMBOL(mem_section); | 1377 | VMCOREINFO_SYMBOL(mem_section); |
| 1378 | VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); | 1378 | VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); |
| 1379 | VMCOREINFO_SIZE(mem_section); | 1379 | VMCOREINFO_STRUCT_SIZE(mem_section); |
| 1380 | VMCOREINFO_OFFSET(mem_section, section_mem_map); | 1380 | VMCOREINFO_OFFSET(mem_section, section_mem_map); |
| 1381 | #endif | 1381 | #endif |
| 1382 | VMCOREINFO_SIZE(page); | 1382 | VMCOREINFO_STRUCT_SIZE(page); |
| 1383 | VMCOREINFO_SIZE(pglist_data); | 1383 | VMCOREINFO_STRUCT_SIZE(pglist_data); |
| 1384 | VMCOREINFO_SIZE(zone); | 1384 | VMCOREINFO_STRUCT_SIZE(zone); |
| 1385 | VMCOREINFO_SIZE(free_area); | 1385 | VMCOREINFO_STRUCT_SIZE(free_area); |
| 1386 | VMCOREINFO_SIZE(list_head); | 1386 | VMCOREINFO_STRUCT_SIZE(list_head); |
| 1387 | VMCOREINFO_TYPEDEF_SIZE(nodemask_t); | 1387 | VMCOREINFO_SIZE(nodemask_t); |
| 1388 | VMCOREINFO_OFFSET(page, flags); | 1388 | VMCOREINFO_OFFSET(page, flags); |
| 1389 | VMCOREINFO_OFFSET(page, _count); | 1389 | VMCOREINFO_OFFSET(page, _count); |
| 1390 | VMCOREINFO_OFFSET(page, mapping); | 1390 | VMCOREINFO_OFFSET(page, mapping); |
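The renames above split "size of a struct tag" from "size of an arbitrary type": page, pglist_data, zone, free_area, list_head and mem_section are struct tags, while nodemask_t is a typedef. A plausible reading of the new macros (an assumption about include/linux/kexec.h, which this hunk does not show) is that they differ only in whether sizeof() gets the struct keyword prepended:

    /* Assumed definitions, for illustration only -- check include/linux/kexec.h. */
    #define VMCOREINFO_STRUCT_SIZE(name) \
        vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \
                              (unsigned long)sizeof(struct name))
    #define VMCOREINFO_SIZE(name) \
        vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \
                              (unsigned long)sizeof(name))

    /* So VMCOREINFO_STRUCT_SIZE(page) records sizeof(struct page), while
     * VMCOREINFO_SIZE(nodemask_t) records sizeof(nodemask_t) directly. */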
diff --git a/kernel/panic.c b/kernel/panic.c index d9e90cfe3298..24af9f8bac99 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
| @@ -161,7 +161,7 @@ const char *print_tainted(void) | |||
| 161 | { | 161 | { |
| 162 | static char buf[20]; | 162 | static char buf[20]; |
| 163 | if (tainted) { | 163 | if (tainted) { |
| 164 | snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c", | 164 | snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c", |
| 165 | tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', | 165 | tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', |
| 166 | tainted & TAINT_FORCED_MODULE ? 'F' : ' ', | 166 | tainted & TAINT_FORCED_MODULE ? 'F' : ' ', |
| 167 | tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', | 167 | tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', |
| @@ -169,7 +169,8 @@ const char *print_tainted(void) | |||
| 169 | tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', | 169 | tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', |
| 170 | tainted & TAINT_BAD_PAGE ? 'B' : ' ', | 170 | tainted & TAINT_BAD_PAGE ? 'B' : ' ', |
| 171 | tainted & TAINT_USER ? 'U' : ' ', | 171 | tainted & TAINT_USER ? 'U' : ' ', |
| 172 | tainted & TAINT_DIE ? 'D' : ' '); | 172 | tainted & TAINT_DIE ? 'D' : ' ', |
| 173 | tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' '); | ||
| 173 | } | 174 | } |
| 174 | else | 175 | else |
| 175 | snprintf(buf, sizeof(buf), "Not tainted"); | 176 | snprintf(buf, sizeof(buf), "Not tainted"); |
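The format string grows from eight flag characters to nine for the new 'A' (TAINT_OVERRIDDEN_ACPI_TABLE) flag, and the result still fits the static buf[20]: "Tainted: " is nine characters, the nine flags add nine more, and the trailing NUL brings the longest string to 19 bytes. A small compile-time restatement of that arithmetic (plain C11, not part of the patch):

    #include <assert.h>

    /* Longest output: "Tainted: " prefix + 9 flag characters + NUL. */
    static_assert(sizeof("Tainted: ") - 1 + 9 + 1 <= 20,
                  "nine taint flags no longer fit in buf[20]");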
diff --git a/kernel/pid.c b/kernel/pid.c index f815455431bf..3b30bccdfcdc 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
| @@ -368,6 +368,7 @@ struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type) | |||
| 368 | } | 368 | } |
| 369 | return result; | 369 | return result; |
| 370 | } | 370 | } |
| 371 | EXPORT_SYMBOL(pid_task); | ||
| 371 | 372 | ||
| 372 | /* | 373 | /* |
| 373 | * Must be called under rcu_read_lock() or with tasklist_lock read-held. | 374 | * Must be called under rcu_read_lock() or with tasklist_lock read-held. |
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index ef9b802738a5..79833170bb9c 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
| @@ -74,8 +74,8 @@ config PM_TRACE_RTC | |||
| 74 | RTC across reboots, so that you can debug a machine that just hangs | 74 | RTC across reboots, so that you can debug a machine that just hangs |
| 75 | during suspend (or more commonly, during resume). | 75 | during suspend (or more commonly, during resume). |
| 76 | 76 | ||
| 77 | To use this debugging feature you should attempt to suspend the machine, | 77 | To use this debugging feature you should attempt to suspend the |
| 78 | then reboot it, then run | 78 | machine, reboot it and then run |
| 79 | 79 | ||
| 80 | dmesg -s 1000000 | grep 'hash matches' | 80 | dmesg -s 1000000 | grep 'hash matches' |
| 81 | 81 | ||
| @@ -123,7 +123,10 @@ config HIBERNATION | |||
| 123 | called "hibernation" in user interfaces. STD checkpoints the | 123 | called "hibernation" in user interfaces. STD checkpoints the |
| 124 | system and powers it off; and restores that checkpoint on reboot. | 124 | system and powers it off; and restores that checkpoint on reboot. |
| 125 | 125 | ||
| 126 | You can suspend your machine with 'echo disk > /sys/power/state'. | 126 | You can suspend your machine with 'echo disk > /sys/power/state' |
| 127 | after placing resume=/dev/swappartition on the kernel command line | ||
| 128 | in your bootloader's configuration file. | ||
| 129 | |||
| 127 | Alternatively, you can use the additional userland tools available | 130 | Alternatively, you can use the additional userland tools available |
| 128 | from <http://suspend.sf.net>. | 131 | from <http://suspend.sf.net>. |
| 129 | 132 | ||
diff --git a/kernel/res_counter.c b/kernel/res_counter.c new file mode 100644 index 000000000000..16cbec2d5d60 --- /dev/null +++ b/kernel/res_counter.c | |||
| @@ -0,0 +1,134 @@ | |||
| 1 | /* | ||
| 2 | * resource cgroups | ||
| 3 | * | ||
| 4 | * Copyright 2007 OpenVZ SWsoft Inc | ||
| 5 | * | ||
| 6 | * Author: Pavel Emelianov <xemul@openvz.org> | ||
| 7 | * | ||
| 8 | */ | ||
| 9 | |||
| 10 | #include <linux/types.h> | ||
| 11 | #include <linux/parser.h> | ||
| 12 | #include <linux/fs.h> | ||
| 13 | #include <linux/res_counter.h> | ||
| 14 | #include <linux/uaccess.h> | ||
| 15 | |||
| 16 | void res_counter_init(struct res_counter *counter) | ||
| 17 | { | ||
| 18 | spin_lock_init(&counter->lock); | ||
| 19 | counter->limit = (unsigned long long)LLONG_MAX; | ||
| 20 | } | ||
| 21 | |||
| 22 | int res_counter_charge_locked(struct res_counter *counter, unsigned long val) | ||
| 23 | { | ||
| 24 | if (counter->usage + val > counter->limit) { | ||
| 25 | counter->failcnt++; | ||
| 26 | return -ENOMEM; | ||
| 27 | } | ||
| 28 | |||
| 29 | counter->usage += val; | ||
| 30 | return 0; | ||
| 31 | } | ||
| 32 | |||
| 33 | int res_counter_charge(struct res_counter *counter, unsigned long val) | ||
| 34 | { | ||
| 35 | int ret; | ||
| 36 | unsigned long flags; | ||
| 37 | |||
| 38 | spin_lock_irqsave(&counter->lock, flags); | ||
| 39 | ret = res_counter_charge_locked(counter, val); | ||
| 40 | spin_unlock_irqrestore(&counter->lock, flags); | ||
| 41 | return ret; | ||
| 42 | } | ||
| 43 | |||
| 44 | void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) | ||
| 45 | { | ||
| 46 | if (WARN_ON(counter->usage < val)) | ||
| 47 | val = counter->usage; | ||
| 48 | |||
| 49 | counter->usage -= val; | ||
| 50 | } | ||
| 51 | |||
| 52 | void res_counter_uncharge(struct res_counter *counter, unsigned long val) | ||
| 53 | { | ||
| 54 | unsigned long flags; | ||
| 55 | |||
| 56 | spin_lock_irqsave(&counter->lock, flags); | ||
| 57 | res_counter_uncharge_locked(counter, val); | ||
| 58 | spin_unlock_irqrestore(&counter->lock, flags); | ||
| 59 | } | ||
| 60 | |||
| 61 | |||
| 62 | static inline unsigned long long * | ||
| 63 | res_counter_member(struct res_counter *counter, int member) | ||
| 64 | { | ||
| 65 | switch (member) { | ||
| 66 | case RES_USAGE: | ||
| 67 | return &counter->usage; | ||
| 68 | case RES_LIMIT: | ||
| 69 | return &counter->limit; | ||
| 70 | case RES_FAILCNT: | ||
| 71 | return &counter->failcnt; | ||
| 72 | } | ||
| 73 | |||
| 74 | BUG(); | ||
| 75 | return NULL; | ||
| 76 | } | ||
| 77 | |||
| 78 | ssize_t res_counter_read(struct res_counter *counter, int member, | ||
| 79 | const char __user *userbuf, size_t nbytes, loff_t *pos, | ||
| 80 | int (*read_strategy)(unsigned long long val, char *st_buf)) | ||
| 81 | { | ||
| 82 | unsigned long long *val; | ||
| 83 | char buf[64], *s; | ||
| 84 | |||
| 85 | s = buf; | ||
| 86 | val = res_counter_member(counter, member); | ||
| 87 | if (read_strategy) | ||
| 88 | s += read_strategy(*val, s); | ||
| 89 | else | ||
| 90 | s += sprintf(s, "%llu\n", *val); | ||
| 91 | return simple_read_from_buffer((void __user *)userbuf, nbytes, | ||
| 92 | pos, buf, s - buf); | ||
| 93 | } | ||
| 94 | |||
| 95 | ssize_t res_counter_write(struct res_counter *counter, int member, | ||
| 96 | const char __user *userbuf, size_t nbytes, loff_t *pos, | ||
| 97 | int (*write_strategy)(char *st_buf, unsigned long long *val)) | ||
| 98 | { | ||
| 99 | int ret; | ||
| 100 | char *buf, *end; | ||
| 101 | unsigned long flags; | ||
| 102 | unsigned long long tmp, *val; | ||
| 103 | |||
| 104 | buf = kmalloc(nbytes + 1, GFP_KERNEL); | ||
| 105 | ret = -ENOMEM; | ||
| 106 | if (buf == NULL) | ||
| 107 | goto out; | ||
| 108 | |||
| 109 | buf[nbytes] = '\0'; | ||
| 110 | ret = -EFAULT; | ||
| 111 | if (copy_from_user(buf, userbuf, nbytes)) | ||
| 112 | goto out_free; | ||
| 113 | |||
| 114 | ret = -EINVAL; | ||
| 115 | |||
| 116 | if (write_strategy) { | ||
| 117 | if (write_strategy(buf, &tmp)) { | ||
| 118 | goto out_free; | ||
| 119 | } | ||
| 120 | } else { | ||
| 121 | tmp = simple_strtoull(buf, &end, 10); | ||
| 122 | if (*end != '\0') | ||
| 123 | goto out_free; | ||
| 124 | } | ||
| 125 | spin_lock_irqsave(&counter->lock, flags); | ||
| 126 | val = res_counter_member(counter, member); | ||
| 127 | *val = tmp; | ||
| 128 | spin_unlock_irqrestore(&counter->lock, flags); | ||
| 129 | ret = nbytes; | ||
| 130 | out_free: | ||
| 131 | kfree(buf); | ||
| 132 | out: | ||
| 133 | return ret; | ||
| 134 | } | ||
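A controller built on top of this file would embed a struct res_counter, initialise it once, and bracket its allocations with charge/uncharge calls. A minimal sketch of that usage, with made-up my_* names rather than anything taken from the patch:

    #include <linux/res_counter.h>
    #include <linux/slab.h>

    static struct res_counter my_counter;    /* hypothetical controller state */

    static void my_counter_setup(void)
    {
        res_counter_init(&my_counter);       /* usage = 0, limit = LLONG_MAX */
    }

    /* Charge before allocating; undo the charge if the allocation fails. */
    static void *my_charge_alloc(unsigned long bytes)
    {
        void *p;

        if (res_counter_charge(&my_counter, bytes))
            return NULL;                     /* over the limit: failcnt was bumped */
        p = kmalloc(bytes, GFP_KERNEL);
        if (!p)
            res_counter_uncharge(&my_counter, bytes);
        return p;
    }

    static void my_charge_free(void *p, unsigned long bytes)
    {
        kfree(p);
        res_counter_uncharge(&my_counter, bytes);
    }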
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 86daaa26d120..8c98d8147d88 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -67,6 +67,7 @@ extern int sysctl_overcommit_memory; | |||
| 67 | extern int sysctl_overcommit_ratio; | 67 | extern int sysctl_overcommit_ratio; |
| 68 | extern int sysctl_panic_on_oom; | 68 | extern int sysctl_panic_on_oom; |
| 69 | extern int sysctl_oom_kill_allocating_task; | 69 | extern int sysctl_oom_kill_allocating_task; |
| 70 | extern int sysctl_oom_dump_tasks; | ||
| 70 | extern int max_threads; | 71 | extern int max_threads; |
| 71 | extern int core_uses_pid; | 72 | extern int core_uses_pid; |
| 72 | extern int suid_dumpable; | 73 | extern int suid_dumpable; |
| @@ -871,6 +872,14 @@ static struct ctl_table vm_table[] = { | |||
| 871 | .proc_handler = &proc_dointvec, | 872 | .proc_handler = &proc_dointvec, |
| 872 | }, | 873 | }, |
| 873 | { | 874 | { |
| 875 | .ctl_name = CTL_UNNUMBERED, | ||
| 876 | .procname = "oom_dump_tasks", | ||
| 877 | .data = &sysctl_oom_dump_tasks, | ||
| 878 | .maxlen = sizeof(sysctl_oom_dump_tasks), | ||
| 879 | .mode = 0644, | ||
| 880 | .proc_handler = &proc_dointvec, | ||
| 881 | }, | ||
| 882 | { | ||
| 874 | .ctl_name = VM_OVERCOMMIT_RATIO, | 883 | .ctl_name = VM_OVERCOMMIT_RATIO, |
| 875 | .procname = "overcommit_ratio", | 884 | .procname = "overcommit_ratio", |
| 876 | .data = &sysctl_overcommit_ratio, | 885 | .data = &sysctl_overcommit_ratio, |
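Because the new entry sits in vm_table with procname "oom_dump_tasks" and mode 0644, it should surface as /proc/sys/vm/oom_dump_tasks (assuming the usual vm_table mapping). A minimal userspace sketch that turns it on:

    #include <stdio.h>

    int main(void)
    {
        FILE *f = fopen("/proc/sys/vm/oom_dump_tasks", "w");

        if (!f) {
            perror("oom_dump_tasks");
            return 1;
        }
        fputs("1\n", f);    /* proc_dointvec parses a plain integer */
        return fclose(f) ? 1 : 0;
    }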
