Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile       |   1
-rw-r--r--  kernel/cgroup.c       | 318
-rw-r--r--  kernel/cpuset.c       | 394
-rw-r--r--  kernel/fork.c         |  11
-rw-r--r--  kernel/kexec.c        |  18
-rw-r--r--  kernel/panic.c        |   5
-rw-r--r--  kernel/pid.c          |   1
-rw-r--r--  kernel/power/Kconfig  |   9
-rw-r--r--  kernel/res_counter.c  | 134
-rw-r--r--  kernel/sysctl.c       |   9
10 files changed, 640 insertions(+), 260 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 135a1b943446..685697c0a181 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -43,6 +43,7 @@ obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
43obj-$(CONFIG_CPUSETS) += cpuset.o 43obj-$(CONFIG_CPUSETS) += cpuset.o
44obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o 44obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
45obj-$(CONFIG_IKCONFIG) += configs.o 45obj-$(CONFIG_IKCONFIG) += configs.o
46obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
46obj-$(CONFIG_STOP_MACHINE) += stop_machine.o 47obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
47obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 48obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
48obj-$(CONFIG_AUDIT) += audit.o auditfilter.o 49obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
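The new res_counter.o object above is built only when CONFIG_RESOURCE_COUNTERS is enabled. kernel/res_counter.c itself appears in the diffstat but is not shown on this page, so the sketch below only illustrates the general charge/uncharge pattern such a generic resource counter provides; the structure layout and every name in it are hypothetical, not a quote of that file.

/*
 * Sketch only: NOT the contents of kernel/res_counter.c (which this page
 * does not show).  It illustrates the usual pattern of charging usage
 * against a limit and counting refused charges.
 */
#include <linux/spinlock.h>
#include <linux/errno.h>

struct example_res_counter {
	unsigned long long usage;	/* amount currently charged */
	unsigned long long limit;	/* maximum allowed usage */
	unsigned long long failcnt;	/* number of refused charges */
	spinlock_t lock;		/* protects the fields above */
};

static void example_res_counter_init(struct example_res_counter *cnt,
				     unsigned long long limit)
{
	spin_lock_init(&cnt->lock);
	cnt->usage = 0;
	cnt->failcnt = 0;
	cnt->limit = limit;
}

/* Returns 0 on success, -ENOMEM if the charge would exceed the limit. */
static int example_res_counter_charge(struct example_res_counter *cnt,
				      unsigned long val)
{
	int ret = 0;

	spin_lock(&cnt->lock);
	if (cnt->usage + val > cnt->limit) {
		cnt->failcnt++;
		ret = -ENOMEM;
	} else {
		cnt->usage += val;
	}
	spin_unlock(&cnt->lock);
	return ret;
}

static void example_res_counter_uncharge(struct example_res_counter *cnt,
					 unsigned long val)
{
	spin_lock(&cnt->lock);
	if (val <= cnt->usage)
		cnt->usage -= val;
	spin_unlock(&cnt->lock);
}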
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1a3c23936d43..4766bb65e4d9 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -141,7 +141,7 @@ enum {
141 ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ 141 ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
142}; 142};
143 143
144inline int cgroup_is_releasable(const struct cgroup *cgrp) 144static int cgroup_is_releasable(const struct cgroup *cgrp)
145{ 145{
146 const int bits = 146 const int bits =
147 (1 << CGRP_RELEASABLE) | 147 (1 << CGRP_RELEASABLE) |
@@ -149,7 +149,7 @@ inline int cgroup_is_releasable(const struct cgroup *cgrp)
149 return (cgrp->flags & bits) == bits; 149 return (cgrp->flags & bits) == bits;
150} 150}
151 151
152inline int notify_on_release(const struct cgroup *cgrp) 152static int notify_on_release(const struct cgroup *cgrp)
153{ 153{
154 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 154 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
155} 155}
@@ -489,7 +489,7 @@ static struct css_set *find_css_set(
489 * Any task can increment and decrement the count field without lock. 489 * Any task can increment and decrement the count field without lock.
490 * So in general, code holding cgroup_mutex can't rely on the count 490 * So in general, code holding cgroup_mutex can't rely on the count
491 * field not changing. However, if the count goes to zero, then only 491 * field not changing. However, if the count goes to zero, then only
492 * attach_task() can increment it again. Because a count of zero 492 * cgroup_attach_task() can increment it again. Because a count of zero
493 * means that no tasks are currently attached, therefore there is no 493 * means that no tasks are currently attached, therefore there is no
494 * way a task attached to that cgroup can fork (the other way to 494 * way a task attached to that cgroup can fork (the other way to
495 * increment the count). So code holding cgroup_mutex can safely 495 * increment the count). So code holding cgroup_mutex can safely
@@ -520,17 +520,17 @@ static struct css_set *find_css_set(
520 * The task_lock() exception 520 * The task_lock() exception
521 * 521 *
522 * The need for this exception arises from the action of 522 * The need for this exception arises from the action of
523 * attach_task(), which overwrites one tasks cgroup pointer with 523 * cgroup_attach_task(), which overwrites one tasks cgroup pointer with
524 * another. It does so using cgroup_mutexe, however there are 524 * another. It does so using cgroup_mutexe, however there are
525 * several performance critical places that need to reference 525 * several performance critical places that need to reference
526 * task->cgroup without the expense of grabbing a system global 526 * task->cgroup without the expense of grabbing a system global
527 * mutex. Therefore except as noted below, when dereferencing or, as 527 * mutex. Therefore except as noted below, when dereferencing or, as
528 * in attach_task(), modifying a task'ss cgroup pointer we use 528 * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use
529 * task_lock(), which acts on a spinlock (task->alloc_lock) already in 529 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
530 * the task_struct routinely used for such matters. 530 * the task_struct routinely used for such matters.
531 * 531 *
532 * P.S. One more locking exception. RCU is used to guard the 532 * P.S. One more locking exception. RCU is used to guard the
533 * update of a tasks cgroup pointer by attach_task() 533 * update of a tasks cgroup pointer by cgroup_attach_task()
534 */ 534 */
535 535
536/** 536/**
@@ -586,11 +586,27 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
586 return inode; 586 return inode;
587} 587}
588 588
589/*
590 * Call subsys's pre_destroy handler.
591 * This is called before css refcnt check.
592 */
593
594static void cgroup_call_pre_destroy(struct cgroup *cgrp)
595{
596 struct cgroup_subsys *ss;
597 for_each_subsys(cgrp->root, ss)
598 if (ss->pre_destroy && cgrp->subsys[ss->subsys_id])
599 ss->pre_destroy(ss, cgrp);
600 return;
601}
602
603
589static void cgroup_diput(struct dentry *dentry, struct inode *inode) 604static void cgroup_diput(struct dentry *dentry, struct inode *inode)
590{ 605{
591 /* is dentry a directory ? if so, kfree() associated cgroup */ 606 /* is dentry a directory ? if so, kfree() associated cgroup */
592 if (S_ISDIR(inode->i_mode)) { 607 if (S_ISDIR(inode->i_mode)) {
593 struct cgroup *cgrp = dentry->d_fsdata; 608 struct cgroup *cgrp = dentry->d_fsdata;
609 struct cgroup_subsys *ss;
594 BUG_ON(!(cgroup_is_removed(cgrp))); 610 BUG_ON(!(cgroup_is_removed(cgrp)));
595 /* It's possible for external users to be holding css 611 /* It's possible for external users to be holding css
596 * reference counts on a cgroup; css_put() needs to 612 * reference counts on a cgroup; css_put() needs to
@@ -599,6 +615,23 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
599 * queue the cgroup to be handled by the release 615 * queue the cgroup to be handled by the release
600 * agent */ 616 * agent */
601 synchronize_rcu(); 617 synchronize_rcu();
618
619 mutex_lock(&cgroup_mutex);
620 /*
621 * Release the subsystem state objects.
622 */
623 for_each_subsys(cgrp->root, ss) {
624 if (cgrp->subsys[ss->subsys_id])
625 ss->destroy(ss, cgrp);
626 }
627
628 cgrp->root->number_of_cgroups--;
629 mutex_unlock(&cgroup_mutex);
630
631 /* Drop the active superblock reference that we took when we
632 * created the cgroup */
633 deactivate_super(cgrp->root->sb);
634
602 kfree(cgrp); 635 kfree(cgrp);
603 } 636 }
604 iput(inode); 637 iput(inode);
@@ -1161,7 +1194,7 @@ static void get_first_subsys(const struct cgroup *cgrp,
1161 * Call holding cgroup_mutex. May take task_lock of 1194 * Call holding cgroup_mutex. May take task_lock of
1162 * the task 'pid' during call. 1195 * the task 'pid' during call.
1163 */ 1196 */
1164static int attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1197int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1165{ 1198{
1166 int retval = 0; 1199 int retval = 0;
1167 struct cgroup_subsys *ss; 1200 struct cgroup_subsys *ss;
@@ -1181,9 +1214,8 @@ static int attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1181 for_each_subsys(root, ss) { 1214 for_each_subsys(root, ss) {
1182 if (ss->can_attach) { 1215 if (ss->can_attach) {
1183 retval = ss->can_attach(ss, cgrp, tsk); 1216 retval = ss->can_attach(ss, cgrp, tsk);
1184 if (retval) { 1217 if (retval)
1185 return retval; 1218 return retval;
1186 }
1187 } 1219 }
1188 } 1220 }
1189 1221
@@ -1192,9 +1224,8 @@ static int attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1192 * based on its final set of cgroups 1224 * based on its final set of cgroups
1193 */ 1225 */
1194 newcg = find_css_set(cg, cgrp); 1226 newcg = find_css_set(cg, cgrp);
1195 if (!newcg) { 1227 if (!newcg)
1196 return -ENOMEM; 1228 return -ENOMEM;
1197 }
1198 1229
1199 task_lock(tsk); 1230 task_lock(tsk);
1200 if (tsk->flags & PF_EXITING) { 1231 if (tsk->flags & PF_EXITING) {
@@ -1214,9 +1245,8 @@ static int attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1214 write_unlock(&css_set_lock); 1245 write_unlock(&css_set_lock);
1215 1246
1216 for_each_subsys(root, ss) { 1247 for_each_subsys(root, ss) {
1217 if (ss->attach) { 1248 if (ss->attach)
1218 ss->attach(ss, cgrp, oldcgrp, tsk); 1249 ss->attach(ss, cgrp, oldcgrp, tsk);
1219 }
1220 } 1250 }
1221 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1251 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1222 synchronize_rcu(); 1252 synchronize_rcu();
@@ -1239,7 +1269,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf)
1239 1269
1240 if (pid) { 1270 if (pid) {
1241 rcu_read_lock(); 1271 rcu_read_lock();
1242 tsk = find_task_by_pid(pid); 1272 tsk = find_task_by_vpid(pid);
1243 if (!tsk || tsk->flags & PF_EXITING) { 1273 if (!tsk || tsk->flags & PF_EXITING) {
1244 rcu_read_unlock(); 1274 rcu_read_unlock();
1245 return -ESRCH; 1275 return -ESRCH;
@@ -1257,7 +1287,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf)
1257 get_task_struct(tsk); 1287 get_task_struct(tsk);
1258 } 1288 }
1259 1289
1260 ret = attach_task(cgrp, tsk); 1290 ret = cgroup_attach_task(cgrp, tsk);
1261 put_task_struct(tsk); 1291 put_task_struct(tsk);
1262 return ret; 1292 return ret;
1263} 1293}
@@ -1329,9 +1359,14 @@ static ssize_t cgroup_common_file_write(struct cgroup *cgrp,
1329 goto out1; 1359 goto out1;
1330 } 1360 }
1331 buffer[nbytes] = 0; /* nul-terminate */ 1361 buffer[nbytes] = 0; /* nul-terminate */
1362 strstrip(buffer); /* strip -just- trailing whitespace */
1332 1363
1333 mutex_lock(&cgroup_mutex); 1364 mutex_lock(&cgroup_mutex);
1334 1365
1366 /*
1367 * This was already checked for in cgroup_file_write(), but
1368 * check again now we're holding cgroup_mutex.
1369 */
1335 if (cgroup_is_removed(cgrp)) { 1370 if (cgroup_is_removed(cgrp)) {
1336 retval = -ENODEV; 1371 retval = -ENODEV;
1337 goto out2; 1372 goto out2;
@@ -1349,24 +1384,9 @@ static ssize_t cgroup_common_file_write(struct cgroup *cgrp,
1349 clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 1384 clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
1350 break; 1385 break;
1351 case FILE_RELEASE_AGENT: 1386 case FILE_RELEASE_AGENT:
1352 { 1387 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
1353 struct cgroupfs_root *root = cgrp->root; 1388 strcpy(cgrp->root->release_agent_path, buffer);
1354 /* Strip trailing newline */
1355 if (nbytes && (buffer[nbytes-1] == '\n')) {
1356 buffer[nbytes-1] = 0;
1357 }
1358 if (nbytes < sizeof(root->release_agent_path)) {
1359 /* We never write anything other than '\0'
1360 * into the last char of release_agent_path,
1361 * so it always remains a NUL-terminated
1362 * string */
1363 strncpy(root->release_agent_path, buffer, nbytes);
1364 root->release_agent_path[nbytes] = 0;
1365 } else {
1366 retval = -ENOSPC;
1367 }
1368 break; 1389 break;
1369 }
1370 default: 1390 default:
1371 retval = -EINVAL; 1391 retval = -EINVAL;
1372 goto out2; 1392 goto out2;
@@ -1387,7 +1407,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
1387 struct cftype *cft = __d_cft(file->f_dentry); 1407 struct cftype *cft = __d_cft(file->f_dentry);
1388 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 1408 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
1389 1409
1390 if (!cft) 1410 if (!cft || cgroup_is_removed(cgrp))
1391 return -ENODEV; 1411 return -ENODEV;
1392 if (cft->write) 1412 if (cft->write)
1393 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 1413 return cft->write(cgrp, cft, file, buf, nbytes, ppos);
@@ -1457,7 +1477,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
1457 struct cftype *cft = __d_cft(file->f_dentry); 1477 struct cftype *cft = __d_cft(file->f_dentry);
1458 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 1478 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
1459 1479
1460 if (!cft) 1480 if (!cft || cgroup_is_removed(cgrp))
1461 return -ENODEV; 1481 return -ENODEV;
1462 1482
1463 if (cft->read) 1483 if (cft->read)
@@ -1675,6 +1695,29 @@ static void cgroup_advance_iter(struct cgroup *cgrp,
1675 it->task = cg->tasks.next; 1695 it->task = cg->tasks.next;
1676} 1696}
1677 1697
1698/*
1699 * To reduce the fork() overhead for systems that are not actually
1700 * using their cgroups capability, we don't maintain the lists running
1701 * through each css_set to its tasks until we see the list actually
1702 * used - in other words after the first call to cgroup_iter_start().
1703 *
1704 * The tasklist_lock is not held here, as do_each_thread() and
1705 * while_each_thread() are protected by RCU.
1706 */
1707void cgroup_enable_task_cg_lists(void)
1708{
1709 struct task_struct *p, *g;
1710 write_lock(&css_set_lock);
1711 use_task_css_set_links = 1;
1712 do_each_thread(g, p) {
1713 task_lock(p);
1714 if (list_empty(&p->cg_list))
1715 list_add(&p->cg_list, &p->cgroups->tasks);
1716 task_unlock(p);
1717 } while_each_thread(g, p);
1718 write_unlock(&css_set_lock);
1719}
1720
1678void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 1721void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
1679{ 1722{
1680 /* 1723 /*
@@ -1682,18 +1725,9 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
1682 * we need to enable the list linking each css_set to its 1725 * we need to enable the list linking each css_set to its
1683 * tasks, and fix up all existing tasks. 1726 * tasks, and fix up all existing tasks.
1684 */ 1727 */
1685 if (!use_task_css_set_links) { 1728 if (!use_task_css_set_links)
1686 struct task_struct *p, *g; 1729 cgroup_enable_task_cg_lists();
1687 write_lock(&css_set_lock); 1730
1688 use_task_css_set_links = 1;
1689 do_each_thread(g, p) {
1690 task_lock(p);
1691 if (list_empty(&p->cg_list))
1692 list_add(&p->cg_list, &p->cgroups->tasks);
1693 task_unlock(p);
1694 } while_each_thread(g, p);
1695 write_unlock(&css_set_lock);
1696 }
1697 read_lock(&css_set_lock); 1731 read_lock(&css_set_lock);
1698 it->cg_link = &cgrp->css_sets; 1732 it->cg_link = &cgrp->css_sets;
1699 cgroup_advance_iter(cgrp, it); 1733 cgroup_advance_iter(cgrp, it);
@@ -1726,6 +1760,166 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
1726 read_unlock(&css_set_lock); 1760 read_unlock(&css_set_lock);
1727} 1761}
1728 1762
1763static inline int started_after_time(struct task_struct *t1,
1764 struct timespec *time,
1765 struct task_struct *t2)
1766{
1767 int start_diff = timespec_compare(&t1->start_time, time);
1768 if (start_diff > 0) {
1769 return 1;
1770 } else if (start_diff < 0) {
1771 return 0;
1772 } else {
1773 /*
1774 * Arbitrarily, if two processes started at the same
1775 * time, we'll say that the lower pointer value
1776 * started first. Note that t2 may have exited by now
1777 * so this may not be a valid pointer any longer, but
1778 * that's fine - it still serves to distinguish
1779 * between two tasks started (effectively) simultaneously.
1780 */
1781 return t1 > t2;
1782 }
1783}
1784
1785/*
1786 * This function is a callback from heap_insert() and is used to order
1787 * the heap.
1788 * In this case we order the heap in descending task start time.
1789 */
1790static inline int started_after(void *p1, void *p2)
1791{
1792 struct task_struct *t1 = p1;
1793 struct task_struct *t2 = p2;
1794 return started_after_time(t1, &t2->start_time, t2);
1795}
1796
1797/**
1798 * cgroup_scan_tasks - iterate though all the tasks in a cgroup
1799 * @scan: struct cgroup_scanner containing arguments for the scan
1800 *
1801 * Arguments include pointers to callback functions test_task() and
1802 * process_task().
1803 * Iterate through all the tasks in a cgroup, calling test_task() for each,
1804 * and if it returns true, call process_task() for it also.
1805 * The test_task pointer may be NULL, meaning always true (select all tasks).
1806 * Effectively duplicates cgroup_iter_{start,next,end}()
1807 * but does not lock css_set_lock for the call to process_task().
1808 * The struct cgroup_scanner may be embedded in any structure of the caller's
1809 * creation.
1810 * It is guaranteed that process_task() will act on every task that
1811 * is a member of the cgroup for the duration of this call. This
1812 * function may or may not call process_task() for tasks that exit
1813 * or move to a different cgroup during the call, or are forked or
1814 * move into the cgroup during the call.
1815 *
1816 * Note that test_task() may be called with locks held, and may in some
1817 * situations be called multiple times for the same task, so it should
1818 * be cheap.
1819 * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
1820 * pre-allocated and will be used for heap operations (and its "gt" member will
1821 * be overwritten), else a temporary heap will be used (allocation of which
1822 * may cause this function to fail).
1823 */
1824int cgroup_scan_tasks(struct cgroup_scanner *scan)
1825{
1826 int retval, i;
1827 struct cgroup_iter it;
1828 struct task_struct *p, *dropped;
1829 /* Never dereference latest_task, since it's not refcounted */
1830 struct task_struct *latest_task = NULL;
1831 struct ptr_heap tmp_heap;
1832 struct ptr_heap *heap;
1833 struct timespec latest_time = { 0, 0 };
1834
1835 if (scan->heap) {
1836 /* The caller supplied our heap and pre-allocated its memory */
1837 heap = scan->heap;
1838 heap->gt = &started_after;
1839 } else {
1840 /* We need to allocate our own heap memory */
1841 heap = &tmp_heap;
1842 retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
1843 if (retval)
1844 /* cannot allocate the heap */
1845 return retval;
1846 }
1847
1848 again:
1849 /*
1850 * Scan tasks in the cgroup, using the scanner's "test_task" callback
1851 * to determine which are of interest, and using the scanner's
1852 * "process_task" callback to process any of them that need an update.
1853 * Since we don't want to hold any locks during the task updates,
1854 * gather tasks to be processed in a heap structure.
1855 * The heap is sorted by descending task start time.
1856 * If the statically-sized heap fills up, we overflow tasks that
1857 * started later, and in future iterations only consider tasks that
1858 * started after the latest task in the previous pass. This
1859 * guarantees forward progress and that we don't miss any tasks.
1860 */
1861 heap->size = 0;
1862 cgroup_iter_start(scan->cg, &it);
1863 while ((p = cgroup_iter_next(scan->cg, &it))) {
1864 /*
1865 * Only affect tasks that qualify per the caller's callback,
1866 * if he provided one
1867 */
1868 if (scan->test_task && !scan->test_task(p, scan))
1869 continue;
1870 /*
1871 * Only process tasks that started after the last task
1872 * we processed
1873 */
1874 if (!started_after_time(p, &latest_time, latest_task))
1875 continue;
1876 dropped = heap_insert(heap, p);
1877 if (dropped == NULL) {
1878 /*
1879 * The new task was inserted; the heap wasn't
1880 * previously full
1881 */
1882 get_task_struct(p);
1883 } else if (dropped != p) {
1884 /*
1885 * The new task was inserted, and pushed out a
1886 * different task
1887 */
1888 get_task_struct(p);
1889 put_task_struct(dropped);
1890 }
1891 /*
1892 * Else the new task was newer than anything already in
1893 * the heap and wasn't inserted
1894 */
1895 }
1896 cgroup_iter_end(scan->cg, &it);
1897
1898 if (heap->size) {
1899 for (i = 0; i < heap->size; i++) {
1900 struct task_struct *p = heap->ptrs[i];
1901 if (i == 0) {
1902 latest_time = p->start_time;
1903 latest_task = p;
1904 }
1905 /* Process the task per the caller's callback */
1906 scan->process_task(p, scan);
1907 put_task_struct(p);
1908 }
1909 /*
1910 * If we had to process any tasks at all, scan again
1911 * in case some of them were in the middle of forking
1912 * children that didn't get processed.
1913 * Not the most efficient way to do it, but it avoids
1914 * having to take callback_mutex in the fork path
1915 */
1916 goto again;
1917 }
1918 if (heap == &tmp_heap)
1919 heap_free(&tmp_heap);
1920 return 0;
1921}
1922
1729/* 1923/*
1730 * Stuff for reading the 'tasks' file. 1924 * Stuff for reading the 'tasks' file.
1731 * 1925 *
@@ -1761,7 +1955,7 @@ static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
1761 while ((tsk = cgroup_iter_next(cgrp, &it))) { 1955 while ((tsk = cgroup_iter_next(cgrp, &it))) {
1762 if (unlikely(n == npids)) 1956 if (unlikely(n == npids))
1763 break; 1957 break;
1764 pidarray[n++] = task_pid_nr(tsk); 1958 pidarray[n++] = task_pid_vnr(tsk);
1765 } 1959 }
1766 cgroup_iter_end(cgrp, &it); 1960 cgroup_iter_end(cgrp, &it);
1767 return n; 1961 return n;
@@ -2126,9 +2320,8 @@ static inline int cgroup_has_css_refs(struct cgroup *cgrp)
2126 * matter, since it can only happen if the cgroup 2320 * matter, since it can only happen if the cgroup
2127 * has been deleted and hence no longer needs the 2321 * has been deleted and hence no longer needs the
2128 * release agent to be called anyway. */ 2322 * release agent to be called anyway. */
2129 if (css && atomic_read(&css->refcnt)) { 2323 if (css && atomic_read(&css->refcnt))
2130 return 1; 2324 return 1;
2131 }
2132 } 2325 }
2133 return 0; 2326 return 0;
2134} 2327}
@@ -2138,7 +2331,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2138 struct cgroup *cgrp = dentry->d_fsdata; 2331 struct cgroup *cgrp = dentry->d_fsdata;
2139 struct dentry *d; 2332 struct dentry *d;
2140 struct cgroup *parent; 2333 struct cgroup *parent;
2141 struct cgroup_subsys *ss;
2142 struct super_block *sb; 2334 struct super_block *sb;
2143 struct cgroupfs_root *root; 2335 struct cgroupfs_root *root;
2144 2336
@@ -2157,17 +2349,19 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2157 parent = cgrp->parent; 2349 parent = cgrp->parent;
2158 root = cgrp->root; 2350 root = cgrp->root;
2159 sb = root->sb; 2351 sb = root->sb;
2352 /*
2353 * Call pre_destroy handlers of subsys
2354 */
2355 cgroup_call_pre_destroy(cgrp);
2356 /*
2357 * Notify subsyses that rmdir() request comes.
2358 */
2160 2359
2161 if (cgroup_has_css_refs(cgrp)) { 2360 if (cgroup_has_css_refs(cgrp)) {
2162 mutex_unlock(&cgroup_mutex); 2361 mutex_unlock(&cgroup_mutex);
2163 return -EBUSY; 2362 return -EBUSY;
2164 } 2363 }
2165 2364
2166 for_each_subsys(root, ss) {
2167 if (cgrp->subsys[ss->subsys_id])
2168 ss->destroy(ss, cgrp);
2169 }
2170
2171 spin_lock(&release_list_lock); 2365 spin_lock(&release_list_lock);
2172 set_bit(CGRP_REMOVED, &cgrp->flags); 2366 set_bit(CGRP_REMOVED, &cgrp->flags);
2173 if (!list_empty(&cgrp->release_list)) 2367 if (!list_empty(&cgrp->release_list))
@@ -2182,15 +2376,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
2182 2376
2183 cgroup_d_remove_dir(d); 2377 cgroup_d_remove_dir(d);
2184 dput(d); 2378 dput(d);
2185 root->number_of_cgroups--;
2186 2379
2187 set_bit(CGRP_RELEASABLE, &parent->flags); 2380 set_bit(CGRP_RELEASABLE, &parent->flags);
2188 check_for_release(parent); 2381 check_for_release(parent);
2189 2382
2190 mutex_unlock(&cgroup_mutex); 2383 mutex_unlock(&cgroup_mutex);
2191 /* Drop the active superblock reference that we took when we
2192 * created the cgroup */
2193 deactivate_super(sb);
2194 return 0; 2384 return 0;
2195} 2385}
2196 2386
@@ -2324,7 +2514,7 @@ out:
2324 * - Used for /proc/<pid>/cgroup. 2514 * - Used for /proc/<pid>/cgroup.
2325 * - No need to task_lock(tsk) on this tsk->cgroup reference, as it 2515 * - No need to task_lock(tsk) on this tsk->cgroup reference, as it
2326 * doesn't really matter if tsk->cgroup changes after we read it, 2516 * doesn't really matter if tsk->cgroup changes after we read it,
2327 * and we take cgroup_mutex, keeping attach_task() from changing it 2517 * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
2328 * anyway. No need to check that tsk->cgroup != NULL, thanks to 2518 * anyway. No need to check that tsk->cgroup != NULL, thanks to
2329 * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks 2519 * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks
2330 * cgroup to top_cgroup. 2520 * cgroup to top_cgroup.
@@ -2435,7 +2625,7 @@ static struct file_operations proc_cgroupstats_operations = {
2435 * A pointer to the shared css_set was automatically copied in 2625 * A pointer to the shared css_set was automatically copied in
2436 * fork.c by dup_task_struct(). However, we ignore that copy, since 2626 * fork.c by dup_task_struct(). However, we ignore that copy, since
2437 * it was not made under the protection of RCU or cgroup_mutex, so 2627 * it was not made under the protection of RCU or cgroup_mutex, so
2438 * might no longer be a valid cgroup pointer. attach_task() might 2628 * might no longer be a valid cgroup pointer. cgroup_attach_task() might
2439 * have already changed current->cgroups, allowing the previously 2629 * have already changed current->cgroups, allowing the previously
2440 * referenced cgroup group to be removed and freed. 2630 * referenced cgroup group to be removed and freed.
2441 * 2631 *
@@ -2514,8 +2704,8 @@ void cgroup_post_fork(struct task_struct *child)
2514 * attach us to a different cgroup, decrementing the count on 2704 * attach us to a different cgroup, decrementing the count on
2515 * the first cgroup that we never incremented. But in this case, 2705 * the first cgroup that we never incremented. But in this case,
2516 * top_cgroup isn't going away, and either task has PF_EXITING set, 2706 * top_cgroup isn't going away, and either task has PF_EXITING set,
2517 * which wards off any attach_task() attempts, or task is a failed 2707 * which wards off any cgroup_attach_task() attempts, or task is a failed
2518 * fork, never visible to attach_task. 2708 * fork, never visible to cgroup_attach_task.
2519 * 2709 *
2520 */ 2710 */
2521void cgroup_exit(struct task_struct *tsk, int run_callbacks) 2711void cgroup_exit(struct task_struct *tsk, int run_callbacks)
@@ -2655,7 +2845,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
2655 } 2845 }
2656 2846
2657 /* All seems fine. Finish by moving the task into the new cgroup */ 2847 /* All seems fine. Finish by moving the task into the new cgroup */
2658 ret = attach_task(child, tsk); 2848 ret = cgroup_attach_task(child, tsk);
2659 mutex_unlock(&cgroup_mutex); 2849 mutex_unlock(&cgroup_mutex);
2660 2850
2661 out_release: 2851 out_release:
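Before the cpuset.c half of the patch, a minimal sketch of how a controller can drive the cgroup_scan_tasks() interface introduced above. The scanner fields (cg, test_task, process_task, heap) and the callback signatures come straight from this patch; the example_* callbacks and the task-counting behaviour are hypothetical.

/*
 * Illustration only: a hypothetical controller that counts the tasks
 * currently in a cgroup via cgroup_scan_tasks().  The scanner fields
 * mirror the cpuset usage further down in this patch; the example_*
 * names are made up.
 */
#include <linux/cgroup.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <asm/atomic.h>

static atomic_t example_task_count = ATOMIC_INIT(0);

/* Cheap predicate; may be called under locks and more than once per task. */
static int example_test_task(struct task_struct *p, struct cgroup_scanner *scan)
{
	return !(p->flags & PF_EXITING);
}

/* Called once per selected task, without css_set_lock held. */
static void example_process_task(struct task_struct *p,
				 struct cgroup_scanner *scan)
{
	atomic_inc(&example_task_count);
}

static void example_scan_cgroup(struct cgroup *cgrp)
{
	struct cgroup_scanner scan = {
		.cg		= cgrp,
		.test_task	= example_test_task,
		.process_task	= example_process_task,
		.heap		= NULL,	/* let cgroup_scan_tasks() allocate one */
	};

	if (cgroup_scan_tasks(&scan))
		printk(KERN_WARNING "example: cgroup task scan failed\n");
}

Passing a pre-allocated ptr_heap instead of NULL avoids the GFP_KERNEL allocation inside cgroup_scan_tasks(), which is what the cpuset cpumask update below does.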
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index cfaf6419d817..67b2bfe27814 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -38,7 +38,6 @@
38#include <linux/mount.h> 38#include <linux/mount.h>
39#include <linux/namei.h> 39#include <linux/namei.h>
40#include <linux/pagemap.h> 40#include <linux/pagemap.h>
41#include <linux/prio_heap.h>
42#include <linux/proc_fs.h> 41#include <linux/proc_fs.h>
43#include <linux/rcupdate.h> 42#include <linux/rcupdate.h>
44#include <linux/sched.h> 43#include <linux/sched.h>
@@ -56,6 +55,8 @@
56#include <asm/atomic.h> 55#include <asm/atomic.h>
57#include <linux/mutex.h> 56#include <linux/mutex.h>
58#include <linux/kfifo.h> 57#include <linux/kfifo.h>
58#include <linux/workqueue.h>
59#include <linux/cgroup.h>
59 60
60/* 61/*
61 * Tracks how many cpusets are currently defined in system. 62 * Tracks how many cpusets are currently defined in system.
@@ -64,7 +65,7 @@
64 */ 65 */
65int number_of_cpusets __read_mostly; 66int number_of_cpusets __read_mostly;
66 67
67/* Retrieve the cpuset from a cgroup */ 68/* Forward declare cgroup structures */
68struct cgroup_subsys cpuset_subsys; 69struct cgroup_subsys cpuset_subsys;
69struct cpuset; 70struct cpuset;
70 71
@@ -96,6 +97,9 @@ struct cpuset {
96 97
97 /* partition number for rebuild_sched_domains() */ 98 /* partition number for rebuild_sched_domains() */
98 int pn; 99 int pn;
100
101 /* used for walking a cpuset heirarchy */
102 struct list_head stack_list;
99}; 103};
100 104
101/* Retrieve the cpuset for a cgroup */ 105/* Retrieve the cpuset for a cgroup */
@@ -111,7 +115,10 @@ static inline struct cpuset *task_cs(struct task_struct *task)
111 return container_of(task_subsys_state(task, cpuset_subsys_id), 115 return container_of(task_subsys_state(task, cpuset_subsys_id),
112 struct cpuset, css); 116 struct cpuset, css);
113} 117}
114 118struct cpuset_hotplug_scanner {
119 struct cgroup_scanner scan;
120 struct cgroup *to;
121};
115 122
116/* bits in struct cpuset flags field */ 123/* bits in struct cpuset flags field */
117typedef enum { 124typedef enum {
@@ -160,17 +167,17 @@ static inline int is_spread_slab(const struct cpuset *cs)
160 * number, and avoid having to lock and reload mems_allowed unless 167 * number, and avoid having to lock and reload mems_allowed unless
161 * the cpuset they're using changes generation. 168 * the cpuset they're using changes generation.
162 * 169 *
163 * A single, global generation is needed because attach_task() could 170 * A single, global generation is needed because cpuset_attach_task() could
164 * reattach a task to a different cpuset, which must not have its 171 * reattach a task to a different cpuset, which must not have its
165 * generation numbers aliased with those of that tasks previous cpuset. 172 * generation numbers aliased with those of that tasks previous cpuset.
166 * 173 *
167 * Generations are needed for mems_allowed because one task cannot 174 * Generations are needed for mems_allowed because one task cannot
168 * modify anothers memory placement. So we must enable every task, 175 * modify another's memory placement. So we must enable every task,
169 * on every visit to __alloc_pages(), to efficiently check whether 176 * on every visit to __alloc_pages(), to efficiently check whether
170 * its current->cpuset->mems_allowed has changed, requiring an update 177 * its current->cpuset->mems_allowed has changed, requiring an update
171 * of its current->mems_allowed. 178 * of its current->mems_allowed.
172 * 179 *
173 * Since cpuset_mems_generation is guarded by manage_mutex, 180 * Since writes to cpuset_mems_generation are guarded by the cgroup lock
174 * there is no need to mark it atomic. 181 * there is no need to mark it atomic.
175 */ 182 */
176static int cpuset_mems_generation; 183static int cpuset_mems_generation;
@@ -182,17 +189,20 @@ static struct cpuset top_cpuset = {
182}; 189};
183 190
184/* 191/*
185 * We have two global cpuset mutexes below. They can nest. 192 * There are two global mutexes guarding cpuset structures. The first
186 * It is ok to first take manage_mutex, then nest callback_mutex. We also 193 * is the main control groups cgroup_mutex, accessed via
187 * require taking task_lock() when dereferencing a tasks cpuset pointer. 194 * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific
188 * See "The task_lock() exception", at the end of this comment. 195 * callback_mutex, below. They can nest. It is ok to first take
196 * cgroup_mutex, then nest callback_mutex. We also require taking
197 * task_lock() when dereferencing a task's cpuset pointer. See "The
198 * task_lock() exception", at the end of this comment.
189 * 199 *
190 * A task must hold both mutexes to modify cpusets. If a task 200 * A task must hold both mutexes to modify cpusets. If a task
191 * holds manage_mutex, then it blocks others wanting that mutex, 201 * holds cgroup_mutex, then it blocks others wanting that mutex,
192 * ensuring that it is the only task able to also acquire callback_mutex 202 * ensuring that it is the only task able to also acquire callback_mutex
193 * and be able to modify cpusets. It can perform various checks on 203 * and be able to modify cpusets. It can perform various checks on
194 * the cpuset structure first, knowing nothing will change. It can 204 * the cpuset structure first, knowing nothing will change. It can
195 * also allocate memory while just holding manage_mutex. While it is 205 * also allocate memory while just holding cgroup_mutex. While it is
196 * performing these checks, various callback routines can briefly 206 * performing these checks, various callback routines can briefly
197 * acquire callback_mutex to query cpusets. Once it is ready to make 207 * acquire callback_mutex to query cpusets. Once it is ready to make
198 * the changes, it takes callback_mutex, blocking everyone else. 208 * the changes, it takes callback_mutex, blocking everyone else.
@@ -208,60 +218,16 @@ static struct cpuset top_cpuset = {
208 * The task_struct fields mems_allowed and mems_generation may only 218 * The task_struct fields mems_allowed and mems_generation may only
209 * be accessed in the context of that task, so require no locks. 219 * be accessed in the context of that task, so require no locks.
210 * 220 *
211 * Any task can increment and decrement the count field without lock.
212 * So in general, code holding manage_mutex or callback_mutex can't rely
213 * on the count field not changing. However, if the count goes to
214 * zero, then only attach_task(), which holds both mutexes, can
215 * increment it again. Because a count of zero means that no tasks
216 * are currently attached, therefore there is no way a task attached
217 * to that cpuset can fork (the other way to increment the count).
218 * So code holding manage_mutex or callback_mutex can safely assume that
219 * if the count is zero, it will stay zero. Similarly, if a task
220 * holds manage_mutex or callback_mutex on a cpuset with zero count, it
221 * knows that the cpuset won't be removed, as cpuset_rmdir() needs
222 * both of those mutexes.
223 *
224 * The cpuset_common_file_write handler for operations that modify 221 * The cpuset_common_file_write handler for operations that modify
225 * the cpuset hierarchy holds manage_mutex across the entire operation, 222 * the cpuset hierarchy holds cgroup_mutex across the entire operation,
226 * single threading all such cpuset modifications across the system. 223 * single threading all such cpuset modifications across the system.
227 * 224 *
228 * The cpuset_common_file_read() handlers only hold callback_mutex across 225 * The cpuset_common_file_read() handlers only hold callback_mutex across
229 * small pieces of code, such as when reading out possibly multi-word 226 * small pieces of code, such as when reading out possibly multi-word
230 * cpumasks and nodemasks. 227 * cpumasks and nodemasks.
231 * 228 *
232 * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't 229 * Accessing a task's cpuset should be done in accordance with the
233 * (usually) take either mutex. These are the two most performance 230 * guidelines for accessing subsystem state in kernel/cgroup.c
234 * critical pieces of code here. The exception occurs on cpuset_exit(),
235 * when a task in a notify_on_release cpuset exits. Then manage_mutex
236 * is taken, and if the cpuset count is zero, a usermode call made
237 * to /sbin/cpuset_release_agent with the name of the cpuset (path
238 * relative to the root of cpuset file system) as the argument.
239 *
240 * A cpuset can only be deleted if both its 'count' of using tasks
241 * is zero, and its list of 'children' cpusets is empty. Since all
242 * tasks in the system use _some_ cpuset, and since there is always at
243 * least one task in the system (init), therefore, top_cpuset
244 * always has either children cpusets and/or using tasks. So we don't
245 * need a special hack to ensure that top_cpuset cannot be deleted.
246 *
247 * The above "Tale of Two Semaphores" would be complete, but for:
248 *
249 * The task_lock() exception
250 *
251 * The need for this exception arises from the action of attach_task(),
252 * which overwrites one tasks cpuset pointer with another. It does
253 * so using both mutexes, however there are several performance
254 * critical places that need to reference task->cpuset without the
255 * expense of grabbing a system global mutex. Therefore except as
256 * noted below, when dereferencing or, as in attach_task(), modifying
257 * a tasks cpuset pointer we use task_lock(), which acts on a spinlock
258 * (task->alloc_lock) already in the task_struct routinely used for
259 * such matters.
260 *
261 * P.S. One more locking exception. RCU is used to guard the
262 * update of a tasks cpuset pointer by attach_task() and the
263 * access of task->cpuset->mems_generation via that pointer in
264 * the routine cpuset_update_task_memory_state().
265 */ 231 */
266 232
267static DEFINE_MUTEX(callback_mutex); 233static DEFINE_MUTEX(callback_mutex);
@@ -354,15 +320,14 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
354 * Do not call this routine if in_interrupt(). 320 * Do not call this routine if in_interrupt().
355 * 321 *
356 * Call without callback_mutex or task_lock() held. May be 322 * Call without callback_mutex or task_lock() held. May be
357 * called with or without manage_mutex held. Thanks in part to 323 * called with or without cgroup_mutex held. Thanks in part to
358 * 'the_top_cpuset_hack', the tasks cpuset pointer will never 324 * 'the_top_cpuset_hack', the task's cpuset pointer will never
359 * be NULL. This routine also might acquire callback_mutex and 325 * be NULL. This routine also might acquire callback_mutex and
360 * current->mm->mmap_sem during call. 326 * current->mm->mmap_sem during call.
361 * 327 *
362 * Reading current->cpuset->mems_generation doesn't need task_lock 328 * Reading current->cpuset->mems_generation doesn't need task_lock
363 * to guard the current->cpuset derefence, because it is guarded 329 * to guard the current->cpuset derefence, because it is guarded
364 * from concurrent freeing of current->cpuset by attach_task(), 330 * from concurrent freeing of current->cpuset using RCU.
365 * using RCU.
366 * 331 *
367 * The rcu_dereference() is technically probably not needed, 332 * The rcu_dereference() is technically probably not needed,
368 * as I don't actually mind if I see a new cpuset pointer but 333 * as I don't actually mind if I see a new cpuset pointer but
@@ -424,7 +389,7 @@ void cpuset_update_task_memory_state(void)
424 * 389 *
425 * One cpuset is a subset of another if all its allowed CPUs and 390 * One cpuset is a subset of another if all its allowed CPUs and
426 * Memory Nodes are a subset of the other, and its exclusive flags 391 * Memory Nodes are a subset of the other, and its exclusive flags
427 * are only set if the other's are set. Call holding manage_mutex. 392 * are only set if the other's are set. Call holding cgroup_mutex.
428 */ 393 */
429 394
430static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) 395static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -442,7 +407,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
442 * If we replaced the flag and mask values of the current cpuset 407 * If we replaced the flag and mask values of the current cpuset
443 * (cur) with those values in the trial cpuset (trial), would 408 * (cur) with those values in the trial cpuset (trial), would
444 * our various subset and exclusive rules still be valid? Presumes 409 * our various subset and exclusive rules still be valid? Presumes
445 * manage_mutex held. 410 * cgroup_mutex held.
446 * 411 *
447 * 'cur' is the address of an actual, in-use cpuset. Operations 412 * 'cur' is the address of an actual, in-use cpuset. Operations
448 * such as list traversal that depend on the actual address of the 413 * such as list traversal that depend on the actual address of the
@@ -476,7 +441,10 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
476 if (!is_cpuset_subset(trial, par)) 441 if (!is_cpuset_subset(trial, par))
477 return -EACCES; 442 return -EACCES;
478 443
479 /* If either I or some sibling (!= me) is exclusive, we can't overlap */ 444 /*
445 * If either I or some sibling (!= me) is exclusive, we can't
446 * overlap
447 */
480 list_for_each_entry(cont, &par->css.cgroup->children, sibling) { 448 list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
481 c = cgroup_cs(cont); 449 c = cgroup_cs(cont);
482 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && 450 if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
@@ -732,22 +700,50 @@ static inline int started_after(void *p1, void *p2)
732 return started_after_time(t1, &t2->start_time, t2); 700 return started_after_time(t1, &t2->start_time, t2);
733} 701}
734 702
735/* 703/**
736 * Call with manage_mutex held. May take callback_mutex during call. 704 * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's
705 * @tsk: task to test
706 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
707 *
708 * Call with cgroup_mutex held. May take callback_mutex during call.
709 * Called for each task in a cgroup by cgroup_scan_tasks().
710 * Return nonzero if this tasks's cpus_allowed mask should be changed (in other
711 * words, if its mask is not equal to its cpuset's mask).
712 */
713int cpuset_test_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan)
714{
715 return !cpus_equal(tsk->cpus_allowed,
716 (cgroup_cs(scan->cg))->cpus_allowed);
717}
718
719/**
720 * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
721 * @tsk: task to test
722 * @scan: struct cgroup_scanner containing the cgroup of the task
723 *
724 * Called by cgroup_scan_tasks() for each task in a cgroup whose
725 * cpus_allowed mask needs to be changed.
726 *
727 * We don't need to re-check for the cgroup/cpuset membership, since we're
728 * holding cgroup_lock() at this point.
737 */ 729 */
730void cpuset_change_cpumask(struct task_struct *tsk, struct cgroup_scanner *scan)
731{
732 set_cpus_allowed(tsk, (cgroup_cs(scan->cg))->cpus_allowed);
733}
738 734
735/**
736 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
737 * @cs: the cpuset to consider
738 * @buf: buffer of cpu numbers written to this cpuset
739 */
739static int update_cpumask(struct cpuset *cs, char *buf) 740static int update_cpumask(struct cpuset *cs, char *buf)
740{ 741{
741 struct cpuset trialcs; 742 struct cpuset trialcs;
742 int retval, i; 743 struct cgroup_scanner scan;
743 int is_load_balanced;
744 struct cgroup_iter it;
745 struct cgroup *cgrp = cs->css.cgroup;
746 struct task_struct *p, *dropped;
747 /* Never dereference latest_task, since it's not refcounted */
748 struct task_struct *latest_task = NULL;
749 struct ptr_heap heap; 744 struct ptr_heap heap;
750 struct timespec latest_time = { 0, 0 }; 745 int retval;
746 int is_load_balanced;
751 747
752 /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ 748 /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
753 if (cs == &top_cpuset) 749 if (cs == &top_cpuset)
@@ -756,7 +752,7 @@ static int update_cpumask(struct cpuset *cs, char *buf)
756 trialcs = *cs; 752 trialcs = *cs;
757 753
758 /* 754 /*
759 * An empty cpus_allowed is ok iff there are no tasks in the cpuset. 755 * An empty cpus_allowed is ok only if the cpuset has no tasks.
760 * Since cpulist_parse() fails on an empty mask, we special case 756 * Since cpulist_parse() fails on an empty mask, we special case
761 * that parsing. The validate_change() call ensures that cpusets 757 * that parsing. The validate_change() call ensures that cpusets
762 * with tasks have cpus. 758 * with tasks have cpus.
@@ -777,6 +773,7 @@ static int update_cpumask(struct cpuset *cs, char *buf)
777 /* Nothing to do if the cpus didn't change */ 773 /* Nothing to do if the cpus didn't change */
778 if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) 774 if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
779 return 0; 775 return 0;
776
780 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after); 777 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after);
781 if (retval) 778 if (retval)
782 return retval; 779 return retval;
@@ -787,62 +784,19 @@ static int update_cpumask(struct cpuset *cs, char *buf)
787 cs->cpus_allowed = trialcs.cpus_allowed; 784 cs->cpus_allowed = trialcs.cpus_allowed;
788 mutex_unlock(&callback_mutex); 785 mutex_unlock(&callback_mutex);
789 786
790 again:
791 /* 787 /*
792 * Scan tasks in the cpuset, and update the cpumasks of any 788 * Scan tasks in the cpuset, and update the cpumasks of any
793 * that need an update. Since we can't call set_cpus_allowed() 789 * that need an update.
794 * while holding tasklist_lock, gather tasks to be processed
795 * in a heap structure. If the statically-sized heap fills up,
796 * overflow tasks that started later, and in future iterations
797 * only consider tasks that started after the latest task in
798 * the previous pass. This guarantees forward progress and
799 * that we don't miss any tasks
800 */ 790 */
801 heap.size = 0; 791 scan.cg = cs->css.cgroup;
802 cgroup_iter_start(cgrp, &it); 792 scan.test_task = cpuset_test_cpumask;
803 while ((p = cgroup_iter_next(cgrp, &it))) { 793 scan.process_task = cpuset_change_cpumask;
804 /* Only affect tasks that don't have the right cpus_allowed */ 794 scan.heap = &heap;
805 if (cpus_equal(p->cpus_allowed, cs->cpus_allowed)) 795 cgroup_scan_tasks(&scan);
806 continue;
807 /*
808 * Only process tasks that started after the last task
809 * we processed
810 */
811 if (!started_after_time(p, &latest_time, latest_task))
812 continue;
813 dropped = heap_insert(&heap, p);
814 if (dropped == NULL) {
815 get_task_struct(p);
816 } else if (dropped != p) {
817 get_task_struct(p);
818 put_task_struct(dropped);
819 }
820 }
821 cgroup_iter_end(cgrp, &it);
822 if (heap.size) {
823 for (i = 0; i < heap.size; i++) {
824 struct task_struct *p = heap.ptrs[i];
825 if (i == 0) {
826 latest_time = p->start_time;
827 latest_task = p;
828 }
829 set_cpus_allowed(p, cs->cpus_allowed);
830 put_task_struct(p);
831 }
832 /*
833 * If we had to process any tasks at all, scan again
834 * in case some of them were in the middle of forking
835 * children that didn't notice the new cpumask
836 * restriction. Not the most efficient way to do it,
837 * but it avoids having to take callback_mutex in the
838 * fork path
839 */
840 goto again;
841 }
842 heap_free(&heap); 796 heap_free(&heap);
797
843 if (is_load_balanced) 798 if (is_load_balanced)
844 rebuild_sched_domains(); 799 rebuild_sched_domains();
845
846 return 0; 800 return 0;
847} 801}
848 802
@@ -854,11 +808,11 @@ static int update_cpumask(struct cpuset *cs, char *buf)
854 * Temporarilly set tasks mems_allowed to target nodes of migration, 808 * Temporarilly set tasks mems_allowed to target nodes of migration,
855 * so that the migration code can allocate pages on these nodes. 809 * so that the migration code can allocate pages on these nodes.
856 * 810 *
857 * Call holding manage_mutex, so our current->cpuset won't change 811 * Call holding cgroup_mutex, so current's cpuset won't change
858 * during this call, as manage_mutex holds off any attach_task() 812 * during this call, as manage_mutex holds off any cpuset_attach()
859 * calls. Therefore we don't need to take task_lock around the 813 * calls. Therefore we don't need to take task_lock around the
860 * call to guarantee_online_mems(), as we know no one is changing 814 * call to guarantee_online_mems(), as we know no one is changing
861 * our tasks cpuset. 815 * our task's cpuset.
862 * 816 *
863 * Hold callback_mutex around the two modifications of our tasks 817 * Hold callback_mutex around the two modifications of our tasks
864 * mems_allowed to synchronize with cpuset_mems_allowed(). 818 * mems_allowed to synchronize with cpuset_mems_allowed().
@@ -903,7 +857,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
903 * the cpuset is marked 'memory_migrate', migrate the tasks 857 * the cpuset is marked 'memory_migrate', migrate the tasks
904 * pages to the new memory. 858 * pages to the new memory.
905 * 859 *
906 * Call with manage_mutex held. May take callback_mutex during call. 860 * Call with cgroup_mutex held. May take callback_mutex during call.
907 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 861 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
908 * lock each such tasks mm->mmap_sem, scan its vma's and rebind 862 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
909 * their mempolicies to the cpusets new mems_allowed. 863 * their mempolicies to the cpusets new mems_allowed.
@@ -1016,7 +970,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
1016 * tasklist_lock. Forks can happen again now - the mpol_copy() 970 * tasklist_lock. Forks can happen again now - the mpol_copy()
1017 * cpuset_being_rebound check will catch such forks, and rebind 971 * cpuset_being_rebound check will catch such forks, and rebind
1018 * their vma mempolicies too. Because we still hold the global 972 * their vma mempolicies too. Because we still hold the global
1019 * cpuset manage_mutex, we know that no other rebind effort will 973 * cgroup_mutex, we know that no other rebind effort will
1020 * be contending for the global variable cpuset_being_rebound. 974 * be contending for the global variable cpuset_being_rebound.
1021 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 975 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
1022 * is idempotent. Also migrate pages in each mm to new nodes. 976 * is idempotent. Also migrate pages in each mm to new nodes.
@@ -1031,7 +985,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
1031 mmput(mm); 985 mmput(mm);
1032 } 986 }
1033 987
1034 /* We're done rebinding vma's to this cpusets new mems_allowed. */ 988 /* We're done rebinding vmas to this cpuset's new mems_allowed. */
1035 kfree(mmarray); 989 kfree(mmarray);
1036 cpuset_being_rebound = NULL; 990 cpuset_being_rebound = NULL;
1037 retval = 0; 991 retval = 0;
@@ -1045,7 +999,7 @@ int current_cpuset_is_being_rebound(void)
1045} 999}
1046 1000
1047/* 1001/*
1048 * Call with manage_mutex held. 1002 * Call with cgroup_mutex held.
1049 */ 1003 */
1050 1004
1051static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) 1005static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
@@ -1066,7 +1020,7 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
1066 * cs: the cpuset to update 1020 * cs: the cpuset to update
1067 * buf: the buffer where we read the 0 or 1 1021 * buf: the buffer where we read the 0 or 1
1068 * 1022 *
1069 * Call with manage_mutex held. 1023 * Call with cgroup_mutex held.
1070 */ 1024 */
1071 1025
1072static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) 1026static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
@@ -1200,6 +1154,7 @@ static int fmeter_getrate(struct fmeter *fmp)
1200 return val; 1154 return val;
1201} 1155}
1202 1156
1157/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1203static int cpuset_can_attach(struct cgroup_subsys *ss, 1158static int cpuset_can_attach(struct cgroup_subsys *ss,
1204 struct cgroup *cont, struct task_struct *tsk) 1159 struct cgroup *cont, struct task_struct *tsk)
1205{ 1160{
@@ -1547,7 +1502,8 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1547 * If this becomes a problem for some users who wish to 1502 * If this becomes a problem for some users who wish to
1548 * allow that scenario, then cpuset_post_clone() could be 1503 * allow that scenario, then cpuset_post_clone() could be
1549 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive 1504 * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
1550 * (and likewise for mems) to the new cgroup. 1505 * (and likewise for mems) to the new cgroup. Called with cgroup_mutex
1506 * held.
1551 */ 1507 */
1552static void cpuset_post_clone(struct cgroup_subsys *ss, 1508static void cpuset_post_clone(struct cgroup_subsys *ss,
1553 struct cgroup *cgroup) 1509 struct cgroup *cgroup)
@@ -1571,11 +1527,8 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
1571 1527
1572/* 1528/*
1573 * cpuset_create - create a cpuset 1529 * cpuset_create - create a cpuset
1574 * parent: cpuset that will be parent of the new cpuset. 1530 * ss: cpuset cgroup subsystem
1575 * name: name of the new cpuset. Will be strcpy'ed. 1531 * cont: control group that the new cpuset will be part of
1576 * mode: mode to set on new inode
1577 *
1578 * Must be called with the mutex on the parent inode held
1579 */ 1532 */
1580 1533
1581static struct cgroup_subsys_state *cpuset_create( 1534static struct cgroup_subsys_state *cpuset_create(
@@ -1687,53 +1640,140 @@ int __init cpuset_init(void)
1687 return 0; 1640 return 0;
1688} 1641}
1689 1642
1643/**
1644 * cpuset_do_move_task - move a given task to another cpuset
1645 * @tsk: pointer to task_struct the task to move
1646 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
1647 *
1648 * Called by cgroup_scan_tasks() for each task in a cgroup.
1649 * Return nonzero to stop the walk through the tasks.
1650 */
1651void cpuset_do_move_task(struct task_struct *tsk, struct cgroup_scanner *scan)
1652{
1653 struct cpuset_hotplug_scanner *chsp;
1654
1655 chsp = container_of(scan, struct cpuset_hotplug_scanner, scan);
1656 cgroup_attach_task(chsp->to, tsk);
1657}
1658
1659/**
1660 * move_member_tasks_to_cpuset - move tasks from one cpuset to another
1661 * @from: cpuset in which the tasks currently reside
1662 * @to: cpuset to which the tasks will be moved
1663 *
1664 * Called with cgroup_mutex held
1665 * callback_mutex must not be held, as cpuset_attach() will take it.
1666 *
1667 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
1668 * calling callback functions for each.
1669 */
1670static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1671{
1672 struct cpuset_hotplug_scanner scan;
1673
1674 scan.scan.cg = from->css.cgroup;
1675 scan.scan.test_task = NULL; /* select all tasks in cgroup */
1676 scan.scan.process_task = cpuset_do_move_task;
1677 scan.scan.heap = NULL;
1678 scan.to = to->css.cgroup;
1679
1680 if (cgroup_scan_tasks((struct cgroup_scanner *)&scan))
1681 printk(KERN_ERR "move_member_tasks_to_cpuset: "
1682 "cgroup_scan_tasks failed\n");
1683}
1684
1690/* 1685/*
1691 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs 1686 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
1692 * or memory nodes, we need to walk over the cpuset hierarchy, 1687 * or memory nodes, we need to walk over the cpuset hierarchy,
1693 * removing that CPU or node from all cpusets. If this removes the 1688 * removing that CPU or node from all cpusets. If this removes the
1694 * last CPU or node from a cpuset, then the guarantee_online_cpus() 1689 * last CPU or node from a cpuset, then move the tasks in the empty
1695 * or guarantee_online_mems() code will use that emptied cpusets 1690 * cpuset to its next-highest non-empty parent.
1696 * parent online CPUs or nodes. Cpusets that were already empty of
1697 * CPUs or nodes are left empty.
1698 * 1691 *
1699 * This routine is intentionally inefficient in a couple of regards. 1692 * Called with cgroup_mutex held
1700 * It will check all cpusets in a subtree even if the top cpuset of 1693 * callback_mutex must not be held, as cpuset_attach() will take it.
1701 * the subtree has no offline CPUs or nodes. It checks both CPUs and 1694 */
1702 * nodes, even though the caller could have been coded to know that 1695static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1703 * only one of CPUs or nodes needed to be checked on a given call. 1696{
1704 * This was done to minimize text size rather than cpu cycles. 1697 struct cpuset *parent;
1698
1699 /*
1700 * The cgroup's css_sets list is in use if there are tasks
1701 * in the cpuset; the list is empty if there are none;
1702 * the cs->css.refcnt seems always 0.
1703 */
1704 if (list_empty(&cs->css.cgroup->css_sets))
1705 return;
1706
1707 /*
1708 * Find its next-highest non-empty parent, (top cpuset
1709 * has online cpus, so can't be empty).
1710 */
1711 parent = cs->parent;
1712 while (cpus_empty(parent->cpus_allowed) ||
1713 nodes_empty(parent->mems_allowed))
1714 parent = parent->parent;
1715
1716 move_member_tasks_to_cpuset(cs, parent);
1717}
1718
1719/*
1720 * Walk the specified cpuset subtree and look for empty cpusets.
1721 * The tasks of such cpuset must be moved to a parent cpuset.
1722 *
1723 * Called with cgroup_mutex held. We take callback_mutex to modify
1724 * cpus_allowed and mems_allowed.
1705 * 1725 *
1706 * Call with both manage_mutex and callback_mutex held. 1726 * This walk processes the tree from top to bottom, completing one layer
1727 * before dropping down to the next. It always processes a node before
1728 * any of its children.
1707 * 1729 *
1708 * Recursive, on depth of cpuset subtree. 1730 * For now, since we lack memory hot unplug, we'll never see a cpuset
1731 * that has tasks along with an empty 'mems'. But if we did see such
1732 * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
1709 */ 1733 */
1710 1734static void scan_for_empty_cpusets(const struct cpuset *root)
1711static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
1712{ 1735{
1736 struct cpuset *cp; /* scans cpusets being updated */
1737 struct cpuset *child; /* scans child cpusets of cp */
1738 struct list_head queue;
1713 struct cgroup *cont; 1739 struct cgroup *cont;
1714 struct cpuset *c;
1715 1740
1716 /* Each of our child cpusets mems must be online */ 1741 INIT_LIST_HEAD(&queue);
1717 list_for_each_entry(cont, &cur->css.cgroup->children, sibling) { 1742
1718 c = cgroup_cs(cont); 1743 list_add_tail((struct list_head *)&root->stack_list, &queue);
1719 guarantee_online_cpus_mems_in_subtree(c); 1744
1720 if (!cpus_empty(c->cpus_allowed)) 1745 while (!list_empty(&queue)) {
1721 guarantee_online_cpus(c, &c->cpus_allowed); 1746 cp = container_of(queue.next, struct cpuset, stack_list);
1722 if (!nodes_empty(c->mems_allowed)) 1747 list_del(queue.next);
1723 guarantee_online_mems(c, &c->mems_allowed); 1748 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
1749 child = cgroup_cs(cont);
1750 list_add_tail(&child->stack_list, &queue);
1751 }
1752 cont = cp->css.cgroup;
1753
1754 /* Continue past cpusets with all cpus, mems online */
1755 if (cpus_subset(cp->cpus_allowed, cpu_online_map) &&
1756 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
1757 continue;
1758
1759 /* Remove offline cpus and mems from this cpuset. */
1760 mutex_lock(&callback_mutex);
1761 cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
1762 nodes_and(cp->mems_allowed, cp->mems_allowed,
1763 node_states[N_HIGH_MEMORY]);
1764 mutex_unlock(&callback_mutex);
1765
1766 /* Move tasks from the empty cpuset to a parent */
1767 if (cpus_empty(cp->cpus_allowed) ||
1768 nodes_empty(cp->mems_allowed))
1769 remove_tasks_in_empty_cpuset(cp);
1724 } 1770 }
1725} 1771}
1726 1772
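scan_for_empty_cpusets() replaces the old recursive walk with a breadth-first traversal: each cpuset carries an embedded list_head (stack_list) that is threaded onto a FIFO, so every cpuset is trimmed before any of its children are dequeued. A minimal sketch of the same pattern on an invented node type, assuming only the standard list helpers (demo_node and demo_visit() are illustrative, not kernel code):

#include <linux/kernel.h>
#include <linux/list.h>

struct demo_node {
	struct list_head stack_list;	/* queue linkage, as in struct cpuset */
	struct list_head children;	/* head of this node's child list */
	struct list_head sibling;	/* linkage in the parent's child list */
};

static void demo_walk(struct demo_node *root,
		      void (*demo_visit)(struct demo_node *))
{
	LIST_HEAD(queue);
	struct demo_node *n, *child;

	list_add_tail(&root->stack_list, &queue);
	while (!list_empty(&queue)) {
		n = container_of(queue.next, struct demo_node, stack_list);
		list_del(queue.next);
		/* queue the children first, then handle the parent itself */
		list_for_each_entry(child, &n->children, sibling)
			list_add_tail(&child->stack_list, &queue);
		demo_visit(n);
	}
}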
1727/* 1773/*
1728 * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track 1774 * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
1729 * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to 1775 * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to
1730 * track what's online after any CPU or memory node hotplug or unplug 1776 * track what's online after any CPU or memory node hotplug or unplug event.
1731 * event.
1732 *
1733 * To ensure that we don't remove a CPU or node from the top cpuset
1734 * that is currently in use by a child cpuset (which would violate
1735 * the rule that cpusets must be subsets of their parent), we first
1736 * call the recursive routine guarantee_online_cpus_mems_in_subtree().
1737 * 1777 *
1738 * Since there are two callers of this routine, one for CPU hotplug 1778 * Since there are two callers of this routine, one for CPU hotplug
1739 * events and one for memory node hotplug events, we could have coded 1779 * events and one for memory node hotplug events, we could have coded
@@ -1744,13 +1784,11 @@ static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
1744static void common_cpu_mem_hotplug_unplug(void) 1784static void common_cpu_mem_hotplug_unplug(void)
1745{ 1785{
1746 cgroup_lock(); 1786 cgroup_lock();
1747 mutex_lock(&callback_mutex);
1748 1787
1749 guarantee_online_cpus_mems_in_subtree(&top_cpuset);
1750 top_cpuset.cpus_allowed = cpu_online_map; 1788 top_cpuset.cpus_allowed = cpu_online_map;
1751 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 1789 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
1790 scan_for_empty_cpusets(&top_cpuset);
1752 1791
1753 mutex_unlock(&callback_mutex);
1754 cgroup_unlock(); 1792 cgroup_unlock();
1755} 1793}
1756 1794
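With callback_mutex now taken only inside scan_for_empty_cpusets(), the hotplug handler itself needs just cgroup_lock() around the top_cpuset update and the scan. The comment above mentions two callers, one per hotplug flavour; a hypothetical CPU-hotplug notifier driving it would be shaped roughly as below (demo_ name invented, <linux/notifier.h> assumed, and the actual registration in cpuset.c is outside this hunk):

static int demo_cpuset_cpu_callback(struct notifier_block *nb,
				    unsigned long action, void *hcpu)
{
	/* Re-sync top_cpuset and evacuate any cpusets the unplug emptied. */
	common_cpu_mem_hotplug_unplug();
	return NOTIFY_OK;
}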
@@ -1826,7 +1864,7 @@ cpumask_t cpuset_cpus_allowed(struct task_struct *tsk)
1826 1864
1827/** 1865/**
1828 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset. 1866 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
1829 * Must be called with callback_mutex held. 1867 * Must be called with callback_mutex held.
1830 **/ 1868 **/
1831cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk) 1869cpumask_t cpuset_cpus_allowed_locked(struct task_struct *tsk)
1832{ 1870{
@@ -2163,10 +2201,8 @@ void __cpuset_memory_pressure_bump(void)
2163 * - Used for /proc/<pid>/cpuset. 2201 * - Used for /proc/<pid>/cpuset.
2164 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it 2202 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
2165 * doesn't really matter if tsk->cpuset changes after we read it, 2203 * doesn't really matter if tsk->cpuset changes after we read it,
2166 * and we take manage_mutex, keeping attach_task() from changing it 2204 * and we take cgroup_mutex, keeping cpuset_attach() from changing it
2167 * anyway. No need to check that tsk->cpuset != NULL, thanks to 2205 * anyway.
2168 * the_top_cpuset_hack in cpuset_exit(), which sets an exiting tasks
2169 * cpuset to top_cpuset.
2170 */ 2206 */
2171static int proc_cpuset_show(struct seq_file *m, void *unused_v) 2207static int proc_cpuset_show(struct seq_file *m, void *unused_v)
2172{ 2208{
diff --git a/kernel/fork.c b/kernel/fork.c
index 3995297567a9..b2ef8e4fad70 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -40,6 +40,7 @@
40#include <linux/ptrace.h> 40#include <linux/ptrace.h>
41#include <linux/mount.h> 41#include <linux/mount.h>
42#include <linux/audit.h> 42#include <linux/audit.h>
43#include <linux/memcontrol.h>
43#include <linux/profile.h> 44#include <linux/profile.h>
44#include <linux/rmap.h> 45#include <linux/rmap.h>
45#include <linux/acct.h> 46#include <linux/acct.h>
@@ -340,7 +341,7 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
340 341
341#include <linux/init_task.h> 342#include <linux/init_task.h>
342 343
343static struct mm_struct * mm_init(struct mm_struct * mm) 344static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
344{ 345{
345 atomic_set(&mm->mm_users, 1); 346 atomic_set(&mm->mm_users, 1);
346 atomic_set(&mm->mm_count, 1); 347 atomic_set(&mm->mm_count, 1);
@@ -357,11 +358,14 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
357 mm->ioctx_list = NULL; 358 mm->ioctx_list = NULL;
358 mm->free_area_cache = TASK_UNMAPPED_BASE; 359 mm->free_area_cache = TASK_UNMAPPED_BASE;
359 mm->cached_hole_size = ~0UL; 360 mm->cached_hole_size = ~0UL;
361 mm_init_cgroup(mm, p);
360 362
361 if (likely(!mm_alloc_pgd(mm))) { 363 if (likely(!mm_alloc_pgd(mm))) {
362 mm->def_flags = 0; 364 mm->def_flags = 0;
363 return mm; 365 return mm;
364 } 366 }
367
368 mm_free_cgroup(mm);
365 free_mm(mm); 369 free_mm(mm);
366 return NULL; 370 return NULL;
367} 371}
@@ -376,7 +380,7 @@ struct mm_struct * mm_alloc(void)
376 mm = allocate_mm(); 380 mm = allocate_mm();
377 if (mm) { 381 if (mm) {
378 memset(mm, 0, sizeof(*mm)); 382 memset(mm, 0, sizeof(*mm));
379 mm = mm_init(mm); 383 mm = mm_init(mm, current);
380 } 384 }
381 return mm; 385 return mm;
382} 386}
@@ -390,6 +394,7 @@ void fastcall __mmdrop(struct mm_struct *mm)
390{ 394{
391 BUG_ON(mm == &init_mm); 395 BUG_ON(mm == &init_mm);
392 mm_free_pgd(mm); 396 mm_free_pgd(mm);
397 mm_free_cgroup(mm);
393 destroy_context(mm); 398 destroy_context(mm);
394 free_mm(mm); 399 free_mm(mm);
395} 400}
@@ -511,7 +516,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
511 mm->token_priority = 0; 516 mm->token_priority = 0;
512 mm->last_interval = 0; 517 mm->last_interval = 0;
513 518
514 if (!mm_init(mm)) 519 if (!mm_init(mm, tsk))
515 goto fail_nomem; 520 goto fail_nomem;
516 521
517 if (init_new_context(tsk, mm)) 522 if (init_new_context(tsk, mm))
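mm_init() now carries the owning task so mm_init_cgroup() can associate the new mm with that task's cgroup, and the pgd-allocation failure path unwinds with mm_free_cgroup() before free_mm(). Both callers follow the same shape; a condensed sketch of that pattern (demo_make_mm() is invented, allocate_mm() and mm_init() are the static helpers in fork.c):

static struct mm_struct *demo_make_mm(struct task_struct *owner)
{
	struct mm_struct *mm = allocate_mm();

	if (!mm)
		return NULL;
	memset(mm, 0, sizeof(*mm));
	/* returns NULL on failure, with the cgroup association already undone */
	return mm_init(mm, owner);
}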
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 9a26eec9eb04..06a0e2775651 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1361,8 +1361,8 @@ unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void)
1361 1361
1362static int __init crash_save_vmcoreinfo_init(void) 1362static int __init crash_save_vmcoreinfo_init(void)
1363{ 1363{
1364 vmcoreinfo_append_str("OSRELEASE=%s\n", init_uts_ns.name.release); 1364 VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
1365 vmcoreinfo_append_str("PAGESIZE=%ld\n", PAGE_SIZE); 1365 VMCOREINFO_PAGESIZE(PAGE_SIZE);
1366 1366
1367 VMCOREINFO_SYMBOL(init_uts_ns); 1367 VMCOREINFO_SYMBOL(init_uts_ns);
1368 VMCOREINFO_SYMBOL(node_online_map); 1368 VMCOREINFO_SYMBOL(node_online_map);
@@ -1376,15 +1376,15 @@ static int __init crash_save_vmcoreinfo_init(void)
1376#ifdef CONFIG_SPARSEMEM 1376#ifdef CONFIG_SPARSEMEM
1377 VMCOREINFO_SYMBOL(mem_section); 1377 VMCOREINFO_SYMBOL(mem_section);
1378 VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); 1378 VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
1379 VMCOREINFO_SIZE(mem_section); 1379 VMCOREINFO_STRUCT_SIZE(mem_section);
1380 VMCOREINFO_OFFSET(mem_section, section_mem_map); 1380 VMCOREINFO_OFFSET(mem_section, section_mem_map);
1381#endif 1381#endif
1382 VMCOREINFO_SIZE(page); 1382 VMCOREINFO_STRUCT_SIZE(page);
1383 VMCOREINFO_SIZE(pglist_data); 1383 VMCOREINFO_STRUCT_SIZE(pglist_data);
1384 VMCOREINFO_SIZE(zone); 1384 VMCOREINFO_STRUCT_SIZE(zone);
1385 VMCOREINFO_SIZE(free_area); 1385 VMCOREINFO_STRUCT_SIZE(free_area);
1386 VMCOREINFO_SIZE(list_head); 1386 VMCOREINFO_STRUCT_SIZE(list_head);
1387 VMCOREINFO_TYPEDEF_SIZE(nodemask_t); 1387 VMCOREINFO_SIZE(nodemask_t);
1388 VMCOREINFO_OFFSET(page, flags); 1388 VMCOREINFO_OFFSET(page, flags);
1389 VMCOREINFO_OFFSET(page, _count); 1389 VMCOREINFO_OFFSET(page, _count);
1390 VMCOREINFO_OFFSET(page, mapping); 1390 VMCOREINFO_OFFSET(page, mapping);
diff --git a/kernel/panic.c b/kernel/panic.c
index d9e90cfe3298..24af9f8bac99 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -161,7 +161,7 @@ const char *print_tainted(void)
161{ 161{
162 static char buf[20]; 162 static char buf[20];
163 if (tainted) { 163 if (tainted) {
164 snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c", 164 snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c",
165 tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', 165 tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G',
166 tainted & TAINT_FORCED_MODULE ? 'F' : ' ', 166 tainted & TAINT_FORCED_MODULE ? 'F' : ' ',
167 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', 167 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ',
@@ -169,7 +169,8 @@ const char *print_tainted(void)
169 tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', 169 tainted & TAINT_MACHINE_CHECK ? 'M' : ' ',
170 tainted & TAINT_BAD_PAGE ? 'B' : ' ', 170 tainted & TAINT_BAD_PAGE ? 'B' : ' ',
171 tainted & TAINT_USER ? 'U' : ' ', 171 tainted & TAINT_USER ? 'U' : ' ',
172 tainted & TAINT_DIE ? 'D' : ' '); 172 tainted & TAINT_DIE ? 'D' : ' ',
173 tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' ');
173 } 174 }
174 else 175 else
175 snprintf(buf, sizeof(buf), "Not tainted"); 176 snprintf(buf, sizeof(buf), "Not tainted");
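The taint string grows to nine flag characters with the new 'A' (TAINT_OVERRIDDEN_ACPI_TABLE) entry, and the existing 20-byte buffer still fits: "Tainted: " is 9 characters, plus 9 flags and a terminating NUL gives 19. A compile-time check along these lines, placed at the top of print_tainted(), would document that headroom (illustrative only, not part of the patch):

	/* 9 ("Tainted: ") + 9 flag characters + 1 NUL must fit in buf[] */
	BUILD_BUG_ON(sizeof("Tainted: ") - 1 + 9 + 1 > sizeof(buf));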
diff --git a/kernel/pid.c b/kernel/pid.c
index f815455431bf..3b30bccdfcdc 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -368,6 +368,7 @@ struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
368 } 368 }
369 return result; 369 return result;
370} 370}
371EXPORT_SYMBOL(pid_task);
371 372
372/* 373/*
373 * Must be called under rcu_read_lock() or with tasklist_lock read-held. 374 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index ef9b802738a5..79833170bb9c 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -74,8 +74,8 @@ config PM_TRACE_RTC
74 RTC across reboots, so that you can debug a machine that just hangs 74 RTC across reboots, so that you can debug a machine that just hangs
75 during suspend (or more commonly, during resume). 75 during suspend (or more commonly, during resume).
76 76
77 To use this debugging feature you should attempt to suspend the machine, 77 To use this debugging feature you should attempt to suspend the
78 then reboot it, then run 78 machine, reboot it and then run
79 79
80 dmesg -s 1000000 | grep 'hash matches' 80 dmesg -s 1000000 | grep 'hash matches'
81 81
@@ -123,7 +123,10 @@ config HIBERNATION
123 called "hibernation" in user interfaces. STD checkpoints the 123 called "hibernation" in user interfaces. STD checkpoints the
124 system and powers it off; and restores that checkpoint on reboot. 124 system and powers it off; and restores that checkpoint on reboot.
125 125
126 You can suspend your machine with 'echo disk > /sys/power/state'. 126 You can suspend your machine with 'echo disk > /sys/power/state'
127 after placing resume=/dev/swappartition on the kernel command line
128 in your bootloader's configuration file.
129
127 Alternatively, you can use the additional userland tools available 130 Alternatively, you can use the additional userland tools available
128 from <http://suspend.sf.net>. 131 from <http://suspend.sf.net>.
129 132
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
new file mode 100644
index 000000000000..16cbec2d5d60
--- /dev/null
+++ b/kernel/res_counter.c
@@ -0,0 +1,134 @@
1/*
2 * resource cgroups
3 *
4 * Copyright 2007 OpenVZ SWsoft Inc
5 *
6 * Author: Pavel Emelianov <xemul@openvz.org>
7 *
8 */
9
10#include <linux/types.h>
11#include <linux/parser.h>
12#include <linux/fs.h>
13#include <linux/res_counter.h>
14#include <linux/uaccess.h>
15
16void res_counter_init(struct res_counter *counter)
17{
18 spin_lock_init(&counter->lock);
19 counter->limit = (unsigned long long)LLONG_MAX;
20}
21
22int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
23{
24 if (counter->usage + val > counter->limit) {
25 counter->failcnt++;
26 return -ENOMEM;
27 }
28
29 counter->usage += val;
30 return 0;
31}
32
33int res_counter_charge(struct res_counter *counter, unsigned long val)
34{
35 int ret;
36 unsigned long flags;
37
38 spin_lock_irqsave(&counter->lock, flags);
39 ret = res_counter_charge_locked(counter, val);
40 spin_unlock_irqrestore(&counter->lock, flags);
41 return ret;
42}
43
44void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
45{
46 if (WARN_ON(counter->usage < val))
47 val = counter->usage;
48
49 counter->usage -= val;
50}
51
52void res_counter_uncharge(struct res_counter *counter, unsigned long val)
53{
54 unsigned long flags;
55
56 spin_lock_irqsave(&counter->lock, flags);
57 res_counter_uncharge_locked(counter, val);
58 spin_unlock_irqrestore(&counter->lock, flags);
59}
60
61
62static inline unsigned long long *
63res_counter_member(struct res_counter *counter, int member)
64{
65 switch (member) {
66 case RES_USAGE:
67 return &counter->usage;
68 case RES_LIMIT:
69 return &counter->limit;
70 case RES_FAILCNT:
71 return &counter->failcnt;
72 };
73
74 BUG();
75 return NULL;
76}
77
78ssize_t res_counter_read(struct res_counter *counter, int member,
79 const char __user *userbuf, size_t nbytes, loff_t *pos,
80 int (*read_strategy)(unsigned long long val, char *st_buf))
81{
82 unsigned long long *val;
83 char buf[64], *s;
84
85 s = buf;
86 val = res_counter_member(counter, member);
87 if (read_strategy)
88 s += read_strategy(*val, s);
89 else
90 s += sprintf(s, "%llu\n", *val);
91 return simple_read_from_buffer((void __user *)userbuf, nbytes,
92 pos, buf, s - buf);
93}
94
95ssize_t res_counter_write(struct res_counter *counter, int member,
96 const char __user *userbuf, size_t nbytes, loff_t *pos,
97 int (*write_strategy)(char *st_buf, unsigned long long *val))
98{
99 int ret;
100 char *buf, *end;
101 unsigned long flags;
102 unsigned long long tmp, *val;
103
104 buf = kmalloc(nbytes + 1, GFP_KERNEL);
105 ret = -ENOMEM;
106 if (buf == NULL)
107 goto out;
108
109 buf[nbytes] = '\0';
110 ret = -EFAULT;
111 if (copy_from_user(buf, userbuf, nbytes))
112 goto out_free;
113
114 ret = -EINVAL;
115
116 if (write_strategy) {
117 if (write_strategy(buf, &tmp)) {
118 goto out_free;
119 }
120 } else {
121 tmp = simple_strtoull(buf, &end, 10);
122 if (*end != '\0')
123 goto out_free;
124 }
125 spin_lock_irqsave(&counter->lock, flags);
126 val = res_counter_member(counter, member);
127 *val = tmp;
128 spin_unlock_irqrestore(&counter->lock, flags);
129 ret = nbytes;
130out_free:
131 kfree(buf);
132out:
133 return ret;
134}
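A res_counter is meant to be embedded in a controller's per-group state: res_counter_init() initializes the lock and sets the limit to its effectively-unlimited default (usage and failcnt start at zero in a zeroed allocation), res_counter_charge() fails with -ENOMEM and bumps failcnt once a charge would push usage past the limit, and res_counter_uncharge() gives the charge back. A hypothetical controller using the interface (demo_* names invented; only the res_counter_* calls come from the file above):

#include <linux/mm.h>
#include <linux/res_counter.h>

struct demo_group {
	struct res_counter res;		/* byte accounting for this group */
};

static void demo_group_init(struct demo_group *grp)
{
	res_counter_init(&grp->res);
}

static int demo_charge_page(struct demo_group *grp)
{
	/* -ENOMEM here is the cue to reclaim or fail the allocation */
	return res_counter_charge(&grp->res, PAGE_SIZE);
}

static void demo_uncharge_page(struct demo_group *grp)
{
	res_counter_uncharge(&grp->res, PAGE_SIZE);
}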
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 86daaa26d120..8c98d8147d88 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -67,6 +67,7 @@ extern int sysctl_overcommit_memory;
67extern int sysctl_overcommit_ratio; 67extern int sysctl_overcommit_ratio;
68extern int sysctl_panic_on_oom; 68extern int sysctl_panic_on_oom;
69extern int sysctl_oom_kill_allocating_task; 69extern int sysctl_oom_kill_allocating_task;
70extern int sysctl_oom_dump_tasks;
70extern int max_threads; 71extern int max_threads;
71extern int core_uses_pid; 72extern int core_uses_pid;
72extern int suid_dumpable; 73extern int suid_dumpable;
@@ -871,6 +872,14 @@ static struct ctl_table vm_table[] = {
871 .proc_handler = &proc_dointvec, 872 .proc_handler = &proc_dointvec,
872 }, 873 },
873 { 874 {
875 .ctl_name = CTL_UNNUMBERED,
876 .procname = "oom_dump_tasks",
877 .data = &sysctl_oom_dump_tasks,
878 .maxlen = sizeof(sysctl_oom_dump_tasks),
879 .mode = 0644,
880 .proc_handler = &proc_dointvec,
881 },
882 {
874 .ctl_name = VM_OVERCOMMIT_RATIO, 883 .ctl_name = VM_OVERCOMMIT_RATIO,
875 .procname = "overcommit_ratio", 884 .procname = "overcommit_ratio",
876 .data = &sysctl_overcommit_ratio, 885 .data = &sysctl_overcommit_ratio,