Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile               1
-rw-r--r--  kernel/cgroup.c             587
-rw-r--r--  kernel/cgroup_freezer.c      26
-rw-r--r--  kernel/cpuset.c             107
-rw-r--r--  kernel/cred.c                 2
-rw-r--r--  kernel/events/core.c         30
-rw-r--r--  kernel/fork.c               110
-rw-r--r--  kernel/irq/handle.c           6
-rw-r--r--  kernel/irq/irqdesc.c         14
-rw-r--r--  kernel/irq/manage.c          24
-rw-r--r--  kernel/irq/proc.c             1
-rw-r--r--  kernel/irq/spurious.c        31
-rw-r--r--  kernel/jump_label.c          18
-rw-r--r--  kernel/kthread.c              4
-rw-r--r--  kernel/lockdep.c              2
-rw-r--r--  kernel/ns_cgroup.c          118
-rw-r--r--  kernel/nsproxy.c             46
-rw-r--r--  kernel/pm_qos_params.c       70
-rw-r--r--  kernel/power/hibernate.c    220
-rw-r--r--  kernel/profile.c             16
-rw-r--r--  kernel/rcutree.c            208
-rw-r--r--  kernel/rcutree.h             30
-rw-r--r--  kernel/rcutree_plugin.h      33
-rw-r--r--  kernel/rcutree_trace.c       12
-rw-r--r--  kernel/sched.c              127
-rw-r--r--  kernel/sched_fair.c           5
-rw-r--r--  kernel/sched_rt.c            10
-rw-r--r--  kernel/sched_stats.h          4
-rw-r--r--  kernel/sysctl.c               6
-rw-r--r--  kernel/time/clockevents.c     5
-rw-r--r--  kernel/timer.c               15
-rw-r--r--  kernel/trace/ftrace.c        31
-rw-r--r--  kernel/trace/ring_buffer.c   10
-rw-r--r--  kernel/trace/trace.h         15
-rw-r--r--  kernel/trace/trace_events.c   7
-rw-r--r--  kernel/trace/trace_output.c  27
-rw-r--r--  kernel/utsname.c             39
-rw-r--r--  kernel/watchdog.c             9
38 files changed, 1204 insertions, 822 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index e9cf19155b46..2d64cfcc8b42 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -61,7 +61,6 @@ obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CGROUPS) += cgroup.o
 obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
-obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
 obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_USER_NS) += user_namespace.o
 obj-$(CONFIG_PID_NS) += pid_namespace.o
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 909a35510af5..2731d115d725 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -57,6 +57,7 @@
 #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
 #include <linux/eventfd.h>
 #include <linux/poll.h>
+#include <linux/flex_array.h> /* used in cgroup_attach_proc */
 
 #include <asm/atomic.h>
 
@@ -1735,6 +1736,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 }
 EXPORT_SYMBOL_GPL(cgroup_path);
 
+/*
+ * cgroup_task_migrate - move a task from one cgroup to another.
+ *
+ * 'guarantee' is set if the caller promises that a new css_set for the task
+ * will already exist. If not set, this function might sleep, and can fail with
+ * -ENOMEM. Otherwise, it can only fail with -ESRCH.
+ */
+static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
+			       struct task_struct *tsk, bool guarantee)
+{
+	struct css_set *oldcg;
+	struct css_set *newcg;
+
+	/*
+	 * get old css_set. we need to take task_lock and refcount it, because
+	 * an exiting task can change its css_set to init_css_set and drop its
+	 * old one without taking cgroup_mutex.
+	 */
+	task_lock(tsk);
+	oldcg = tsk->cgroups;
+	get_css_set(oldcg);
+	task_unlock(tsk);
+
+	/* locate or allocate a new css_set for this task. */
+	if (guarantee) {
+		/* we know the css_set we want already exists. */
+		struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+		read_lock(&css_set_lock);
+		newcg = find_existing_css_set(oldcg, cgrp, template);
+		BUG_ON(!newcg);
+		get_css_set(newcg);
+		read_unlock(&css_set_lock);
+	} else {
+		might_sleep();
+		/* find_css_set will give us newcg already referenced. */
+		newcg = find_css_set(oldcg, cgrp);
+		if (!newcg) {
+			put_css_set(oldcg);
+			return -ENOMEM;
+		}
+	}
+	put_css_set(oldcg);
+
+	/* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
+	task_lock(tsk);
+	if (tsk->flags & PF_EXITING) {
+		task_unlock(tsk);
+		put_css_set(newcg);
+		return -ESRCH;
+	}
+	rcu_assign_pointer(tsk->cgroups, newcg);
+	task_unlock(tsk);
+
+	/* Update the css_set linked lists if we're using them */
+	write_lock(&css_set_lock);
+	if (!list_empty(&tsk->cg_list))
+		list_move(&tsk->cg_list, &newcg->tasks);
+	write_unlock(&css_set_lock);
+
+	/*
+	 * We just gained a reference on oldcg by taking it from the task. As
+	 * trading it for newcg is protected by cgroup_mutex, we're safe to drop
+	 * it here; it will be freed under RCU.
+	 */
+	put_css_set(oldcg);
+
+	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+	return 0;
+}
+
 /**
  * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
  * @cgrp: the cgroup the task is attaching to
@@ -1745,11 +1816,9 @@ EXPORT_SYMBOL_GPL(cgroup_path);
  */
 int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 {
-	int retval = 0;
+	int retval;
 	struct cgroup_subsys *ss, *failed_ss = NULL;
 	struct cgroup *oldcgrp;
-	struct css_set *cg;
-	struct css_set *newcg;
 	struct cgroupfs_root *root = cgrp->root;
 
 	/* Nothing to do if the task is already in that cgroup */
@@ -1759,7 +1828,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 
 	for_each_subsys(root, ss) {
 		if (ss->can_attach) {
-			retval = ss->can_attach(ss, cgrp, tsk, false);
+			retval = ss->can_attach(ss, cgrp, tsk);
 			if (retval) {
 				/*
 				 * Remember on which subsystem the can_attach()
@@ -1771,46 +1840,29 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 				goto out;
 			}
 		}
+		if (ss->can_attach_task) {
+			retval = ss->can_attach_task(cgrp, tsk);
+			if (retval) {
+				failed_ss = ss;
+				goto out;
+			}
+		}
 	}
 
-	task_lock(tsk);
-	cg = tsk->cgroups;
-	get_css_set(cg);
-	task_unlock(tsk);
-	/*
-	 * Locate or allocate a new css_set for this task,
-	 * based on its final set of cgroups
-	 */
-	newcg = find_css_set(cg, cgrp);
-	put_css_set(cg);
-	if (!newcg) {
-		retval = -ENOMEM;
-		goto out;
-	}
-
-	task_lock(tsk);
-	if (tsk->flags & PF_EXITING) {
-		task_unlock(tsk);
-		put_css_set(newcg);
-		retval = -ESRCH;
+	retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
+	if (retval)
 		goto out;
-	}
-	rcu_assign_pointer(tsk->cgroups, newcg);
-	task_unlock(tsk);
-
-	/* Update the css_set linked lists if we're using them */
-	write_lock(&css_set_lock);
-	if (!list_empty(&tsk->cg_list))
-		list_move(&tsk->cg_list, &newcg->tasks);
-	write_unlock(&css_set_lock);
 
 	for_each_subsys(root, ss) {
+		if (ss->pre_attach)
+			ss->pre_attach(cgrp);
+		if (ss->attach_task)
+			ss->attach_task(cgrp, tsk);
 		if (ss->attach)
-			ss->attach(ss, cgrp, oldcgrp, tsk, false);
+			ss->attach(ss, cgrp, oldcgrp, tsk);
 	}
-	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+
 	synchronize_rcu();
-	put_css_set(cg);
 
 	/*
 	 * wake up rmdir() waiter. the rmdir should fail since the cgroup
@@ -1829,7 +1881,7 @@ out:
 				 */
 				break;
 			if (ss->cancel_attach)
-				ss->cancel_attach(ss, cgrp, tsk, false);
+				ss->cancel_attach(ss, cgrp, tsk);
 		}
 	}
 	return retval;
@@ -1860,49 +1912,370 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
 
 /*
- * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
- * held. May take task_lock of task
+ * cgroup_attach_proc works in two stages, the first of which prefetches all
+ * new css_sets needed (to make sure we have enough memory before committing
+ * to the move) and stores them in a list of entries of the following type.
+ * TODO: possible optimization: use css_set->rcu_head for chaining instead
+ */
+struct cg_list_entry {
+	struct css_set *cg;
+	struct list_head links;
+};
+
+static bool css_set_check_fetched(struct cgroup *cgrp,
+				  struct task_struct *tsk, struct css_set *cg,
+				  struct list_head *newcg_list)
+{
+	struct css_set *newcg;
+	struct cg_list_entry *cg_entry;
+	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+
+	read_lock(&css_set_lock);
+	newcg = find_existing_css_set(cg, cgrp, template);
+	if (newcg)
+		get_css_set(newcg);
+	read_unlock(&css_set_lock);
+
+	/* doesn't exist at all? */
+	if (!newcg)
+		return false;
+	/* see if it's already in the list */
+	list_for_each_entry(cg_entry, newcg_list, links) {
+		if (cg_entry->cg == newcg) {
+			put_css_set(newcg);
+			return true;
+		}
+	}
+
+	/* not found */
+	put_css_set(newcg);
+	return false;
+}
+
+/*
+ * Find the new css_set and store it in the list in preparation for moving the
+ * given task to the given cgroup. Returns 0 or -ENOMEM.
+ */
+static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
+			    struct list_head *newcg_list)
+{
+	struct css_set *newcg;
+	struct cg_list_entry *cg_entry;
+
+	/* ensure a new css_set will exist for this thread */
+	newcg = find_css_set(cg, cgrp);
+	if (!newcg)
+		return -ENOMEM;
+	/* add it to the list */
+	cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
+	if (!cg_entry) {
+		put_css_set(newcg);
+		return -ENOMEM;
+	}
+	cg_entry->cg = newcg;
+	list_add(&cg_entry->links, newcg_list);
+	return 0;
+}
+
+/**
+ * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
+ * @cgrp: the cgroup to attach to
+ * @leader: the threadgroup leader task_struct of the group to be attached
+ *
+ * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will
+ * take task_lock of each thread in leader's threadgroup individually in turn.
+ */
+int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
+{
+	int retval, i, group_size;
+	struct cgroup_subsys *ss, *failed_ss = NULL;
+	bool cancel_failed_ss = false;
+	/* guaranteed to be initialized later, but the compiler needs this */
+	struct cgroup *oldcgrp = NULL;
+	struct css_set *oldcg;
+	struct cgroupfs_root *root = cgrp->root;
+	/* threadgroup list cursor and array */
+	struct task_struct *tsk;
+	struct flex_array *group;
+	/*
+	 * we need to make sure we have css_sets for all the tasks we're
+	 * going to move -before- we actually start moving them, so that in
+	 * case we get an ENOMEM we can bail out before making any changes.
+	 */
+	struct list_head newcg_list;
+	struct cg_list_entry *cg_entry, *temp_nobe;
+
+	/*
+	 * step 0: in order to do expensive, possibly blocking operations for
+	 * every thread, we cannot iterate the thread group list, since it needs
+	 * rcu or tasklist locked. instead, build an array of all threads in the
+	 * group - threadgroup_fork_lock prevents new threads from appearing,
+	 * and if threads exit, this will just be an over-estimate.
+	 */
+	group_size = get_nr_threads(leader);
+	/* flex_array supports very large thread-groups better than kmalloc. */
+	group = flex_array_alloc(sizeof(struct task_struct *), group_size,
+				 GFP_KERNEL);
+	if (!group)
+		return -ENOMEM;
+	/* pre-allocate to guarantee space while iterating in rcu read-side. */
+	retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
+	if (retval)
+		goto out_free_group_list;
+
+	/* prevent changes to the threadgroup list while we take a snapshot. */
+	rcu_read_lock();
+	if (!thread_group_leader(leader)) {
+		/*
+		 * a race with de_thread from another thread's exec() may strip
+		 * us of our leadership, making while_each_thread unsafe to use
+		 * on this task. if this happens, there is no choice but to
+		 * throw this task away and try again (from cgroup_procs_write);
+		 * this is "double-double-toil-and-trouble-check locking".
+		 */
+		rcu_read_unlock();
+		retval = -EAGAIN;
+		goto out_free_group_list;
+	}
+	/* take a reference on each task in the group to go in the array. */
+	tsk = leader;
+	i = 0;
+	do {
+		/* as per above, nr_threads may decrease, but not increase. */
+		BUG_ON(i >= group_size);
+		get_task_struct(tsk);
+		/*
+		 * saying GFP_ATOMIC has no effect here because we did prealloc
+		 * earlier, but it's good form to communicate our expectations.
+		 */
+		retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
+		BUG_ON(retval != 0);
+		i++;
+	} while_each_thread(leader, tsk);
+	/* remember the number of threads in the array for later. */
+	group_size = i;
+	rcu_read_unlock();
+
+	/*
+	 * step 1: check that we can legitimately attach to the cgroup.
+	 */
+	for_each_subsys(root, ss) {
+		if (ss->can_attach) {
+			retval = ss->can_attach(ss, cgrp, leader);
+			if (retval) {
+				failed_ss = ss;
+				goto out_cancel_attach;
+			}
+		}
+		/* a callback to be run on every thread in the threadgroup. */
+		if (ss->can_attach_task) {
+			/* run on each task in the threadgroup. */
+			for (i = 0; i < group_size; i++) {
+				tsk = flex_array_get_ptr(group, i);
+				retval = ss->can_attach_task(cgrp, tsk);
+				if (retval) {
+					failed_ss = ss;
+					cancel_failed_ss = true;
+					goto out_cancel_attach;
+				}
+			}
+		}
+	}
+
+	/*
+	 * step 2: make sure css_sets exist for all threads to be migrated.
+	 * we use find_css_set, which allocates a new one if necessary.
+	 */
+	INIT_LIST_HEAD(&newcg_list);
+	for (i = 0; i < group_size; i++) {
+		tsk = flex_array_get_ptr(group, i);
+		/* nothing to do if this task is already in the cgroup */
+		oldcgrp = task_cgroup_from_root(tsk, root);
+		if (cgrp == oldcgrp)
+			continue;
+		/* get old css_set pointer */
+		task_lock(tsk);
+		if (tsk->flags & PF_EXITING) {
+			/* ignore this task if it's going away */
+			task_unlock(tsk);
+			continue;
+		}
+		oldcg = tsk->cgroups;
+		get_css_set(oldcg);
+		task_unlock(tsk);
+		/* see if the new one for us is already in the list? */
+		if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
+			/* was already there, nothing to do. */
+			put_css_set(oldcg);
+		} else {
+			/* we don't already have it. get new one. */
+			retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
+			put_css_set(oldcg);
+			if (retval)
+				goto out_list_teardown;
+		}
+	}
+
+	/*
+	 * step 3: now that we're guaranteed success wrt the css_sets, proceed
+	 * to move all tasks to the new cgroup, calling ss->attach_task for each
+	 * one along the way. there are no failure cases after here, so this is
+	 * the commit point.
+	 */
+	for_each_subsys(root, ss) {
+		if (ss->pre_attach)
+			ss->pre_attach(cgrp);
+	}
+	for (i = 0; i < group_size; i++) {
+		tsk = flex_array_get_ptr(group, i);
+		/* leave current thread as it is if it's already there */
+		oldcgrp = task_cgroup_from_root(tsk, root);
+		if (cgrp == oldcgrp)
+			continue;
+		/* attach each task to each subsystem */
+		for_each_subsys(root, ss) {
+			if (ss->attach_task)
+				ss->attach_task(cgrp, tsk);
+		}
+		/* if the thread is PF_EXITING, it can just get skipped. */
+		retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
+		BUG_ON(retval != 0 && retval != -ESRCH);
+	}
+	/* nothing is sensitive to fork() after this point. */
+
+	/*
+	 * step 4: do expensive, non-thread-specific subsystem callbacks.
+	 * TODO: if ever a subsystem needs to know the oldcgrp for each task
+	 * being moved, this call will need to be reworked to communicate that.
+	 */
+	for_each_subsys(root, ss) {
+		if (ss->attach)
+			ss->attach(ss, cgrp, oldcgrp, leader);
+	}
+
+	/*
+	 * step 5: success! and cleanup
+	 */
+	synchronize_rcu();
+	cgroup_wakeup_rmdir_waiter(cgrp);
+	retval = 0;
+out_list_teardown:
+	/* clean up the list of prefetched css_sets. */
+	list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
+		list_del(&cg_entry->links);
+		put_css_set(cg_entry->cg);
+		kfree(cg_entry);
+	}
+out_cancel_attach:
+	/* same deal as in cgroup_attach_task */
+	if (retval) {
+		for_each_subsys(root, ss) {
+			if (ss == failed_ss) {
+				if (cancel_failed_ss && ss->cancel_attach)
+					ss->cancel_attach(ss, cgrp, leader);
+				break;
+			}
+			if (ss->cancel_attach)
+				ss->cancel_attach(ss, cgrp, leader);
+		}
+	}
+	/* clean up the array of referenced threads in the group. */
+	for (i = 0; i < group_size; i++) {
+		tsk = flex_array_get_ptr(group, i);
+		put_task_struct(tsk);
+	}
+out_free_group_list:
+	flex_array_free(group);
+	return retval;
+}
+
+/*
+ * Find the task_struct of the task to attach by vpid and pass it along to the
+ * function to attach either it or all tasks in its threadgroup. Will take
+ * cgroup_mutex; may take task_lock of task.
  */
-static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
+static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
 {
 	struct task_struct *tsk;
 	const struct cred *cred = current_cred(), *tcred;
 	int ret;
 
+	if (!cgroup_lock_live_group(cgrp))
+		return -ENODEV;
+
 	if (pid) {
 		rcu_read_lock();
 		tsk = find_task_by_vpid(pid);
-		if (!tsk || tsk->flags & PF_EXITING) {
+		if (!tsk) {
 			rcu_read_unlock();
+			cgroup_unlock();
+			return -ESRCH;
+		}
+		if (threadgroup) {
+			/*
+			 * RCU protects this access, since tsk was found in the
+			 * tid map. a race with de_thread may cause group_leader
+			 * to stop being the leader, but cgroup_attach_proc will
+			 * detect it later.
+			 */
+			tsk = tsk->group_leader;
+		} else if (tsk->flags & PF_EXITING) {
+			/* optimization for the single-task-only case */
+			rcu_read_unlock();
+			cgroup_unlock();
 			return -ESRCH;
 		}
 
+		/*
+		 * even if we're attaching all tasks in the thread group, we
+		 * only need to check permissions on one of them.
+		 */
 		tcred = __task_cred(tsk);
 		if (cred->euid &&
 		    cred->euid != tcred->uid &&
 		    cred->euid != tcred->suid) {
 			rcu_read_unlock();
+			cgroup_unlock();
 			return -EACCES;
 		}
 		get_task_struct(tsk);
 		rcu_read_unlock();
 	} else {
-		tsk = current;
+		if (threadgroup)
+			tsk = current->group_leader;
+		else
+			tsk = current;
 		get_task_struct(tsk);
 	}
 
-	ret = cgroup_attach_task(cgrp, tsk);
+	if (threadgroup) {
+		threadgroup_fork_write_lock(tsk);
+		ret = cgroup_attach_proc(cgrp, tsk);
+		threadgroup_fork_write_unlock(tsk);
+	} else {
+		ret = cgroup_attach_task(cgrp, tsk);
+	}
 	put_task_struct(tsk);
+	cgroup_unlock();
 	return ret;
 }
 
 static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
 {
+	return attach_task_by_pid(cgrp, pid, false);
+}
+
+static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
+{
 	int ret;
-	if (!cgroup_lock_live_group(cgrp))
-		return -ENODEV;
-	ret = attach_task_by_pid(cgrp, pid);
-	cgroup_unlock();
+	do {
+		/*
+		 * attach_proc fails with -EAGAIN if threadgroup leadership
+		 * changes in the middle of the operation, in which case we need
+		 * to find the task_struct for the new leader and start over.
+		 */
+		ret = attach_task_by_pid(cgrp, tgid, true);
+	} while (ret == -EAGAIN);
 	return ret;
 }
 
@@ -3259,9 +3632,9 @@ static struct cftype files[] = {
 	{
 		.name = CGROUP_FILE_GENERIC_PREFIX "procs",
 		.open = cgroup_procs_open,
-		/* .write_u64 = cgroup_procs_write, TODO */
+		.write_u64 = cgroup_procs_write,
 		.release = cgroup_pidlist_release,
-		.mode = S_IRUGO,
+		.mode = S_IRUGO | S_IWUSR,
 	},
 	{
 		.name = "notify_on_release",
@@ -4257,122 +4630,6 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 }
 
 /**
- * cgroup_clone - clone the cgroup the given subsystem is attached to
- * @tsk: the task to be moved
- * @subsys: the given subsystem
- * @nodename: the name for the new cgroup
- *
- * Duplicate the current cgroup in the hierarchy that the given
- * subsystem is attached to, and move this task into the new
- * child.
- */
-int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
-		 char *nodename)
-{
-	struct dentry *dentry;
-	int ret = 0;
-	struct cgroup *parent, *child;
-	struct inode *inode;
-	struct css_set *cg;
-	struct cgroupfs_root *root;
-	struct cgroup_subsys *ss;
-
-	/* We shouldn't be called by an unregistered subsystem */
-	BUG_ON(!subsys->active);
-
-	/* First figure out what hierarchy and cgroup we're dealing
-	 * with, and pin them so we can drop cgroup_mutex */
-	mutex_lock(&cgroup_mutex);
- again:
-	root = subsys->root;
-	if (root == &rootnode) {
-		mutex_unlock(&cgroup_mutex);
-		return 0;
-	}
-
-	/* Pin the hierarchy */
-	if (!atomic_inc_not_zero(&root->sb->s_active)) {
-		/* We race with the final deactivate_super() */
-		mutex_unlock(&cgroup_mutex);
-		return 0;
-	}
-
-	/* Keep the cgroup alive */
-	task_lock(tsk);
-	parent = task_cgroup(tsk, subsys->subsys_id);
-	cg = tsk->cgroups;
-	get_css_set(cg);
-	task_unlock(tsk);
-
-	mutex_unlock(&cgroup_mutex);
-
-	/* Now do the VFS work to create a cgroup */
-	inode = parent->dentry->d_inode;
-
-	/* Hold the parent directory mutex across this operation to
-	 * stop anyone else deleting the new cgroup */
-	mutex_lock(&inode->i_mutex);
-	dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
-	if (IS_ERR(dentry)) {
-		printk(KERN_INFO
-		       "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename,
-		       PTR_ERR(dentry));
-		ret = PTR_ERR(dentry);
-		goto out_release;
-	}
-
-	/* Create the cgroup directory, which also creates the cgroup */
-	ret = vfs_mkdir(inode, dentry, 0755);
-	child = __d_cgrp(dentry);
-	dput(dentry);
-	if (ret) {
-		printk(KERN_INFO
-		       "Failed to create cgroup %s: %d\n", nodename,
-		       ret);
-		goto out_release;
-	}
-
-	/* The cgroup now exists. Retake cgroup_mutex and check
-	 * that we're still in the same state that we thought we
-	 * were. */
-	mutex_lock(&cgroup_mutex);
-	if ((root != subsys->root) ||
-	    (parent != task_cgroup(tsk, subsys->subsys_id))) {
-		/* Aargh, we raced ... */
-		mutex_unlock(&inode->i_mutex);
-		put_css_set(cg);
-
-		deactivate_super(root->sb);
-		/* The cgroup is still accessible in the VFS, but
-		 * we're not going to try to rmdir() it at this
-		 * point. */
-		printk(KERN_INFO
-		       "Race in cgroup_clone() - leaking cgroup %s\n",
-		       nodename);
-		goto again;
-	}
-
-	/* do any required auto-setup */
-	for_each_subsys(root, ss) {
-		if (ss->post_clone)
-			ss->post_clone(ss, child);
-	}
-
-	/* All seems fine. Finish by moving the task into the new cgroup */
-	ret = cgroup_attach_task(child, tsk);
-	mutex_unlock(&cgroup_mutex);
-
- out_release:
-	mutex_unlock(&inode->i_mutex);
-
-	mutex_lock(&cgroup_mutex);
-	put_css_set(cg);
-	mutex_unlock(&cgroup_mutex);
-	deactivate_super(root->sb);
-	return ret;
-}
-
-/**
  * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
  * @cgrp: the cgroup in question
  * @task: the task in question
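
For orientation, the cgroup.c changes above split attachment into whole-group and per-thread hooks that run in a fixed order: can_attach (once), can_attach_task (per thread), then pre_attach (once), attach_task (per thread), and attach (once). A minimal sketch of a hypothetical controller wiring up the new hooks; the "demo" names are illustrative and not part of this patch, and registration details are omitted:

#include <linux/cgroup.h>
#include <linux/sched.h>

/* Hypothetical controller: veto or observe per-thread moves. */
static int demo_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
	/* May sleep. Failing here aborts the migration before any thread
	 * has moved; cancel_attach() then runs for subsystems that had
	 * already agreed to the attach. */
	return 0;
}

static void demo_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
	/* Runs for every thread during the commit phase; must not fail. */
}

struct cgroup_subsys demo_subsys = {
	.name		 = "demo",
	.can_attach_task = demo_can_attach_task,
	.pre_attach	 = NULL,
	.attach_task	 = demo_attach_task,
};
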
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e7bebb7c6c38..e691818d7e45 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -160,7 +160,7 @@ static void freezer_destroy(struct cgroup_subsys *ss,
  */
 static int freezer_can_attach(struct cgroup_subsys *ss,
 			      struct cgroup *new_cgroup,
-			      struct task_struct *task, bool threadgroup)
+			      struct task_struct *task)
 {
 	struct freezer *freezer;
 
@@ -172,26 +172,17 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
 	if (freezer->state != CGROUP_THAWED)
 		return -EBUSY;
 
+	return 0;
+}
+
+static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+{
 	rcu_read_lock();
-	if (__cgroup_freezing_or_frozen(task)) {
+	if (__cgroup_freezing_or_frozen(tsk)) {
 		rcu_read_unlock();
 		return -EBUSY;
 	}
 	rcu_read_unlock();
-
-	if (threadgroup) {
-		struct task_struct *c;
-
-		rcu_read_lock();
-		list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
-			if (__cgroup_freezing_or_frozen(c)) {
-				rcu_read_unlock();
-				return -EBUSY;
-			}
-		}
-		rcu_read_unlock();
-	}
-
 	return 0;
 }
 
@@ -390,6 +381,9 @@ struct cgroup_subsys freezer_subsys = {
 	.populate	= freezer_populate,
 	.subsys_id	= freezer_subsys_id,
 	.can_attach	= freezer_can_attach,
+	.can_attach_task = freezer_can_attach_task,
+	.pre_attach	= NULL,
+	.attach_task	= NULL,
 	.attach		= NULL,
 	.fork		= freezer_fork,
 	.exit		= NULL,
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 2bb8c2e98fff..9c9b7545c810 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1367,14 +1367,10 @@ static int fmeter_getrate(struct fmeter *fmp)
 	return val;
 }
 
-/* Protected by cgroup_lock */
-static cpumask_var_t cpus_attach;
-
 /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
 static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
-			     struct task_struct *tsk, bool threadgroup)
+			     struct task_struct *tsk)
 {
-	int ret;
 	struct cpuset *cs = cgroup_cs(cont);
 
 	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
@@ -1391,29 +1387,42 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
 	if (tsk->flags & PF_THREAD_BOUND)
 		return -EINVAL;
 
-	ret = security_task_setscheduler(tsk);
-	if (ret)
-		return ret;
-	if (threadgroup) {
-		struct task_struct *c;
-
-		rcu_read_lock();
-		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
-			ret = security_task_setscheduler(c);
-			if (ret) {
-				rcu_read_unlock();
-				return ret;
-			}
-		}
-		rcu_read_unlock();
-	}
 	return 0;
 }
 
-static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
-			       struct cpuset *cs)
+static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)
+{
+	return security_task_setscheduler(task);
+}
+
+/*
+ * Protected by cgroup_lock. The nodemasks must be stored globally because
+ * dynamically allocating them is not allowed in pre_attach, and they must
+ * persist among pre_attach, attach_task, and attach.
+ */
+static cpumask_var_t cpus_attach;
+static nodemask_t cpuset_attach_nodemask_from;
+static nodemask_t cpuset_attach_nodemask_to;
+
+/* Set-up work for before attaching each task. */
+static void cpuset_pre_attach(struct cgroup *cont)
+{
+	struct cpuset *cs = cgroup_cs(cont);
+
+	if (cs == &top_cpuset)
+		cpumask_copy(cpus_attach, cpu_possible_mask);
+	else
+		guarantee_online_cpus(cs, cpus_attach);
+
+	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+}
+
+/* Per-thread attachment work. */
+static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk)
 {
 	int err;
+	struct cpuset *cs = cgroup_cs(cont);
+
 	/*
 	 * can_attach beforehand should guarantee that this doesn't fail.
 	 * TODO: have a better way to handle failure here
@@ -1421,45 +1430,29 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
 	err = set_cpus_allowed_ptr(tsk, cpus_attach);
 	WARN_ON_ONCE(err);
 
-	cpuset_change_task_nodemask(tsk, to);
+	cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to);
 	cpuset_update_task_spread_flag(cs, tsk);
-
 }
 
 static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
-			  struct cgroup *oldcont, struct task_struct *tsk,
-			  bool threadgroup)
+			  struct cgroup *oldcont, struct task_struct *tsk)
 {
 	struct mm_struct *mm;
 	struct cpuset *cs = cgroup_cs(cont);
 	struct cpuset *oldcs = cgroup_cs(oldcont);
-	static nodemask_t to;		/* protected by cgroup_mutex */
 
-	if (cs == &top_cpuset) {
-		cpumask_copy(cpus_attach, cpu_possible_mask);
-	} else {
-		guarantee_online_cpus(cs, cpus_attach);
-	}
-	guarantee_online_mems(cs, &to);
-
-	/* do per-task migration stuff possibly for each in the threadgroup */
-	cpuset_attach_task(tsk, &to, cs);
-	if (threadgroup) {
-		struct task_struct *c;
-		rcu_read_lock();
-		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
-			cpuset_attach_task(c, &to, cs);
-		}
-		rcu_read_unlock();
-	}
-
-	/* change mm; only needs to be done once even if threadgroup */
-	to = cs->mems_allowed;
+	/*
+	 * Change mm, possibly for multiple threads in a threadgroup. This is
+	 * expensive and may sleep.
+	 */
+	cpuset_attach_nodemask_from = oldcs->mems_allowed;
+	cpuset_attach_nodemask_to = cs->mems_allowed;
 	mm = get_task_mm(tsk);
 	if (mm) {
-		mpol_rebind_mm(mm, &to);
+		mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
 		if (is_memory_migrate(cs))
-			cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to);
+			cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from,
+					  &cpuset_attach_nodemask_to);
 		mmput(mm);
 	}
 }
@@ -1809,10 +1802,9 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
 }
 
 /*
- * post_clone() is called at the end of cgroup_clone().
- * 'cgroup' was just created automatically as a result of
- * a cgroup_clone(), and the current task is about to
- * be moved into 'cgroup'.
+ * post_clone() is called during cgroup_create() when the
+ * clone_children mount argument was specified. The cgroup
+ * can not yet have any tasks.
  *
  * Currently we refuse to set up the cgroup - thereby
  * refusing the task to be entered, and as a result refusing
@@ -1911,6 +1903,9 @@ struct cgroup_subsys cpuset_subsys = {
 	.create = cpuset_create,
 	.destroy = cpuset_destroy,
 	.can_attach = cpuset_can_attach,
+	.can_attach_task = cpuset_can_attach_task,
+	.pre_attach = cpuset_pre_attach,
+	.attach_task = cpuset_attach_task,
 	.attach = cpuset_attach,
 	.populate = cpuset_populate,
 	.post_clone = cpuset_post_clone,
@@ -2195,7 +2190,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
 	rcu_read_lock();
 	cs = task_cs(tsk);
 	if (cs)
-		cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
+		do_set_cpus_allowed(tsk, cs->cpus_allowed);
 	rcu_read_unlock();
 
 	/*
@@ -2222,7 +2217,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
 	 * Like above we can temporary set any mask and rely on
 	 * set_cpus_allowed_ptr() as synchronization point.
 	 */
-	cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
+	do_set_cpus_allowed(tsk, cpu_possible_mask);
 	cpu = cpumask_any(cpu_active_mask);
 	}
 
diff --git a/kernel/cred.c b/kernel/cred.c
index e12c8af793f8..174fa84eca30 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -1,4 +1,4 @@
-/* Task credentials management - see Documentation/credentials.txt
+/* Task credentials management - see Documentation/security/credentials.txt
  *
  * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c09767f7db3e..9efe7108ccaf 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5028,6 +5028,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
 	else
 		perf_event_output(event, nmi, data, regs);
 
+	if (event->fasync && event->pending_kill) {
+		if (nmi) {
+			event->pending_wakeup = 1;
+			irq_work_queue(&event->pending);
+		} else
+			perf_event_wakeup(event);
+	}
+
 	return ret;
 }
 
@@ -7394,26 +7402,12 @@ static int __perf_cgroup_move(void *info)
 	return 0;
 }
 
-static void perf_cgroup_move(struct task_struct *task)
+static void
+perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task)
 {
 	task_function_call(task, __perf_cgroup_move, task);
 }
 
-static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-			       struct cgroup *old_cgrp, struct task_struct *task,
-			       bool threadgroup)
-{
-	perf_cgroup_move(task);
-	if (threadgroup) {
-		struct task_struct *c;
-		rcu_read_lock();
-		list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
-			perf_cgroup_move(c);
-		}
-		rcu_read_unlock();
-	}
-}
-
 static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
 			     struct cgroup *old_cgrp, struct task_struct *task)
 {
@@ -7425,7 +7419,7 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
 	if (!(task->flags & PF_EXITING))
 		return;
 
-	perf_cgroup_move(task);
+	perf_cgroup_attach_task(cgrp, task);
 }
 
 struct cgroup_subsys perf_subsys = {
@@ -7434,6 +7428,6 @@ struct cgroup_subsys perf_subsys = {
 	.create		= perf_cgroup_create,
 	.destroy	= perf_cgroup_destroy,
 	.exit		= perf_cgroup_exit,
-	.attach		= perf_cgroup_attach,
+	.attach_task	= perf_cgroup_attach_task,
 };
 #endif /* CONFIG_CGROUP_PERF */
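
The __perf_event_overflow() hunk above makes counter overflow deliver a signal to any fasync subscriber on the perf fd. A userspace sketch of how that is typically consumed, assuming a descriptor already obtained from perf_event_open(2) (setup and error handling omitted):

#include <fcntl.h>
#include <unistd.h>

/* perf_fd: a file descriptor from perf_event_open(2) (not shown here) */
static void subscribe_sigio(int perf_fd)
{
	fcntl(perf_fd, F_SETOWN, getpid());	/* route SIGIO to this process */
	fcntl(perf_fd, F_SETFL,
	      fcntl(perf_fd, F_GETFL) | O_ASYNC);	/* enable fasync delivery */
}
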
diff --git a/kernel/fork.c b/kernel/fork.c
index 8e7e135d0817..0276c30401a0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -59,7 +59,6 @@
 #include <linux/taskstats_kern.h>
 #include <linux/random.h>
 #include <linux/tty.h>
-#include <linux/proc_fs.h>
 #include <linux/blkdev.h>
 #include <linux/fs_struct.h>
 #include <linux/magic.h>
@@ -485,20 +484,6 @@ static void mm_init_aio(struct mm_struct *mm)
 #endif
 }
 
-int mm_init_cpumask(struct mm_struct *mm, struct mm_struct *oldmm)
-{
-#ifdef CONFIG_CPUMASK_OFFSTACK
-	if (!alloc_cpumask_var(&mm->cpu_vm_mask_var, GFP_KERNEL))
-		return -ENOMEM;
-
-	if (oldmm)
-		cpumask_copy(mm_cpumask(mm), mm_cpumask(oldmm));
-	else
-		memset(mm_cpumask(mm), 0, cpumask_size());
-#endif
-	return 0;
-}
-
 static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
 {
 	atomic_set(&mm->mm_users, 1);
@@ -539,17 +524,8 @@ struct mm_struct * mm_alloc(void)
 		return NULL;
 
 	memset(mm, 0, sizeof(*mm));
-	mm = mm_init(mm, current);
-	if (!mm)
-		return NULL;
-
-	if (mm_init_cpumask(mm, NULL)) {
-		mm_free_pgd(mm);
-		free_mm(mm);
-		return NULL;
-	}
-
-	return mm;
+	mm_init_cpumask(mm);
+	return mm_init(mm, current);
 }
 
 /*
@@ -560,7 +536,6 @@ struct mm_struct * mm_alloc(void)
 void __mmdrop(struct mm_struct *mm)
 {
 	BUG_ON(mm == &init_mm);
-	free_cpumask_var(mm->cpu_vm_mask_var);
 	mm_free_pgd(mm);
 	destroy_context(mm);
 	mmu_notifier_mm_destroy(mm);
@@ -597,6 +572,57 @@ void mmput(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(mmput);
 
+/*
+ * We added or removed a vma mapping the executable. The vmas are only mapped
+ * during exec and are not mapped with the mmap system call.
+ * Callers must hold down_write() on the mm's mmap_sem for these
+ */
+void added_exe_file_vma(struct mm_struct *mm)
+{
+	mm->num_exe_file_vmas++;
+}
+
+void removed_exe_file_vma(struct mm_struct *mm)
+{
+	mm->num_exe_file_vmas--;
+	if ((mm->num_exe_file_vmas == 0) && mm->exe_file) {
+		fput(mm->exe_file);
+		mm->exe_file = NULL;
+	}
+
+}
+
+void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
+{
+	if (new_exe_file)
+		get_file(new_exe_file);
+	if (mm->exe_file)
+		fput(mm->exe_file);
+	mm->exe_file = new_exe_file;
+	mm->num_exe_file_vmas = 0;
+}
+
+struct file *get_mm_exe_file(struct mm_struct *mm)
+{
+	struct file *exe_file;
+
+	/* We need mmap_sem to protect against races with removal of
+	 * VM_EXECUTABLE vmas */
+	down_read(&mm->mmap_sem);
+	exe_file = mm->exe_file;
+	if (exe_file)
+		get_file(exe_file);
+	up_read(&mm->mmap_sem);
+	return exe_file;
+}
+
+static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
+{
+	/* It's safe to write the exe_file pointer without exe_file_lock because
+	 * this is called during fork when the task is not yet in /proc */
+	newmm->exe_file = get_mm_exe_file(oldmm);
+}
+
 /**
  * get_task_mm - acquire a reference to the task's mm
  *
@@ -703,6 +729,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 		goto fail_nomem;
 
 	memcpy(mm, oldmm, sizeof(*mm));
+	mm_init_cpumask(mm);
 
 	/* Initializing for Swap token stuff */
 	mm->token_priority = 0;
@@ -715,9 +742,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 	if (!mm_init(mm, tsk))
 		goto fail_nomem;
 
-	if (mm_init_cpumask(mm, oldmm))
-		goto fail_nocpumask;
-
 	if (init_new_context(tsk, mm))
 		goto fail_nocontext;
 
@@ -744,9 +768,6 @@ fail_nomem:
 	return NULL;
 
 fail_nocontext:
-	free_cpumask_var(mm->cpu_vm_mask_var);
-
-fail_nocpumask:
 	/*
 	 * If init_new_context() failed, we cannot use mmput() to free the mm
 	 * because it calls destroy_context()
@@ -957,6 +978,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	tty_audit_fork(sig);
 	sched_autogroup_fork(sig);
 
+#ifdef CONFIG_CGROUPS
+	init_rwsem(&sig->threadgroup_fork_lock);
+#endif
+
 	sig->oom_adj = current->signal->oom_adj;
 	sig->oom_score_adj = current->signal->oom_score_adj;
 	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
@@ -1138,6 +1163,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	monotonic_to_bootbased(&p->real_start_time);
 	p->io_context = NULL;
 	p->audit_context = NULL;
+	if (clone_flags & CLONE_THREAD)
+		threadgroup_fork_read_lock(current);
 	cgroup_fork(p);
 #ifdef CONFIG_NUMA
 	p->mempolicy = mpol_dup(p->mempolicy);
@@ -1223,12 +1250,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (clone_flags & CLONE_THREAD)
 		p->tgid = current->tgid;
 
-	if (current->nsproxy != p->nsproxy) {
-		retval = ns_cgroup_clone(p, pid);
-		if (retval)
-			goto bad_fork_free_pid;
-	}
-
 	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
 	/*
 	 * Clear TID on mm_release()?
@@ -1342,6 +1363,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	write_unlock_irq(&tasklist_lock);
 	proc_fork_connector(p);
 	cgroup_post_fork(p);
+	if (clone_flags & CLONE_THREAD)
+		threadgroup_fork_read_unlock(current);
 	perf_event_fork(p);
 	return p;
 
@@ -1380,6 +1403,8 @@ bad_fork_cleanup_policy:
 	mpol_put(p->mempolicy);
 bad_fork_cleanup_cgroup:
 #endif
+	if (clone_flags & CLONE_THREAD)
+		threadgroup_fork_read_unlock(current);
 	cgroup_exit(p, cgroup_callbacks_done);
 	delayacct_tsk_free(p);
 	module_put(task_thread_info(p)->exec_domain->module);
@@ -1537,6 +1562,13 @@ void __init proc_caches_init(void)
 	fs_cachep = kmem_cache_create("fs_cache",
 			sizeof(struct fs_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+	/*
+	 * FIXME! The "sizeof(struct mm_struct)" currently includes the
+	 * whole struct cpumask for the OFFSTACK case. We could change
+	 * this to *only* allocate as much of it as required by the
+	 * maximum number of CPU's we can ever have. The cpumask_allocation
+	 * is at the end of the structure, exactly for that reason.
+	 */
 	mm_cachep = kmem_cache_create("mm_struct",
 			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 90cb55f6d7eb..470d08c82bbe 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -133,12 +133,6 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
 		switch (res) {
 		case IRQ_WAKE_THREAD:
 			/*
-			 * Set result to handled so the spurious check
-			 * does not trigger.
-			 */
-			res = IRQ_HANDLED;
-
-			/*
 			 * Catch drivers which return WAKE_THREAD but
 			 * did not set up a thread function
 			 */
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 886e80347b32..4c60a50e66b2 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -257,13 +257,11 @@ int __init early_irq_init(void)
 	count = ARRAY_SIZE(irq_desc);
 
 	for (i = 0; i < count; i++) {
-		desc[i].irq_data.irq = i;
-		desc[i].irq_data.chip = &no_irq_chip;
 		desc[i].kstat_irqs = alloc_percpu(unsigned int);
-		irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
-		alloc_masks(desc + i, GFP_KERNEL, node);
-		desc_smp_init(desc + i, node);
+		alloc_masks(&desc[i], GFP_KERNEL, node);
+		raw_spin_lock_init(&desc[i].lock);
 		lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
+		desc_set_defaults(i, &desc[i], node);
 	}
 	return arch_early_irq_init();
 }
@@ -346,6 +344,12 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
 	if (!cnt)
 		return -EINVAL;
 
+	if (irq >= 0) {
+		if (from > irq)
+			return -EINVAL;
+		from = irq;
+	}
+
 	mutex_lock(&sparse_irq_lock);
 
 	start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS,
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index f7ce0021e1c4..d64bafb1afd0 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -723,13 +723,16 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
  * context. So we need to disable bh here to avoid deadlocks and other
  * side effects.
  */
-static void
+static irqreturn_t
 irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
 {
+	irqreturn_t ret;
+
 	local_bh_disable();
-	action->thread_fn(action->irq, action->dev_id);
+	ret = action->thread_fn(action->irq, action->dev_id);
 	irq_finalize_oneshot(desc, action, false);
 	local_bh_enable();
+	return ret;
 }
 
 /*
@@ -737,10 +740,14 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
  * preemtible - many of them need to sleep and wait for slow busses to
  * complete.
  */
-static void irq_thread_fn(struct irq_desc *desc, struct irqaction *action)
+static irqreturn_t irq_thread_fn(struct irq_desc *desc,
+				 struct irqaction *action)
 {
-	action->thread_fn(action->irq, action->dev_id);
+	irqreturn_t ret;
+
+	ret = action->thread_fn(action->irq, action->dev_id);
 	irq_finalize_oneshot(desc, action, false);
+	return ret;
 }
 
 /*
@@ -753,7 +760,8 @@ static int irq_thread(void *data)
 	};
 	struct irqaction *action = data;
 	struct irq_desc *desc = irq_to_desc(action->irq);
-	void (*handler_fn)(struct irq_desc *desc, struct irqaction *action);
+	irqreturn_t (*handler_fn)(struct irq_desc *desc,
+				  struct irqaction *action);
 	int wake;
 
 	if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD,
@@ -783,8 +791,12 @@ static int irq_thread(void *data)
 			desc->istate |= IRQS_PENDING;
 			raw_spin_unlock_irq(&desc->lock);
 		} else {
+			irqreturn_t action_ret;
+
 			raw_spin_unlock_irq(&desc->lock);
-			handler_fn(desc, action);
+			action_ret = handler_fn(desc, action);
+			if (!noirqdebug)
+				note_interrupt(action->irq, desc, action_ret);
 		}
 
 		wake = atomic_dec_and_test(&desc->threads_active);
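
With the manage.c changes above, the value a threaded handler returns is now fed to note_interrupt(), so a thread that keeps returning IRQ_NONE contributes to spurious-IRQ accounting just like a hardirq handler. An illustrative driver sketch (the demo_* names and demo_device_asserted() are hypothetical, not from this patch):

#include <linux/interrupt.h>

static irqreturn_t demo_hardirq(int irq, void *dev_id)
{
	/* quick check in hard interrupt context, then defer */
	return IRQ_WAKE_THREAD;
}

static irqreturn_t demo_thread_fn(int irq, void *dev_id)
{
	if (!demo_device_asserted(dev_id))	/* hypothetical helper */
		return IRQ_NONE;	/* now counted by note_interrupt() */
	/* ... service the device, may sleep ... */
	return IRQ_HANDLED;
}

A driver would register the pair with something like request_threaded_irq(irq, demo_hardirq, demo_thread_fn, IRQF_ONESHOT, "demo", dev).
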
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 64e3df6ab1ef..4bd4faa6323a 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -352,6 +352,7 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
 #ifdef CONFIG_SMP
 	remove_proc_entry("smp_affinity", desc->dir);
 	remove_proc_entry("affinity_hint", desc->dir);
+	remove_proc_entry("smp_affinity_list", desc->dir);
 	remove_proc_entry("node", desc->dir);
 #endif
 	remove_proc_entry("spurious", desc->dir);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index dfbd550401b2..aa57d5da18c1 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -167,6 +167,13 @@ out:
 		  jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
 }
 
+static inline int bad_action_ret(irqreturn_t action_ret)
+{
+	if (likely(action_ret <= (IRQ_HANDLED | IRQ_WAKE_THREAD)))
+		return 0;
+	return 1;
+}
+
 /*
  * If 99,900 of the previous 100,000 interrupts have not been handled
  * then assume that the IRQ is stuck in some manner. Drop a diagnostic
@@ -182,7 +189,7 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
 	struct irqaction *action;
 	unsigned long flags;
 
-	if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) {
+	if (bad_action_ret(action_ret)) {
 		printk(KERN_ERR "irq event %d: bogus return value %x\n",
 				irq, action_ret);
 	} else {
@@ -201,10 +208,11 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
 	raw_spin_lock_irqsave(&desc->lock, flags);
 	action = desc->action;
 	while (action) {
-		printk(KERN_ERR "[<%p>]", action->handler);
-		print_symbol(" (%s)",
-			(unsigned long)action->handler);
-		printk("\n");
+		printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler);
+		if (action->thread_fn)
+			printk(KERN_CONT " threaded [<%p>] %pf",
+					action->thread_fn, action->thread_fn);
+		printk(KERN_CONT "\n");
 		action = action->next;
 	}
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -262,7 +270,16 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
 	if (desc->istate & IRQS_POLL_INPROGRESS)
 		return;
 
-	if (unlikely(action_ret != IRQ_HANDLED)) {
+	/* we get here again via the threaded handler */
+	if (action_ret == IRQ_WAKE_THREAD)
+		return;
+
+	if (bad_action_ret(action_ret)) {
+		report_bad_irq(irq, desc, action_ret);
+		return;
+	}
+
+	if (unlikely(action_ret == IRQ_NONE)) {
 		/*
 		 * If we are seeing only the odd spurious IRQ caused by
 		 * bus asynchronicity then don't eventually trigger an error,
@@ -274,8 +291,6 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
 		else
 			desc->irqs_unhandled++;
 		desc->last_unhandled = jiffies;
-		if (unlikely(action_ret != IRQ_NONE))
-			report_bad_irq(irq, desc, action_ret);
 	}
 
 	if (unlikely(try_misrouted_irq(irq, desc, action_ret))) {
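
The "<=" test in bad_action_ret() above works because of the bit values behind irqreturn_t; as defined in <linux/irqreturn.h> of this era they are, roughly:

enum irqreturn {
	IRQ_NONE	= (0 << 0),	/* interrupt was not from this device */
	IRQ_HANDLED	= (1 << 0),	/* interrupt was handled */
	IRQ_WAKE_THREAD	= (1 << 1),	/* handler requests the irq thread */
};
/* any OR of the above is <= (IRQ_HANDLED | IRQ_WAKE_THREAD), i.e. 3;
 * anything larger is a bogus return value from a handler. */
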
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 74d1c099fbd1..fa27e750dbc0 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -105,9 +105,12 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
105} 105}
106 106
107static void __jump_label_update(struct jump_label_key *key, 107static void __jump_label_update(struct jump_label_key *key,
108 struct jump_entry *entry, int enable) 108 struct jump_entry *entry,
109 struct jump_entry *stop, int enable)
109{ 110{
110 for (; entry->key == (jump_label_t)(unsigned long)key; entry++) { 111 for (; (entry < stop) &&
112 (entry->key == (jump_label_t)(unsigned long)key);
113 entry++) {
111 /* 114 /*
112 * entry->code set to 0 invalidates module init text sections 115 * entry->code set to 0 invalidates module init text sections
113 * kernel_text_address() verifies we are not in core kernel 116 * kernel_text_address() verifies we are not in core kernel
@@ -181,7 +184,11 @@ static void __jump_label_mod_update(struct jump_label_key *key, int enable)
181 struct jump_label_mod *mod = key->next; 184 struct jump_label_mod *mod = key->next;
182 185
183 while (mod) { 186 while (mod) {
184 __jump_label_update(key, mod->entries, enable); 187 struct module *m = mod->mod;
188
189 __jump_label_update(key, mod->entries,
190 m->jump_entries + m->num_jump_entries,
191 enable);
185 mod = mod->next; 192 mod = mod->next;
186 } 193 }
187} 194}
@@ -245,7 +252,8 @@ static int jump_label_add_module(struct module *mod)
245 key->next = jlm; 252 key->next = jlm;
246 253
247 if (jump_label_enabled(key)) 254 if (jump_label_enabled(key))
248 __jump_label_update(key, iter, JUMP_LABEL_ENABLE); 255 __jump_label_update(key, iter, iter_stop,
256 JUMP_LABEL_ENABLE);
249 } 257 }
250 258
251 return 0; 259 return 0;
@@ -371,7 +379,7 @@ static void jump_label_update(struct jump_label_key *key, int enable)
371 379
372 /* if there are no users, entry can be NULL */ 380 /* if there are no users, entry can be NULL */
373 if (entry) 381 if (entry)
374 __jump_label_update(key, entry, enable); 382 __jump_label_update(key, entry, __stop___jump_table, enable);
375 383
376#ifdef CONFIG_MODULES 384#ifdef CONFIG_MODULES
377 __jump_label_mod_update(key, enable); 385 __jump_label_mod_update(key, enable);
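
The jump_label fix bounds the table walk with an explicit stop pointer: entries for one key are contiguous in the sorted jump table, but the last key in a module's table would otherwise iterate past the end of that table into foreign memory. A userspace sketch of the bounded walk, using an illustrative entry layout rather than the kernel's struct jump_entry:

#include <stdio.h>

struct entry {
        unsigned long key;
        const char *site;
};

static void update_entries(unsigned long key, struct entry *entry,
                           struct entry *stop)
{
        /* stop both at the table end and at the first foreign key */
        for (; (entry < stop) && (entry->key == key); entry++)
                printf("patching %s\n", entry->site);
}

int main(void)
{
        struct entry table[] = {
                { 1, "site_a" }, { 1, "site_b" }, { 2, "site_c" },
        };

        /* without the entry < stop bound, key 2 would walk past table[2] */
        update_entries(2, table + 2, table + 3);
        return 0;
}
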
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3b34d2732bce..4ba7cccb4994 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -202,8 +202,8 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
202 return; 202 return;
203 } 203 }
204 204
205 p->cpus_allowed = cpumask_of_cpu(cpu); 205 /* It's safe because the task is inactive. */
206 p->rt.nr_cpus_allowed = 1; 206 do_set_cpus_allowed(p, cpumask_of(cpu));
207 p->flags |= PF_THREAD_BOUND; 207 p->flags |= PF_THREAD_BOUND;
208} 208}
209EXPORT_SYMBOL(kthread_bind); 209EXPORT_SYMBOL(kthread_bind);
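
The point of routing kthread_bind() through do_set_cpus_allowed() is that the affinity mask and the scheduler's cached CPU count must change together, and the scheduling class gets a chance to hook the change, rather than two raw field writes drifting apart. A simplified userspace sketch of the keep-derived-state-in-sync idea (types and names are stand-ins, not the kernel's):

#include <stdio.h>

struct task {
        unsigned long cpus_allowed;     /* one bit per CPU */
        int nr_cpus_allowed;            /* derived, must stay in sync */
};

static void do_set_cpus_allowed(struct task *p, unsigned long mask)
{
        p->cpus_allowed = mask;
        p->nr_cpus_allowed = __builtin_popcountl(mask);
}

int main(void)
{
        struct task t;

        do_set_cpus_allowed(&t, 1UL << 3);      /* bind to CPU 3 */
        printf("%d cpu(s) allowed\n", t.nr_cpus_allowed);
        return 0;
}
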
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 63437d065ac8..298c9276dfdb 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3426,7 +3426,7 @@ int lock_is_held(struct lockdep_map *lock)
3426 int ret = 0; 3426 int ret = 0;
3427 3427
3428 if (unlikely(current->lockdep_recursion)) 3428 if (unlikely(current->lockdep_recursion))
3429 return ret; 3429 return 1; /* avoid false negative lockdep_assert_held() */
3430 3430
3431 raw_local_irq_save(flags); 3431 raw_local_irq_save(flags);
3432 check_flags(flags); 3432 check_flags(flags);
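
The one-line lockdep change matters because lock_is_held() feeds assertions of the form WARN_ON(!lock_is_held(...)): while lockdep is recursing it cannot evaluate the dependency state, and answering 0 would make such an assertion fire even though the lock is in fact held, so answering 1 errs on the quiet side. A small sketch of that assertion pattern:

#include <assert.h>
#include <stdbool.h>

static _Thread_local int lockdep_recursion;

static bool lock_is_held(void)
{
        if (lockdep_recursion)
                return true;    /* cannot check: never claim "not held" */
        /* ... real dependency-graph lookup elided ... */
        return true;
}

#define lockdep_assert_held() assert(lock_is_held())

int main(void)
{
        lockdep_recursion = 1;
        lockdep_assert_held();  /* must not trip while recursing */
        return 0;
}
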
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
deleted file mode 100644
index 2c98ad94ba0e..000000000000
--- a/kernel/ns_cgroup.c
+++ /dev/null
@@ -1,118 +0,0 @@
1/*
2 * ns_cgroup.c - namespace cgroup subsystem
3 *
4 * Copyright 2006, 2007 IBM Corp
5 */
6
7#include <linux/module.h>
8#include <linux/cgroup.h>
9#include <linux/fs.h>
10#include <linux/proc_fs.h>
11#include <linux/slab.h>
12#include <linux/nsproxy.h>
13
14struct ns_cgroup {
15 struct cgroup_subsys_state css;
16};
17
18struct cgroup_subsys ns_subsys;
19
20static inline struct ns_cgroup *cgroup_to_ns(
21 struct cgroup *cgroup)
22{
23 return container_of(cgroup_subsys_state(cgroup, ns_subsys_id),
24 struct ns_cgroup, css);
25}
26
27int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
28{
29 char name[PROC_NUMBUF];
30
31 snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid));
32 return cgroup_clone(task, &ns_subsys, name);
33}
34
35/*
36 * Rules:
37 * 1. you can only enter a cgroup which is a descendant of your current
38 * cgroup
39 * 2. you can only place another process into a cgroup if
40 * a. you have CAP_SYS_ADMIN
41 * b. your cgroup is an ancestor of task's destination cgroup
42 * (hence either you are in the same cgroup as task, or in an
43 * ancestor cgroup thereof)
44 */
45static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
46 struct task_struct *task, bool threadgroup)
47{
48 if (current != task) {
49 if (!capable(CAP_SYS_ADMIN))
50 return -EPERM;
51
52 if (!cgroup_is_descendant(new_cgroup, current))
53 return -EPERM;
54 }
55
56 if (!cgroup_is_descendant(new_cgroup, task))
57 return -EPERM;
58
59 if (threadgroup) {
60 struct task_struct *c;
61 rcu_read_lock();
62 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
63 if (!cgroup_is_descendant(new_cgroup, c)) {
64 rcu_read_unlock();
65 return -EPERM;
66 }
67 }
68 rcu_read_unlock();
69 }
70
71 return 0;
72}
73
74/*
75 * Rules: you can only create a cgroup if
76 * 1. you are capable(CAP_SYS_ADMIN)
77 * 2. the target cgroup is a descendant of your own cgroup
78 */
79static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
80 struct cgroup *cgroup)
81{
82 struct ns_cgroup *ns_cgroup;
83
84 if (!capable(CAP_SYS_ADMIN))
85 return ERR_PTR(-EPERM);
86 if (!cgroup_is_descendant(cgroup, current))
87 return ERR_PTR(-EPERM);
88 if (test_bit(CGRP_CLONE_CHILDREN, &cgroup->flags)) {
89 printk("ns_cgroup can't be created with parent "
90 "'clone_children' set.\n");
91 return ERR_PTR(-EINVAL);
92 }
93
94 printk_once("ns_cgroup deprecated: consider using the "
95 "'clone_children' flag without the ns_cgroup.\n");
96
97 ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
98 if (!ns_cgroup)
99 return ERR_PTR(-ENOMEM);
100 return &ns_cgroup->css;
101}
102
103static void ns_destroy(struct cgroup_subsys *ss,
104 struct cgroup *cgroup)
105{
106 struct ns_cgroup *ns_cgroup;
107
108 ns_cgroup = cgroup_to_ns(cgroup);
109 kfree(ns_cgroup);
110}
111
112struct cgroup_subsys ns_subsys = {
113 .name = "ns",
114 .can_attach = ns_can_attach,
115 .create = ns_create,
116 .destroy = ns_destroy,
117 .subsys_id = ns_subsys_id,
118};
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index a05d191ffdd9..d6a00f3de15d 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -22,6 +22,9 @@
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <net/net_namespace.h> 23#include <net/net_namespace.h>
24#include <linux/ipc_namespace.h> 24#include <linux/ipc_namespace.h>
25#include <linux/proc_fs.h>
26#include <linux/file.h>
27#include <linux/syscalls.h>
25 28
26static struct kmem_cache *nsproxy_cachep; 29static struct kmem_cache *nsproxy_cachep;
27 30
@@ -198,10 +201,6 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
198 goto out; 201 goto out;
199 } 202 }
200 203
201 err = ns_cgroup_clone(current, task_pid(current));
202 if (err)
203 put_nsproxy(*new_nsp);
204
205out: 204out:
206 return err; 205 return err;
207} 206}
@@ -233,6 +232,45 @@ void exit_task_namespaces(struct task_struct *p)
233 switch_task_namespaces(p, NULL); 232 switch_task_namespaces(p, NULL);
234} 233}
235 234
235SYSCALL_DEFINE2(setns, int, fd, int, nstype)
236{
237 const struct proc_ns_operations *ops;
238 struct task_struct *tsk = current;
239 struct nsproxy *new_nsproxy;
240 struct proc_inode *ei;
241 struct file *file;
242 int err;
243
244 if (!capable(CAP_SYS_ADMIN))
245 return -EPERM;
246
247 file = proc_ns_fget(fd);
248 if (IS_ERR(file))
249 return PTR_ERR(file);
250
251 err = -EINVAL;
252 ei = PROC_I(file->f_dentry->d_inode);
253 ops = ei->ns_ops;
254 if (nstype && (ops->type != nstype))
255 goto out;
256
257 new_nsproxy = create_new_namespaces(0, tsk, tsk->fs);
258 if (IS_ERR(new_nsproxy)) {
259 err = PTR_ERR(new_nsproxy);
260 goto out;
261 }
262
263 err = ops->install(new_nsproxy, ei->ns);
264 if (err) {
265 free_nsproxy(new_nsproxy);
266 goto out;
267 }
268 switch_task_namespaces(tsk, new_nsproxy);
269out:
270 fput(file);
271 return err;
272}
273
236static int __init nsproxy_cache_init(void) 274static int __init nsproxy_cache_init(void)
237{ 275{
238 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); 276 nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
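
The new setns() system call takes a file descriptor obtained from a /proc/<pid>/ns/* file and, after the capability and type checks above, installs that namespace into a freshly built nsproxy for the caller. A userspace sketch of joining another process's network namespace, assuming kernel headers that define __NR_setns (there was no glibc wrapper when the call was introduced) and CAP_SYS_ADMIN in the caller:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(int argc, char **argv)
{
        char path[64];
        int fd;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <pid>\n", argv[0]);
                return 1;
        }
        snprintf(path, sizeof(path), "/proc/%s/ns/net", argv[1]);
        fd = open(path, O_RDONLY);
        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* nstype 0 accepts any namespace; pass CLONE_NEWNET to insist */
        if (syscall(__NR_setns, fd, 0) < 0) {
                perror("setns");
                return 1;
        }
        close(fd);
        execlp("ip", "ip", "link", (char *)NULL); /* runs in that netns */
        perror("execlp");
        return 1;
}
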
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index beb184689af9..6824ca7d4d0c 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -40,6 +40,7 @@
40#include <linux/string.h> 40#include <linux/string.h>
41#include <linux/platform_device.h> 41#include <linux/platform_device.h>
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/kernel.h>
43 44
44#include <linux/uaccess.h> 45#include <linux/uaccess.h>
45 46
@@ -53,11 +54,17 @@ enum pm_qos_type {
53 PM_QOS_MIN /* return the smallest value */ 54 PM_QOS_MIN /* return the smallest value */
54}; 55};
55 56
57/*
58 * Note: The lockless read path depends on the CPU accessing
59 * target_value atomically. Atomic access is only guaranteed on all CPU
60 * types linux supports for 32 bit quantites
61 */
56struct pm_qos_object { 62struct pm_qos_object {
57 struct plist_head requests; 63 struct plist_head requests;
58 struct blocking_notifier_head *notifiers; 64 struct blocking_notifier_head *notifiers;
59 struct miscdevice pm_qos_power_miscdev; 65 struct miscdevice pm_qos_power_miscdev;
60 char *name; 66 char *name;
67 s32 target_value; /* Do not change to 64 bit */
61 s32 default_value; 68 s32 default_value;
62 enum pm_qos_type type; 69 enum pm_qos_type type;
63}; 70};
@@ -70,7 +77,8 @@ static struct pm_qos_object cpu_dma_pm_qos = {
70 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock), 77 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock),
71 .notifiers = &cpu_dma_lat_notifier, 78 .notifiers = &cpu_dma_lat_notifier,
72 .name = "cpu_dma_latency", 79 .name = "cpu_dma_latency",
73 .default_value = 2000 * USEC_PER_SEC, 80 .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
81 .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
74 .type = PM_QOS_MIN, 82 .type = PM_QOS_MIN,
75}; 83};
76 84
@@ -79,7 +87,8 @@ static struct pm_qos_object network_lat_pm_qos = {
79 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock), 87 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock),
80 .notifiers = &network_lat_notifier, 88 .notifiers = &network_lat_notifier,
81 .name = "network_latency", 89 .name = "network_latency",
82 .default_value = 2000 * USEC_PER_SEC, 90 .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
91 .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
83 .type = PM_QOS_MIN 92 .type = PM_QOS_MIN
84}; 93};
85 94
@@ -89,7 +98,8 @@ static struct pm_qos_object network_throughput_pm_qos = {
89 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock), 98 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock),
90 .notifiers = &network_throughput_notifier, 99 .notifiers = &network_throughput_notifier,
91 .name = "network_throughput", 100 .name = "network_throughput",
92 .default_value = 0, 101 .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
102 .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
93 .type = PM_QOS_MAX, 103 .type = PM_QOS_MAX,
94}; 104};
95 105
@@ -135,6 +145,16 @@ static inline int pm_qos_get_value(struct pm_qos_object *o)
135 } 145 }
136} 146}
137 147
148static inline s32 pm_qos_read_value(struct pm_qos_object *o)
149{
150 return o->target_value;
151}
152
153static inline void pm_qos_set_value(struct pm_qos_object *o, s32 value)
154{
155 o->target_value = value;
156}
157
138static void update_target(struct pm_qos_object *o, struct plist_node *node, 158static void update_target(struct pm_qos_object *o, struct plist_node *node,
139 int del, int value) 159 int del, int value)
140{ 160{
@@ -159,6 +179,7 @@ static void update_target(struct pm_qos_object *o, struct plist_node *node,
159 plist_add(node, &o->requests); 179 plist_add(node, &o->requests);
160 } 180 }
161 curr_value = pm_qos_get_value(o); 181 curr_value = pm_qos_get_value(o);
182 pm_qos_set_value(o, curr_value);
162 spin_unlock_irqrestore(&pm_qos_lock, flags); 183 spin_unlock_irqrestore(&pm_qos_lock, flags);
163 184
164 if (prev_value != curr_value) 185 if (prev_value != curr_value)
@@ -193,18 +214,11 @@ static int find_pm_qos_object_by_minor(int minor)
193 * pm_qos_request - returns current system wide qos expectation 214 * pm_qos_request - returns current system wide qos expectation
194 * @pm_qos_class: identification of which qos value is requested 215 * @pm_qos_class: identification of which qos value is requested
195 * 216 *
196 * This function returns the current target value in an atomic manner. 217 * This function returns the current target value.
197 */ 218 */
198int pm_qos_request(int pm_qos_class) 219int pm_qos_request(int pm_qos_class)
199{ 220{
200 unsigned long flags; 221 return pm_qos_read_value(pm_qos_array[pm_qos_class]);
201 int value;
202
203 spin_lock_irqsave(&pm_qos_lock, flags);
204 value = pm_qos_get_value(pm_qos_array[pm_qos_class]);
205 spin_unlock_irqrestore(&pm_qos_lock, flags);
206
207 return value;
208} 222}
209EXPORT_SYMBOL_GPL(pm_qos_request); 223EXPORT_SYMBOL_GPL(pm_qos_request);
210 224
@@ -404,24 +418,36 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
404 size_t count, loff_t *f_pos) 418 size_t count, loff_t *f_pos)
405{ 419{
406 s32 value; 420 s32 value;
407 int x;
408 char ascii_value[11];
409 struct pm_qos_request_list *pm_qos_req; 421 struct pm_qos_request_list *pm_qos_req;
410 422
411 if (count == sizeof(s32)) { 423 if (count == sizeof(s32)) {
412 if (copy_from_user(&value, buf, sizeof(s32))) 424 if (copy_from_user(&value, buf, sizeof(s32)))
413 return -EFAULT; 425 return -EFAULT;
414 } else if (count == 11) { /* len('0x12345678/0') */ 426 } else if (count <= 11) { /* ASCII perhaps? */
415 if (copy_from_user(ascii_value, buf, 11)) 427 char ascii_value[11];
428 unsigned long int ulval;
429 int ret;
430
431 if (copy_from_user(ascii_value, buf, count))
416 return -EFAULT; 432 return -EFAULT;
417 if (strlen(ascii_value) != 10) 433
418 return -EINVAL; 434 if (count > 10) {
419 x = sscanf(ascii_value, "%x", &value); 435 if (ascii_value[10] == '\n')
420 if (x != 1) 436 ascii_value[10] = '\0';
437 else
438 return -EINVAL;
439 } else {
440 ascii_value[count] = '\0';
441 }
442 ret = strict_strtoul(ascii_value, 16, &ulval);
443 if (ret) {
444 pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret);
421 return -EINVAL; 445 return -EINVAL;
422 pr_debug("%s, %d, 0x%x\n", ascii_value, x, value); 446 }
423 } else 447 value = (s32)lower_32_bits(ulval);
448 } else {
424 return -EINVAL; 449 return -EINVAL;
450 }
425 451
426 pm_qos_req = filp->private_data; 452 pm_qos_req = filp->private_data;
427 pm_qos_update_request(pm_qos_req, value); 453 pm_qos_update_request(pm_qos_req, value);
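
Two independent improvements are visible above: reads of the aggregated constraint become lock-free by caching it in a 32-bit target_value that writers refresh under pm_qos_lock, and the ASCII write path accepts any reasonable-length hex string via strict_strtoul() instead of demanding exactly ten characters. A userspace sketch of the cached lock-free read; C11 atomics make explicit what the kernel gets from naturally atomic aligned 32-bit accesses:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic int target_value;        /* the kernel uses a plain s32 */

static void update_target(int new_value)
{
        /* in the kernel this runs with pm_qos_lock held ... */
        atomic_store_explicit(&target_value, new_value,
                              memory_order_relaxed);
}

static int pm_qos_request(void)
{
        /* ... while this is the hot, lock-free read path */
        return atomic_load_explicit(&target_value, memory_order_relaxed);
}

int main(void)
{
        update_target(100);
        printf("target = %d\n", pm_qos_request());
        return 0;
}
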
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index f9bec56d8825..8f7b1db1ece1 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -25,7 +25,6 @@
25#include <linux/gfp.h> 25#include <linux/gfp.h>
26#include <linux/syscore_ops.h> 26#include <linux/syscore_ops.h>
27#include <scsi/scsi_scan.h> 27#include <scsi/scsi_scan.h>
28#include <asm/suspend.h>
29 28
30#include "power.h" 29#include "power.h"
31 30
@@ -55,10 +54,9 @@ static int hibernation_mode = HIBERNATION_SHUTDOWN;
55static const struct platform_hibernation_ops *hibernation_ops; 54static const struct platform_hibernation_ops *hibernation_ops;
56 55
57/** 56/**
58 * hibernation_set_ops - set the global hibernate operations 57 * hibernation_set_ops - Set the global hibernate operations.
59 * @ops: the hibernation operations to use in subsequent hibernation transitions 58 * @ops: Hibernation operations to use in subsequent hibernation transitions.
60 */ 59 */
61
62void hibernation_set_ops(const struct platform_hibernation_ops *ops) 60void hibernation_set_ops(const struct platform_hibernation_ops *ops)
63{ 61{
64 if (ops && !(ops->begin && ops->end && ops->pre_snapshot 62 if (ops && !(ops->begin && ops->end && ops->pre_snapshot
@@ -115,10 +113,9 @@ static int hibernation_test(int level) { return 0; }
115#endif /* !CONFIG_PM_DEBUG */ 113#endif /* !CONFIG_PM_DEBUG */
116 114
117/** 115/**
118 * platform_begin - tell the platform driver that we're starting 116 * platform_begin - Call platform to start hibernation.
119 * hibernation 117 * @platform_mode: Whether or not to use the platform driver.
120 */ 118 */
121
122static int platform_begin(int platform_mode) 119static int platform_begin(int platform_mode)
123{ 120{
124 return (platform_mode && hibernation_ops) ? 121 return (platform_mode && hibernation_ops) ?
@@ -126,10 +123,9 @@ static int platform_begin(int platform_mode)
126} 123}
127 124
128/** 125/**
129 * platform_end - tell the platform driver that we've entered the 126 * platform_end - Call platform to finish transition to the working state.
130 * working state 127 * @platform_mode: Whether or not to use the platform driver.
131 */ 128 */
132
133static void platform_end(int platform_mode) 129static void platform_end(int platform_mode)
134{ 130{
135 if (platform_mode && hibernation_ops) 131 if (platform_mode && hibernation_ops)
@@ -137,8 +133,11 @@ static void platform_end(int platform_mode)
137} 133}
138 134
139/** 135/**
140 * platform_pre_snapshot - prepare the machine for hibernation using the 136 * platform_pre_snapshot - Call platform to prepare the machine for hibernation.
141 * platform driver if so configured and return an error code if it fails 137 * @platform_mode: Whether or not to use the platform driver.
138 *
139 * Use the platform driver to prepare the system for creating a hibernate image,
140 * if so configured, and return an error code if that fails.
142 */ 141 */
143 142
144static int platform_pre_snapshot(int platform_mode) 143static int platform_pre_snapshot(int platform_mode)
@@ -148,10 +147,14 @@ static int platform_pre_snapshot(int platform_mode)
148} 147}
149 148
150/** 149/**
151 * platform_leave - prepare the machine for switching to the normal mode 150 * platform_leave - Call platform to prepare a transition to the working state.
152 * of operation using the platform driver (called with interrupts disabled) 151 * @platform_mode: Whether or not to use the platform driver.
152 *
153 * Use the platform driver to prepare the machine for switching to the
154 * normal mode of operation.
155 *
156 * This routine is called on one CPU with interrupts disabled.
153 */ 157 */
154
155static void platform_leave(int platform_mode) 158static void platform_leave(int platform_mode)
156{ 159{
157 if (platform_mode && hibernation_ops) 160 if (platform_mode && hibernation_ops)
@@ -159,10 +162,14 @@ static void platform_leave(int platform_mode)
159} 162}
160 163
161/** 164/**
162 * platform_finish - switch the machine to the normal mode of operation 165 * platform_finish - Call platform to switch the system to the working state.
163 * using the platform driver (must be called after platform_prepare()) 166 * @platform_mode: Whether or not to use the platform driver.
167 *
168 * Use the platform driver to switch the machine to the normal mode of
169 * operation.
170 *
171 * This routine must be called after platform_prepare().
164 */ 172 */
165
166static void platform_finish(int platform_mode) 173static void platform_finish(int platform_mode)
167{ 174{
168 if (platform_mode && hibernation_ops) 175 if (platform_mode && hibernation_ops)
@@ -170,11 +177,15 @@ static void platform_finish(int platform_mode)
170} 177}
171 178
172/** 179/**
173 * platform_pre_restore - prepare the platform for the restoration from a 180 * platform_pre_restore - Prepare for hibernate image restoration.
174 * hibernation image. If the restore fails after this function has been 181 * @platform_mode: Whether or not to use the platform driver.
175 * called, platform_restore_cleanup() must be called. 182 *
183 * Use the platform driver to prepare the system for resume from a hibernation
184 * image.
185 *
186 * If the restore fails after this function has been called,
187 * platform_restore_cleanup() must be called.
176 */ 188 */
177
178static int platform_pre_restore(int platform_mode) 189static int platform_pre_restore(int platform_mode)
179{ 190{
180 return (platform_mode && hibernation_ops) ? 191 return (platform_mode && hibernation_ops) ?
@@ -182,12 +193,16 @@ static int platform_pre_restore(int platform_mode)
182} 193}
183 194
184/** 195/**
185 * platform_restore_cleanup - switch the platform to the normal mode of 196 * platform_restore_cleanup - Switch to the working state after failing restore.
186 * operation after a failing restore. If platform_pre_restore() has been 197 * @platform_mode: Whether or not to use the platform driver.
187 * called before the failing restore, this function must be called too, 198 *
188 * regardless of the result of platform_pre_restore(). 199 * Use the platform driver to switch the system to the normal mode of operation
200 * after a failing restore.
201 *
202 * If platform_pre_restore() has been called before the failing restore, this
203 * function must be called too, regardless of the result of
204 * platform_pre_restore().
189 */ 205 */
190
191static void platform_restore_cleanup(int platform_mode) 206static void platform_restore_cleanup(int platform_mode)
192{ 207{
193 if (platform_mode && hibernation_ops) 208 if (platform_mode && hibernation_ops)
@@ -195,10 +210,9 @@ static void platform_restore_cleanup(int platform_mode)
195} 210}
196 211
197/** 212/**
198 * platform_recover - recover the platform from a failure to suspend 213 * platform_recover - Recover from a failure to suspend devices.
199 * devices. 214 * @platform_mode: Whether or not to use the platform driver.
200 */ 215 */
201
202static void platform_recover(int platform_mode) 216static void platform_recover(int platform_mode)
203{ 217{
204 if (platform_mode && hibernation_ops && hibernation_ops->recover) 218 if (platform_mode && hibernation_ops && hibernation_ops->recover)
@@ -206,13 +220,12 @@ static void platform_recover(int platform_mode)
206} 220}
207 221
208/** 222/**
209 * swsusp_show_speed - print the time elapsed between two events. 223 * swsusp_show_speed - Print time elapsed between two events during hibernation.
210 * @start: Starting event. 224 * @start: Starting event.
211 * @stop: Final event. 225 * @stop: Final event.
212 * @nr_pages - number of pages processed between @start and @stop 226 * @nr_pages: Number of memory pages processed between @start and @stop.
213 * @msg - introductory message to print 227 * @msg: Additional diagnostic message to print.
214 */ 228 */
215
216void swsusp_show_speed(struct timeval *start, struct timeval *stop, 229void swsusp_show_speed(struct timeval *start, struct timeval *stop,
217 unsigned nr_pages, char *msg) 230 unsigned nr_pages, char *msg)
218{ 231{
@@ -235,25 +248,18 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop,
235} 248}
236 249
237/** 250/**
238 * create_image - freeze devices that need to be frozen with interrupts 251 * create_image - Create a hibernation image.
239 * off, create the hibernation image and thaw those devices. Control 252 * @platform_mode: Whether or not to use the platform driver.
240 * reappears in this routine after a restore. 253 *
254 * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image
255 * and execute the drivers' .thaw_noirq() callbacks.
256 *
257 * Control reappears in this routine after the subsequent restore.
241 */ 258 */
242
243static int create_image(int platform_mode) 259static int create_image(int platform_mode)
244{ 260{
245 int error; 261 int error;
246 262
247 error = arch_prepare_suspend();
248 if (error)
249 return error;
250
251 /* At this point, dpm_suspend_start() has been called, but *not*
252 * dpm_suspend_noirq(). We *must* call dpm_suspend_noirq() now.
253 * Otherwise, drivers for some devices (e.g. interrupt controllers)
254 * become desynchronized with the actual state of the hardware
255 * at resume time, and evil weirdness ensues.
256 */
257 error = dpm_suspend_noirq(PMSG_FREEZE); 263 error = dpm_suspend_noirq(PMSG_FREEZE);
258 if (error) { 264 if (error) {
259 printk(KERN_ERR "PM: Some devices failed to power down, " 265 printk(KERN_ERR "PM: Some devices failed to power down, "
@@ -297,9 +303,6 @@ static int create_image(int platform_mode)
297 303
298 Power_up: 304 Power_up:
299 syscore_resume(); 305 syscore_resume();
300 /* NOTE: dpm_resume_noirq() is just a resume() for devices
301 * that suspended with irqs off ... no overall powerup.
302 */
303 306
304 Enable_irqs: 307 Enable_irqs:
305 local_irq_enable(); 308 local_irq_enable();
@@ -317,14 +320,11 @@ static int create_image(int platform_mode)
317} 320}
318 321
319/** 322/**
320 * hibernation_snapshot - quiesce devices and create the hibernation 323 * hibernation_snapshot - Quiesce devices and create a hibernation image.
321 * snapshot image. 324 * @platform_mode: If set, use platform driver to prepare for the transition.
322 * @platform_mode - if set, use the platform driver, if available, to
323 * prepare the platform firmware for the power transition.
324 * 325 *
325 * Must be called with pm_mutex held 326 * This routine must be called with pm_mutex held.
326 */ 327 */
327
328int hibernation_snapshot(int platform_mode) 328int hibernation_snapshot(int platform_mode)
329{ 329{
330 pm_message_t msg = PMSG_RECOVER; 330 pm_message_t msg = PMSG_RECOVER;
@@ -384,13 +384,14 @@ int hibernation_snapshot(int platform_mode)
384} 384}
385 385
386/** 386/**
387 * resume_target_kernel - prepare devices that need to be suspended with 387 * resume_target_kernel - Restore system state from a hibernation image.
388 * interrupts off, restore the contents of highmem that have not been 388 * @platform_mode: Whether or not to use the platform driver.
389 * restored yet from the image and run the low level code that will restore 389 *
390 * the remaining contents of memory and switch to the just restored target 390 * Execute device drivers' .freeze_noirq() callbacks, restore the contents of
391 * kernel. 391 * highmem that have not been restored yet from the image and run the low-level
392 * code that will restore the remaining contents of memory and switch to the
393 * just restored target kernel.
392 */ 394 */
393
394static int resume_target_kernel(bool platform_mode) 395static int resume_target_kernel(bool platform_mode)
395{ 396{
396 int error; 397 int error;
@@ -416,24 +417,26 @@ static int resume_target_kernel(bool platform_mode)
416 if (error) 417 if (error)
417 goto Enable_irqs; 418 goto Enable_irqs;
418 419
419 /* We'll ignore saved state, but this gets preempt count (etc) right */
420 save_processor_state(); 420 save_processor_state();
421 error = restore_highmem(); 421 error = restore_highmem();
422 if (!error) { 422 if (!error) {
423 error = swsusp_arch_resume(); 423 error = swsusp_arch_resume();
424 /* 424 /*
425 * The code below is only ever reached in case of a failure. 425 * The code below is only ever reached in case of a failure.
426 * Otherwise execution continues at place where 426 * Otherwise, execution continues at the place where
427 * swsusp_arch_suspend() was called 427 * swsusp_arch_suspend() was called.
428 */ 428 */
429 BUG_ON(!error); 429 BUG_ON(!error);
430 /* This call to restore_highmem() undos the previous one */ 430 /*
431 * This call to restore_highmem() reverts the changes made by
432 * the previous one.
433 */
431 restore_highmem(); 434 restore_highmem();
432 } 435 }
433 /* 436 /*
434 * The only reason why swsusp_arch_resume() can fail is memory being 437 * The only reason why swsusp_arch_resume() can fail is memory being
435 * very tight, so we have to free it as soon as we can to avoid 438 * very tight, so we have to free it as soon as we can to avoid
436 * subsequent failures 439 * subsequent failures.
437 */ 440 */
438 swsusp_free(); 441 swsusp_free();
439 restore_processor_state(); 442 restore_processor_state();
@@ -456,14 +459,12 @@ static int resume_target_kernel(bool platform_mode)
456} 459}
457 460
458/** 461/**
459 * hibernation_restore - quiesce devices and restore the hibernation 462 * hibernation_restore - Quiesce devices and restore from a hibernation image.
460 * snapshot image. If successful, control returns in hibernation_snaphot() 463 * @platform_mode: If set, use platform driver to prepare for the transition.
461 * @platform_mode - if set, use the platform driver, if available, to
462 * prepare the platform firmware for the transition.
463 * 464 *
464 * Must be called with pm_mutex held 465 * This routine must be called with pm_mutex held. If it is successful, control
466 * reappears in the restored target kernel in hibernation_snaphot().
465 */ 467 */
466
467int hibernation_restore(int platform_mode) 468int hibernation_restore(int platform_mode)
468{ 469{
469 int error; 470 int error;
@@ -483,10 +484,8 @@ int hibernation_restore(int platform_mode)
483} 484}
484 485
485/** 486/**
486 * hibernation_platform_enter - enter the hibernation state using the 487 * hibernation_platform_enter - Power off the system using the platform driver.
487 * platform driver (if available)
488 */ 488 */
489
490int hibernation_platform_enter(void) 489int hibernation_platform_enter(void)
491{ 490{
492 int error; 491 int error;
@@ -557,12 +556,12 @@ int hibernation_platform_enter(void)
557} 556}
558 557
559/** 558/**
560 * power_down - Shut the machine down for hibernation. 559 * power_down - Shut the machine down for hibernation.
561 * 560 *
562 * Use the platform driver, if configured so; otherwise try 561 * Use the platform driver, if configured, to put the system into the sleep
563 * to power off or reboot. 562 * state corresponding to hibernation, or try to power it off or reboot,
563 * depending on the value of hibernation_mode.
564 */ 564 */
565
566static void power_down(void) 565static void power_down(void)
567{ 566{
568 switch (hibernation_mode) { 567 switch (hibernation_mode) {
@@ -599,9 +598,8 @@ static int prepare_processes(void)
599} 598}
600 599
601/** 600/**
602 * hibernate - The granpappy of the built-in hibernation management 601 * hibernate - Carry out system hibernation, including saving the image.
603 */ 602 */
604
605int hibernate(void) 603int hibernate(void)
606{ 604{
607 int error; 605 int error;
@@ -679,17 +677,20 @@ int hibernate(void)
679 677
680 678
681/** 679/**
682 * software_resume - Resume from a saved image. 680 * software_resume - Resume from a saved hibernation image.
683 * 681 *
684 * Called as a late_initcall (so all devices are discovered and 682 * This routine is called as a late initcall, when all devices have been
685 * initialized), we call swsusp to see if we have a saved image or not. 683 * discovered and initialized already.
686 * If so, we quiesce devices, the restore the saved image. We will
687 * return above (in hibernate() ) if everything goes well.
688 * Otherwise, we fail gracefully and return to the normally
689 * scheduled program.
690 * 684 *
685 * The image reading code is called to see if there is a hibernation image
686 * available for reading. If that is the case, devices are quiesced and the
687 * contents of memory are restored from the saved image.
688 *
689 * If this is successful, control reappears in the restored target kernel in
690 * hibernation_snapshot(), which returns to hibernate(). Otherwise, the routine
691 * attempts to recover gracefully and make the kernel return to the normal mode
692 * of operation.
691 */ 693 */
692
693static int software_resume(void) 694static int software_resume(void)
694{ 695{
695 int error; 696 int error;
@@ -819,21 +820,17 @@ static const char * const hibernation_modes[] = {
819 [HIBERNATION_TESTPROC] = "testproc", 820 [HIBERNATION_TESTPROC] = "testproc",
820}; 821};
821 822
822/** 823/*
823 * disk - Control hibernation mode 824 * /sys/power/disk - Control hibernation mode.
824 *
825 * Suspend-to-disk can be handled in several ways. We have a few options
826 * for putting the system to sleep - using the platform driver (e.g. ACPI
827 * or other hibernation_ops), powering off the system or rebooting the
828 * system (for testing) as well as the two test modes.
829 * 825 *
830 * The system can support 'platform', and that is known a priori (and 826 * Hibernation can be handled in several ways. There are a few different ways
831 * encoded by the presence of hibernation_ops). However, the user may 827 * to put the system into the sleep state: using the platform driver (e.g. ACPI
832 * choose 'shutdown' or 'reboot' as alternatives, as well as one fo the 828 * or other hibernation_ops), powering it off or rebooting it (for testing
833 * test modes, 'test' or 'testproc'. 829 * mostly), or using one of the two available test modes.
834 * 830 *
835 * show() will display what the mode is currently set to. 831 * The sysfs file /sys/power/disk provides an interface for selecting the
836 * store() will accept one of 832 * hibernation mode to use. Reading from this file causes the available modes
833 * to be printed. There are 5 modes that can be supported:
837 * 834 *
838 * 'platform' 835 * 'platform'
839 * 'shutdown' 836 * 'shutdown'
@@ -841,8 +838,14 @@ static const char * const hibernation_modes[] = {
841 * 'test' 838 * 'test'
842 * 'testproc' 839 * 'testproc'
843 * 840 *
844 * It will only change to 'platform' if the system 841 * If a platform hibernation driver is in use, 'platform' will be supported
845 * supports it (as determined by having hibernation_ops). 842 * and will be used by default. Otherwise, 'shutdown' will be used by default.
843 * The selected option (i.e. the one corresponding to the current value of
844 * hibernation_mode) is enclosed in square brackets.
845 *
846 * To select a given hibernation mode it is necessary to write the mode's
847 * string representation (as returned by reading from /sys/power/disk) back
848 * into /sys/power/disk.
846 */ 849 */
847 850
848static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, 851static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
@@ -875,7 +878,6 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
875 return buf-start; 878 return buf-start;
876} 879}
877 880
878
879static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, 881static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
880 const char *buf, size_t n) 882 const char *buf, size_t n)
881{ 883{
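
A short userspace sketch of the /sys/power/disk protocol described in the reworked comment: reading the file lists the supported modes with the active one in square brackets, and writing one of the mode strings back selects it (root required; "shutdown" is only an example mode):

#include <stdio.h>

int main(void)
{
        char modes[128];
        FILE *f = fopen("/sys/power/disk", "r+");

        if (!f) {
                perror("/sys/power/disk");
                return 1;
        }
        if (fgets(modes, sizeof(modes), f))
                printf("available: %s", modes);
        rewind(f);
        fputs("shutdown", f);   /* select the shutdown mode */
        fclose(f);
        return 0;
}
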
diff --git a/kernel/profile.c b/kernel/profile.c
index 14c9f87b9fc9..961b389fe52f 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -303,14 +303,12 @@ static void profile_discard_flip_buffers(void)
303 mutex_unlock(&profile_flip_mutex); 303 mutex_unlock(&profile_flip_mutex);
304} 304}
305 305
306void profile_hits(int type, void *__pc, unsigned int nr_hits) 306static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
307{ 307{
308 unsigned long primary, secondary, flags, pc = (unsigned long)__pc; 308 unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
309 int i, j, cpu; 309 int i, j, cpu;
310 struct profile_hit *hits; 310 struct profile_hit *hits;
311 311
312 if (prof_on != type || !prof_buffer)
313 return;
314 pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1); 312 pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
315 i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; 313 i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
316 secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; 314 secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
@@ -417,16 +415,20 @@ out_free:
417#define profile_discard_flip_buffers() do { } while (0) 415#define profile_discard_flip_buffers() do { } while (0)
418#define profile_cpu_callback NULL 416#define profile_cpu_callback NULL
419 417
420void profile_hits(int type, void *__pc, unsigned int nr_hits) 418static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
421{ 419{
422 unsigned long pc; 420 unsigned long pc;
423
424 if (prof_on != type || !prof_buffer)
425 return;
426 pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; 421 pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
427 atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); 422 atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
428} 423}
429#endif /* !CONFIG_SMP */ 424#endif /* !CONFIG_SMP */
425
426void profile_hits(int type, void *__pc, unsigned int nr_hits)
427{
428 if (prof_on != type || !prof_buffer)
429 return;
430 do_profile_hits(type, __pc, nr_hits);
431}
430EXPORT_SYMBOL_GPL(profile_hits); 432EXPORT_SYMBOL_GPL(profile_hits);
431 433
432void profile_tick(int type) 434void profile_tick(int type)
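
The profile.c change is a pure factoring: the prof_on/prof_buffer guard that was duplicated in the SMP and non-SMP variants of profile_hits() moves into a single exported wrapper, and the two variants become static helpers that may assume the precondition. The shape of the refactoring, with stand-in names and a trivial hit buffer:

#include <stdio.h>

static int prof_on = 1;
static int prof_buf[16];
static int *prof_buffer = prof_buf;

static void do_profile_hits(int type, void *pc, unsigned int nr_hits)
{
        /* may assume prof_on == type && prof_buffer != NULL */
        prof_buffer[0] += nr_hits;
}

void profile_hits(int type, void *pc, unsigned int nr_hits)
{
        if (prof_on != type || !prof_buffer)
                return;                 /* the guard lives in one place */
        do_profile_hits(type, pc, nr_hits);
}

int main(void)
{
        profile_hits(1, NULL, 3);
        printf("%d\n", prof_buffer[0]);
        return 0;
}
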
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index f07d2f03181a..89419ff92e99 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -36,7 +36,7 @@
36#include <linux/interrupt.h> 36#include <linux/interrupt.h>
37#include <linux/sched.h> 37#include <linux/sched.h>
38#include <linux/nmi.h> 38#include <linux/nmi.h>
39#include <asm/atomic.h> 39#include <linux/atomic.h>
40#include <linux/bitops.h> 40#include <linux/bitops.h>
41#include <linux/module.h> 41#include <linux/module.h>
42#include <linux/completion.h> 42#include <linux/completion.h>
@@ -95,7 +95,6 @@ static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
95DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); 95DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
96DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu); 96DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
97DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); 97DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
98static DEFINE_PER_CPU(wait_queue_head_t, rcu_cpu_wq);
99DEFINE_PER_CPU(char, rcu_cpu_has_work); 98DEFINE_PER_CPU(char, rcu_cpu_has_work);
100static char rcu_kthreads_spawnable; 99static char rcu_kthreads_spawnable;
101 100
@@ -163,7 +162,7 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
163#ifdef CONFIG_NO_HZ 162#ifdef CONFIG_NO_HZ
164DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 163DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
165 .dynticks_nesting = 1, 164 .dynticks_nesting = 1,
166 .dynticks = 1, 165 .dynticks = ATOMIC_INIT(1),
167}; 166};
168#endif /* #ifdef CONFIG_NO_HZ */ 167#endif /* #ifdef CONFIG_NO_HZ */
169 168
@@ -322,13 +321,25 @@ void rcu_enter_nohz(void)
322 unsigned long flags; 321 unsigned long flags;
323 struct rcu_dynticks *rdtp; 322 struct rcu_dynticks *rdtp;
324 323
325 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
326 local_irq_save(flags); 324 local_irq_save(flags);
327 rdtp = &__get_cpu_var(rcu_dynticks); 325 rdtp = &__get_cpu_var(rcu_dynticks);
328 rdtp->dynticks++; 326 if (--rdtp->dynticks_nesting) {
329 rdtp->dynticks_nesting--; 327 local_irq_restore(flags);
330 WARN_ON_ONCE(rdtp->dynticks & 0x1); 328 return;
329 }
330 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
331 smp_mb__before_atomic_inc(); /* See above. */
332 atomic_inc(&rdtp->dynticks);
333 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
334 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
331 local_irq_restore(flags); 335 local_irq_restore(flags);
336
337 /* If the interrupt queued a callback, get out of dyntick mode. */
338 if (in_irq() &&
339 (__get_cpu_var(rcu_sched_data).nxtlist ||
340 __get_cpu_var(rcu_bh_data).nxtlist ||
341 rcu_preempt_needs_cpu(smp_processor_id())))
342 set_need_resched();
332} 343}
333 344
334/* 345/*
@@ -344,11 +355,16 @@ void rcu_exit_nohz(void)
344 355
345 local_irq_save(flags); 356 local_irq_save(flags);
346 rdtp = &__get_cpu_var(rcu_dynticks); 357 rdtp = &__get_cpu_var(rcu_dynticks);
347 rdtp->dynticks++; 358 if (rdtp->dynticks_nesting++) {
348 rdtp->dynticks_nesting++; 359 local_irq_restore(flags);
349 WARN_ON_ONCE(!(rdtp->dynticks & 0x1)); 360 return;
361 }
362 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */
363 atomic_inc(&rdtp->dynticks);
364 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
365 smp_mb__after_atomic_inc(); /* See above. */
366 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
350 local_irq_restore(flags); 367 local_irq_restore(flags);
351 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
352} 368}
353 369
354/** 370/**
@@ -362,11 +378,15 @@ void rcu_nmi_enter(void)
362{ 378{
363 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 379 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
364 380
365 if (rdtp->dynticks & 0x1) 381 if (rdtp->dynticks_nmi_nesting == 0 &&
382 (atomic_read(&rdtp->dynticks) & 0x1))
366 return; 383 return;
367 rdtp->dynticks_nmi++; 384 rdtp->dynticks_nmi_nesting++;
368 WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1)); 385 smp_mb__before_atomic_inc(); /* Force delay from prior write. */
369 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ 386 atomic_inc(&rdtp->dynticks);
387 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
388 smp_mb__after_atomic_inc(); /* See above. */
389 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
370} 390}
371 391
372/** 392/**
@@ -380,11 +400,14 @@ void rcu_nmi_exit(void)
380{ 400{
381 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 401 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
382 402
383 if (rdtp->dynticks & 0x1) 403 if (rdtp->dynticks_nmi_nesting == 0 ||
404 --rdtp->dynticks_nmi_nesting != 0)
384 return; 405 return;
385 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ 406 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
386 rdtp->dynticks_nmi++; 407 smp_mb__before_atomic_inc(); /* See above. */
387 WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1); 408 atomic_inc(&rdtp->dynticks);
409 smp_mb__after_atomic_inc(); /* Force delay to next write. */
410 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
388} 411}
389 412
390/** 413/**
@@ -395,13 +418,7 @@ void rcu_nmi_exit(void)
395 */ 418 */
396void rcu_irq_enter(void) 419void rcu_irq_enter(void)
397{ 420{
398 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 421 rcu_exit_nohz();
399
400 if (rdtp->dynticks_nesting++)
401 return;
402 rdtp->dynticks++;
403 WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
404 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
405} 422}
406 423
407/** 424/**
@@ -413,18 +430,7 @@ void rcu_irq_enter(void)
413 */ 430 */
414void rcu_irq_exit(void) 431void rcu_irq_exit(void)
415{ 432{
416 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 433 rcu_enter_nohz();
417
418 if (--rdtp->dynticks_nesting)
419 return;
420 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
421 rdtp->dynticks++;
422 WARN_ON_ONCE(rdtp->dynticks & 0x1);
423
424 /* If the interrupt queued a callback, get out of dyntick mode. */
425 if (__this_cpu_read(rcu_sched_data.nxtlist) ||
426 __this_cpu_read(rcu_bh_data.nxtlist))
427 set_need_resched();
428} 434}
429 435
430#ifdef CONFIG_SMP 436#ifdef CONFIG_SMP
@@ -436,19 +442,8 @@ void rcu_irq_exit(void)
436 */ 442 */
437static int dyntick_save_progress_counter(struct rcu_data *rdp) 443static int dyntick_save_progress_counter(struct rcu_data *rdp)
438{ 444{
439 int ret; 445 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
440 int snap; 446 return 0;
441 int snap_nmi;
442
443 snap = rdp->dynticks->dynticks;
444 snap_nmi = rdp->dynticks->dynticks_nmi;
445 smp_mb(); /* Order sampling of snap with end of grace period. */
446 rdp->dynticks_snap = snap;
447 rdp->dynticks_nmi_snap = snap_nmi;
448 ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0);
449 if (ret)
450 rdp->dynticks_fqs++;
451 return ret;
452} 447}
453 448
454/* 449/*
@@ -459,16 +454,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
459 */ 454 */
460static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) 455static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
461{ 456{
462 long curr; 457 unsigned long curr;
463 long curr_nmi; 458 unsigned long snap;
464 long snap;
465 long snap_nmi;
466 459
467 curr = rdp->dynticks->dynticks; 460 curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks);
468 snap = rdp->dynticks_snap; 461 snap = (unsigned long)rdp->dynticks_snap;
469 curr_nmi = rdp->dynticks->dynticks_nmi;
470 snap_nmi = rdp->dynticks_nmi_snap;
471 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
472 462
473 /* 463 /*
474 * If the CPU passed through or entered a dynticks idle phase with 464 * If the CPU passed through or entered a dynticks idle phase with
@@ -478,8 +468,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
478 * read-side critical section that started before the beginning 468 * read-side critical section that started before the beginning
479 * of the current RCU grace period. 469 * of the current RCU grace period.
480 */ 470 */
481 if ((curr != snap || (curr & 0x1) == 0) && 471 if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) {
482 (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) {
483 rdp->dynticks_fqs++; 472 rdp->dynticks_fqs++;
484 return 1; 473 return 1;
485 } 474 }
@@ -908,6 +897,12 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
908 unsigned long gp_duration; 897 unsigned long gp_duration;
909 898
910 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 899 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
900
901 /*
902 * Ensure that all grace-period and pre-grace-period activity
903 * is seen before the assignment to rsp->completed.
904 */
905 smp_mb(); /* See above block comment. */
911 gp_duration = jiffies - rsp->gp_start; 906 gp_duration = jiffies - rsp->gp_start;
912 if (gp_duration > rsp->gp_max) 907 if (gp_duration > rsp->gp_max)
913 rsp->gp_max = gp_duration; 908 rsp->gp_max = gp_duration;
@@ -1455,25 +1450,11 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1455 */ 1450 */
1456static void rcu_process_callbacks(void) 1451static void rcu_process_callbacks(void)
1457{ 1452{
1458 /*
1459 * Memory references from any prior RCU read-side critical sections
1460 * executed by the interrupted code must be seen before any RCU
1461 * grace-period manipulations below.
1462 */
1463 smp_mb(); /* See above block comment. */
1464
1465 __rcu_process_callbacks(&rcu_sched_state, 1453 __rcu_process_callbacks(&rcu_sched_state,
1466 &__get_cpu_var(rcu_sched_data)); 1454 &__get_cpu_var(rcu_sched_data));
1467 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); 1455 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1468 rcu_preempt_process_callbacks(); 1456 rcu_preempt_process_callbacks();
1469 1457
1470 /*
1471 * Memory references from any later RCU read-side critical sections
1472 * executed by the interrupted code must be seen after any RCU
1473 * grace-period manipulations above.
1474 */
1475 smp_mb(); /* See above block comment. */
1476
1477 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */ 1458 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */
1478 rcu_needs_cpu_flush(); 1459 rcu_needs_cpu_flush();
1479} 1460}
@@ -1494,7 +1475,7 @@ static void invoke_rcu_cpu_kthread(void)
1494 local_irq_restore(flags); 1475 local_irq_restore(flags);
1495 return; 1476 return;
1496 } 1477 }
1497 wake_up(&__get_cpu_var(rcu_cpu_wq)); 1478 wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
1498 local_irq_restore(flags); 1479 local_irq_restore(flags);
1499} 1480}
1500 1481
@@ -1544,13 +1525,10 @@ static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
1544 */ 1525 */
1545static void rcu_cpu_kthread_timer(unsigned long arg) 1526static void rcu_cpu_kthread_timer(unsigned long arg)
1546{ 1527{
1547 unsigned long flags;
1548 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg); 1528 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
1549 struct rcu_node *rnp = rdp->mynode; 1529 struct rcu_node *rnp = rdp->mynode;
1550 1530
1551 raw_spin_lock_irqsave(&rnp->lock, flags); 1531 atomic_or(rdp->grpmask, &rnp->wakemask);
1552 rnp->wakemask |= rdp->grpmask;
1553 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1554 invoke_rcu_node_kthread(rnp); 1532 invoke_rcu_node_kthread(rnp);
1555} 1533}
1556 1534
@@ -1617,14 +1595,12 @@ static int rcu_cpu_kthread(void *arg)
1617 unsigned long flags; 1595 unsigned long flags;
1618 int spincnt = 0; 1596 int spincnt = 0;
1619 unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu); 1597 unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
1620 wait_queue_head_t *wqp = &per_cpu(rcu_cpu_wq, cpu);
1621 char work; 1598 char work;
1622 char *workp = &per_cpu(rcu_cpu_has_work, cpu); 1599 char *workp = &per_cpu(rcu_cpu_has_work, cpu);
1623 1600
1624 for (;;) { 1601 for (;;) {
1625 *statusp = RCU_KTHREAD_WAITING; 1602 *statusp = RCU_KTHREAD_WAITING;
1626 wait_event_interruptible(*wqp, 1603 rcu_wait(*workp != 0 || kthread_should_stop());
1627 *workp != 0 || kthread_should_stop());
1628 local_bh_disable(); 1604 local_bh_disable();
1629 if (rcu_cpu_kthread_should_stop(cpu)) { 1605 if (rcu_cpu_kthread_should_stop(cpu)) {
1630 local_bh_enable(); 1606 local_bh_enable();
@@ -1675,7 +1651,6 @@ static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
1675 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; 1651 per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
1676 WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL); 1652 WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
1677 per_cpu(rcu_cpu_kthread_task, cpu) = t; 1653 per_cpu(rcu_cpu_kthread_task, cpu) = t;
1678 wake_up_process(t);
1679 sp.sched_priority = RCU_KTHREAD_PRIO; 1654 sp.sched_priority = RCU_KTHREAD_PRIO;
1680 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); 1655 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1681 return 0; 1656 return 0;
@@ -1698,11 +1673,10 @@ static int rcu_node_kthread(void *arg)
1698 1673
1699 for (;;) { 1674 for (;;) {
1700 rnp->node_kthread_status = RCU_KTHREAD_WAITING; 1675 rnp->node_kthread_status = RCU_KTHREAD_WAITING;
1701 wait_event_interruptible(rnp->node_wq, rnp->wakemask != 0); 1676 rcu_wait(atomic_read(&rnp->wakemask) != 0);
1702 rnp->node_kthread_status = RCU_KTHREAD_RUNNING; 1677 rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
1703 raw_spin_lock_irqsave(&rnp->lock, flags); 1678 raw_spin_lock_irqsave(&rnp->lock, flags);
1704 mask = rnp->wakemask; 1679 mask = atomic_xchg(&rnp->wakemask, 0);
1705 rnp->wakemask = 0;
1706 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ 1680 rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
1707 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) { 1681 for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
1708 if ((mask & 0x1) == 0) 1682 if ((mask & 0x1) == 0)
@@ -1783,13 +1757,14 @@ static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
1783 raw_spin_lock_irqsave(&rnp->lock, flags); 1757 raw_spin_lock_irqsave(&rnp->lock, flags);
1784 rnp->node_kthread_task = t; 1758 rnp->node_kthread_task = t;
1785 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1759 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1786 wake_up_process(t);
1787 sp.sched_priority = 99; 1760 sp.sched_priority = 99;
1788 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); 1761 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1789 } 1762 }
1790 return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index); 1763 return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
1791} 1764}
1792 1765
1766static void rcu_wake_one_boost_kthread(struct rcu_node *rnp);
1767
1793/* 1768/*
1794 * Spawn all kthreads -- called as soon as the scheduler is running. 1769 * Spawn all kthreads -- called as soon as the scheduler is running.
1795 */ 1770 */
@@ -1797,24 +1772,31 @@ static int __init rcu_spawn_kthreads(void)
1797{ 1772{
1798 int cpu; 1773 int cpu;
1799 struct rcu_node *rnp; 1774 struct rcu_node *rnp;
1775 struct task_struct *t;
1800 1776
1801 rcu_kthreads_spawnable = 1; 1777 rcu_kthreads_spawnable = 1;
1802 for_each_possible_cpu(cpu) { 1778 for_each_possible_cpu(cpu) {
1803 init_waitqueue_head(&per_cpu(rcu_cpu_wq, cpu));
1804 per_cpu(rcu_cpu_has_work, cpu) = 0; 1779 per_cpu(rcu_cpu_has_work, cpu) = 0;
1805 if (cpu_online(cpu)) 1780 if (cpu_online(cpu)) {
1806 (void)rcu_spawn_one_cpu_kthread(cpu); 1781 (void)rcu_spawn_one_cpu_kthread(cpu);
1782 t = per_cpu(rcu_cpu_kthread_task, cpu);
1783 if (t)
1784 wake_up_process(t);
1785 }
1807 } 1786 }
1808 rnp = rcu_get_root(rcu_state); 1787 rnp = rcu_get_root(rcu_state);
1809 init_waitqueue_head(&rnp->node_wq);
1810 rcu_init_boost_waitqueue(rnp);
1811 (void)rcu_spawn_one_node_kthread(rcu_state, rnp); 1788 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1812 if (NUM_RCU_NODES > 1) 1789 if (rnp->node_kthread_task)
1790 wake_up_process(rnp->node_kthread_task);
1791 if (NUM_RCU_NODES > 1) {
1813 rcu_for_each_leaf_node(rcu_state, rnp) { 1792 rcu_for_each_leaf_node(rcu_state, rnp) {
1814 init_waitqueue_head(&rnp->node_wq);
1815 rcu_init_boost_waitqueue(rnp);
1816 (void)rcu_spawn_one_node_kthread(rcu_state, rnp); 1793 (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
1794 t = rnp->node_kthread_task;
1795 if (t)
1796 wake_up_process(t);
1797 rcu_wake_one_boost_kthread(rnp);
1817 } 1798 }
1799 }
1818 return 0; 1800 return 0;
1819} 1801}
1820early_initcall(rcu_spawn_kthreads); 1802early_initcall(rcu_spawn_kthreads);
@@ -2218,14 +2200,14 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2218 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 2200 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
2219} 2201}
2220 2202
2221static void __cpuinit rcu_online_cpu(int cpu) 2203static void __cpuinit rcu_prepare_cpu(int cpu)
2222{ 2204{
2223 rcu_init_percpu_data(cpu, &rcu_sched_state, 0); 2205 rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
2224 rcu_init_percpu_data(cpu, &rcu_bh_state, 0); 2206 rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
2225 rcu_preempt_init_percpu_data(cpu); 2207 rcu_preempt_init_percpu_data(cpu);
2226} 2208}
2227 2209
2228static void __cpuinit rcu_online_kthreads(int cpu) 2210static void __cpuinit rcu_prepare_kthreads(int cpu)
2229{ 2211{
2230 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 2212 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2231 struct rcu_node *rnp = rdp->mynode; 2213 struct rcu_node *rnp = rdp->mynode;
@@ -2239,6 +2221,31 @@ static void __cpuinit rcu_online_kthreads(int cpu)
2239} 2221}
2240 2222
2241/* 2223/*
2224 * kthread_create() creates threads in TASK_UNINTERRUPTIBLE state,
2225 * but the RCU threads are woken on demand, and if demand is low this
2226 * could be a while, triggering the hung task watchdog.
2227 *
2228 * In order to avoid this, poke all tasks once the CPU is fully
2229 * up and running.
2230 */
2231static void __cpuinit rcu_online_kthreads(int cpu)
2232{
2233 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
2234 struct rcu_node *rnp = rdp->mynode;
2235 struct task_struct *t;
2236
2237 t = per_cpu(rcu_cpu_kthread_task, cpu);
2238 if (t)
2239 wake_up_process(t);
2240
2241 t = rnp->node_kthread_task;
2242 if (t)
2243 wake_up_process(t);
2244
2245 rcu_wake_one_boost_kthread(rnp);
2246}
2247
2248/*
2242 * Handle CPU online/offline notification events. 2249 * Handle CPU online/offline notification events.
2243 */ 2250 */
2244static int __cpuinit rcu_cpu_notify(struct notifier_block *self, 2251static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
@@ -2251,10 +2258,11 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2251 switch (action) { 2258 switch (action) {
2252 case CPU_UP_PREPARE: 2259 case CPU_UP_PREPARE:
2253 case CPU_UP_PREPARE_FROZEN: 2260 case CPU_UP_PREPARE_FROZEN:
2254 rcu_online_cpu(cpu); 2261 rcu_prepare_cpu(cpu);
2255 rcu_online_kthreads(cpu); 2262 rcu_prepare_kthreads(cpu);
2256 break; 2263 break;
2257 case CPU_ONLINE: 2264 case CPU_ONLINE:
2265 rcu_online_kthreads(cpu);
2258 case CPU_DOWN_FAILED: 2266 case CPU_DOWN_FAILED:
2259 rcu_node_kthread_setaffinity(rnp, -1); 2267 rcu_node_kthread_setaffinity(rnp, -1);
2260 rcu_cpu_kthread_setrt(cpu, 1); 2268 rcu_cpu_kthread_setrt(cpu, 1);
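
The rcutree.c rework collapses the separate dynticks and dynticks_nmi counters into one atomic_t with an even/odd protocol: the counter is odd while the CPU is non-idle and even while it is idle, and atomic_add_return(0, ...) serves as a fully ordered read. A grace period may treat a CPU as quiescent if its counter is even, or has advanced by at least two since the snapshot, because the CPU must then have passed through idle in between. A userspace sketch of the snapshot-and-compare logic, with seq_cst atomics standing in for the kernel's barrier pairs:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static _Atomic unsigned int dynticks = 1;       /* odd: starts non-idle */

static void enter_idle(void) { atomic_fetch_add(&dynticks, 1); } /* -> even */
static void exit_idle(void)  { atomic_fetch_add(&dynticks, 1); } /* -> odd */

static bool in_quiescent_state(unsigned int snap)
{
        unsigned int curr = atomic_load(&dynticks);

        /* even now, or passed through idle since the snapshot */
        return (curr & 0x1) == 0 || curr - snap >= 2;
}

int main(void)
{
        unsigned int snap = atomic_load(&dynticks);

        printf("%d\n", in_quiescent_state(snap));       /* 0: still running */
        enter_idle();
        exit_idle();
        printf("%d\n", in_quiescent_state(snap));       /* 1: passed idle */
        return 0;
}
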
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 257664815d5d..7b9a08b4aaea 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -84,11 +84,9 @@
84 * Dynticks per-CPU state. 84 * Dynticks per-CPU state.
85 */ 85 */
86struct rcu_dynticks { 86struct rcu_dynticks {
87 int dynticks_nesting; /* Track nesting level, sort of. */ 87 int dynticks_nesting; /* Track irq/process nesting level. */
88 int dynticks; /* Even value for dynticks-idle, else odd. */ 88 int dynticks_nmi_nesting; /* Track NMI nesting level. */
89 int dynticks_nmi; /* Even value for either dynticks-idle or */ 89 atomic_t dynticks; /* Even value for dynticks-idle, else odd. */
90 /* not in nmi handler, else odd. So this */
91 /* remains even for nmi from irq handler. */
92}; 90};
93 91
94/* RCU's kthread states for tracing. */ 92/* RCU's kthread states for tracing. */
@@ -121,7 +119,9 @@ struct rcu_node {
121 /* elements that need to drain to allow the */ 119 /* elements that need to drain to allow the */
122 /* current expedited grace period to */ 120 /* current expedited grace period to */
123 /* complete (only for TREE_PREEMPT_RCU). */ 121 /* complete (only for TREE_PREEMPT_RCU). */
124 unsigned long wakemask; /* CPUs whose kthread needs to be awakened. */ 122 atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */
123 /* Since this has meaning only for leaf */
124 /* rcu_node structures, 32 bits suffices. */
125 unsigned long qsmaskinit; 125 unsigned long qsmaskinit;
126 /* Per-GP initial value for qsmask & expmask. */ 126 /* Per-GP initial value for qsmask & expmask. */
127 unsigned long grpmask; /* Mask to apply to parent qsmask. */ 127 unsigned long grpmask; /* Mask to apply to parent qsmask. */
@@ -159,9 +159,6 @@ struct rcu_node {
159 struct task_struct *boost_kthread_task; 159 struct task_struct *boost_kthread_task;
160 /* kthread that takes care of priority */ 160 /* kthread that takes care of priority */
161 /* boosting for this rcu_node structure. */ 161 /* boosting for this rcu_node structure. */
162 wait_queue_head_t boost_wq;
163 /* Wait queue on which to park the boost */
164 /* kthread. */
165 unsigned int boost_kthread_status; 162 unsigned int boost_kthread_status;
166 /* State of boost_kthread_task for tracing. */ 163 /* State of boost_kthread_task for tracing. */
167 unsigned long n_tasks_boosted; 164 unsigned long n_tasks_boosted;
@@ -188,9 +185,6 @@ struct rcu_node {
188 /* kthread that takes care of this rcu_node */ 185 /* kthread that takes care of this rcu_node */
189 /* structure, for example, awakening the */ 186 /* structure, for example, awakening the */
190 /* per-CPU kthreads as needed. */ 187 /* per-CPU kthreads as needed. */
191 wait_queue_head_t node_wq;
192 /* Wait queue on which to park the per-node */
193 /* kthread. */
194 unsigned int node_kthread_status; 188 unsigned int node_kthread_status;
195 /* State of node_kthread_task for tracing. */ 189 /* State of node_kthread_task for tracing. */
196} ____cacheline_internodealigned_in_smp; 190} ____cacheline_internodealigned_in_smp;
@@ -284,7 +278,6 @@ struct rcu_data {
284 /* 3) dynticks interface. */ 278 /* 3) dynticks interface. */
285 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ 279 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
286 int dynticks_snap; /* Per-GP tracking for dynticks. */ 280 int dynticks_snap; /* Per-GP tracking for dynticks. */
287 int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */
288#endif /* #ifdef CONFIG_NO_HZ */ 281#endif /* #ifdef CONFIG_NO_HZ */
289 282
290 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ 283 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
@@ -337,6 +330,16 @@ struct rcu_data {
337 /* scheduling clock irq */ 330 /* scheduling clock irq */
338 /* before ratting on them. */ 331 /* before ratting on them. */
339 332
333#define rcu_wait(cond) \
334do { \
335 for (;;) { \
336 set_current_state(TASK_INTERRUPTIBLE); \
337 if (cond) \
338 break; \
339 schedule(); \
340 } \
341 __set_current_state(TASK_RUNNING); \
342} while (0)
340 343
341/* 344/*
342 * RCU global state, including node hierarchy. This hierarchy is 345 * RCU global state, including node hierarchy. This hierarchy is
@@ -446,7 +449,6 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
446static void rcu_preempt_send_cbs_to_online(void); 449static void rcu_preempt_send_cbs_to_online(void);
447static void __init __rcu_init_preempt(void); 450static void __init __rcu_init_preempt(void);
448static void rcu_needs_cpu_flush(void); 451static void rcu_needs_cpu_flush(void);
449static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp);
450static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 452static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
451static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, 453static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
452 cpumask_var_t cm); 454 cpumask_var_t cm);
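The new rcu_wait() macro above is what lets the boost_wq/node_wq waitqueues be deleted: each kthread is woken directly with wake_up_process(), so a bare prepare-to-sleep loop suffices and no wait_queue_head_t needs to live in struct rcu_node. A hedged sketch of a kthread body using the same loop (demo names are hypothetical):

#include <linux/kthread.h>
#include <linux/sched.h>
#include <asm/atomic.h>

/* The same shape rcu_wait(cond) expands to, inside a kthread. */
static int demo_waiter(void *arg)
{
        atomic_t *work = arg;

        while (!kthread_should_stop()) {
                for (;;) {
                        /* Publish the intent to sleep first... */
                        set_current_state(TASK_INTERRUPTIBLE);
                        /* ...then re-check, so a wake_up_process() that
                         * lands between the check and schedule() is not
                         * lost. */
                        if (atomic_read(work) || kthread_should_stop())
                                break;
                        schedule();
                }
                __set_current_state(TASK_RUNNING);
                atomic_set(work, 0);    /* consume the work */
        }
        return 0;
}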
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 3f6559a5f5cd..c8bff3099a89 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1196,8 +1196,7 @@ static int rcu_boost_kthread(void *arg)
1196 1196
1197 for (;;) { 1197 for (;;) {
1198 rnp->boost_kthread_status = RCU_KTHREAD_WAITING; 1198 rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
1199 wait_event_interruptible(rnp->boost_wq, rnp->boost_tasks || 1199 rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
1200 rnp->exp_tasks);
1201 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; 1200 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
1202 more2boost = rcu_boost(rnp); 1201 more2boost = rcu_boost(rnp);
1203 if (more2boost) 1202 if (more2boost)
@@ -1275,14 +1274,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1275} 1274}
1276 1275
1277/* 1276/*
1278 * Initialize the RCU-boost waitqueue.
1279 */
1280static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp)
1281{
1282 init_waitqueue_head(&rnp->boost_wq);
1283}
1284
1285/*
1286 * Create an RCU-boost kthread for the specified node if one does not 1277 * Create an RCU-boost kthread for the specified node if one does not
1287 * already exist. We only create this kthread for preemptible RCU. 1278 * already exist. We only create this kthread for preemptible RCU.
1288 * Returns zero if all is well, a negated errno otherwise. 1279 * Returns zero if all is well, a negated errno otherwise.
@@ -1306,12 +1297,17 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1306 raw_spin_lock_irqsave(&rnp->lock, flags); 1297 raw_spin_lock_irqsave(&rnp->lock, flags);
1307 rnp->boost_kthread_task = t; 1298 rnp->boost_kthread_task = t;
1308 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1299 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1309 wake_up_process(t);
1310 sp.sched_priority = RCU_KTHREAD_PRIO; 1300 sp.sched_priority = RCU_KTHREAD_PRIO;
1311 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); 1301 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1312 return 0; 1302 return 0;
1313} 1303}
1314 1304
1305static void __cpuinit rcu_wake_one_boost_kthread(struct rcu_node *rnp)
1306{
1307 if (rnp->boost_kthread_task)
1308 wake_up_process(rnp->boost_kthread_task);
1309}
1310
1315#else /* #ifdef CONFIG_RCU_BOOST */ 1311#else /* #ifdef CONFIG_RCU_BOOST */
1316 1312
1317static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) 1313static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
@@ -1328,10 +1324,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1328{ 1324{
1329} 1325}
1330 1326
1331static void __init rcu_init_boost_waitqueue(struct rcu_node *rnp)
1332{
1333}
1334
1335static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, 1327static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1336 struct rcu_node *rnp, 1328 struct rcu_node *rnp,
1337 int rnp_index) 1329 int rnp_index)
@@ -1339,6 +1331,10 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1339 return 0; 1331 return 0;
1340} 1332}
1341 1333
1334static void __cpuinit rcu_wake_one_boost_kthread(struct rcu_node *rnp)
1335{
1336}
1337
1342#endif /* #else #ifdef CONFIG_RCU_BOOST */ 1338#endif /* #else #ifdef CONFIG_RCU_BOOST */
1343 1339
1344#ifndef CONFIG_SMP 1340#ifndef CONFIG_SMP
@@ -1520,7 +1516,6 @@ int rcu_needs_cpu(int cpu)
1520{ 1516{
1521 int c = 0; 1517 int c = 0;
1522 int snap; 1518 int snap;
1523 int snap_nmi;
1524 int thatcpu; 1519 int thatcpu;
1525 1520
1526 /* Check for being in the holdoff period. */ 1521 /* Check for being in the holdoff period. */
@@ -1531,10 +1526,10 @@ int rcu_needs_cpu(int cpu)
1531 for_each_online_cpu(thatcpu) { 1526 for_each_online_cpu(thatcpu) {
1532 if (thatcpu == cpu) 1527 if (thatcpu == cpu)
1533 continue; 1528 continue;
1534 snap = per_cpu(rcu_dynticks, thatcpu).dynticks; 1529 snap = atomic_add_return(0, &per_cpu(rcu_dynticks,
1535 snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi; 1530 thatcpu).dynticks);
1536 smp_mb(); /* Order sampling of snap with end of grace period. */ 1531 smp_mb(); /* Order sampling of snap with end of grace period. */
1537 if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) { 1532 if ((snap & 0x1) != 0) {
1538 per_cpu(rcu_dyntick_drain, cpu) = 0; 1533 per_cpu(rcu_dyntick_drain, cpu) = 0;
1539 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 1534 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1540 return rcu_needs_cpu_quick_check(cpu); 1535 return rcu_needs_cpu_quick_check(cpu);
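Folding dynticks_nmi into a single atomic_t keeps the even/odd convention: the counter is bumped on every idle entry and exit, so an odd snapshot means the CPU is not in dynticks-idle, whether it got there via irq or NMI. The atomic_add_return(0, ...) above is simply a fully ordered read. A userspace C11 model of the convention, for illustration only:

#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int dynticks;     /* even: dynticks-idle, odd: active */

static void exit_idle(void)  { atomic_fetch_add(&dynticks, 1); }  /* -> odd */
static void enter_idle(void) { atomic_fetch_add(&dynticks, 1); }  /* -> even */

/* What rcu_needs_cpu() does per remote CPU: snapshot, then test parity. */
static bool cpu_is_active(void)
{
        /* seq_cst load models the fully ordered atomic_add_return(0, ...) */
        return atomic_load(&dynticks) & 0x1;
}

int main(void)
{
        exit_idle();
        assert(cpu_is_active());
        enter_idle();
        assert(!cpu_is_active());
        return 0;
}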
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index aa0fd72b4bc7..9678cc3650f5 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -69,10 +69,10 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
69 rdp->passed_quiesc, rdp->passed_quiesc_completed, 69 rdp->passed_quiesc, rdp->passed_quiesc_completed,
70 rdp->qs_pending); 70 rdp->qs_pending);
71#ifdef CONFIG_NO_HZ 71#ifdef CONFIG_NO_HZ
72 seq_printf(m, " dt=%d/%d dn=%d df=%lu", 72 seq_printf(m, " dt=%d/%d/%d df=%lu",
73 rdp->dynticks->dynticks, 73 atomic_read(&rdp->dynticks->dynticks),
74 rdp->dynticks->dynticks_nesting, 74 rdp->dynticks->dynticks_nesting,
75 rdp->dynticks->dynticks_nmi, 75 rdp->dynticks->dynticks_nmi_nesting,
76 rdp->dynticks_fqs); 76 rdp->dynticks_fqs);
77#endif /* #ifdef CONFIG_NO_HZ */ 77#endif /* #ifdef CONFIG_NO_HZ */
78 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); 78 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
@@ -141,9 +141,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
141 rdp->qs_pending); 141 rdp->qs_pending);
142#ifdef CONFIG_NO_HZ 142#ifdef CONFIG_NO_HZ
143 seq_printf(m, ",%d,%d,%d,%lu", 143 seq_printf(m, ",%d,%d,%d,%lu",
144 rdp->dynticks->dynticks, 144 atomic_read(&rdp->dynticks->dynticks),
145 rdp->dynticks->dynticks_nesting, 145 rdp->dynticks->dynticks_nesting,
146 rdp->dynticks->dynticks_nmi, 146 rdp->dynticks->dynticks_nmi_nesting,
147 rdp->dynticks_fqs); 147 rdp->dynticks_fqs);
148#endif /* #ifdef CONFIG_NO_HZ */ 148#endif /* #ifdef CONFIG_NO_HZ */
149 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); 149 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
@@ -167,7 +167,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
167{ 167{
168 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\","); 168 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",");
169#ifdef CONFIG_NO_HZ 169#ifdef CONFIG_NO_HZ
170 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); 170 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
171#endif /* #ifdef CONFIG_NO_HZ */ 171#endif /* #ifdef CONFIG_NO_HZ */
172 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n"); 172 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n");
173#ifdef CONFIG_TREE_PREEMPT_RCU 173#ifdef CONFIG_TREE_PREEMPT_RCU
diff --git a/kernel/sched.c b/kernel/sched.c
index 2d12893b8b0f..3f2e502d609b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -605,10 +605,10 @@ static inline int cpu_of(struct rq *rq)
605/* 605/*
606 * Return the group to which this task belongs. 606 * Return the group to which this task belongs.
607 * 607 *
608 * We use task_subsys_state_check() and extend the RCU verification 608 * We use task_subsys_state_check() and extend the RCU verification with
609 * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach() 609 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
610 * holds that lock for each task it moves into the cgroup. Therefore 610 * task it moves into the cgroup. Therefore by holding either of those locks,
611 * by holding that lock, we pin the task to the current cgroup. 611 * we pin the task to the current cgroup.
612 */ 612 */
613static inline struct task_group *task_group(struct task_struct *p) 613static inline struct task_group *task_group(struct task_struct *p)
614{ 614{
@@ -616,7 +616,8 @@ static inline struct task_group *task_group(struct task_struct *p)
616 struct cgroup_subsys_state *css; 616 struct cgroup_subsys_state *css;
617 617
618 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 618 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
619 lockdep_is_held(&p->pi_lock)); 619 lockdep_is_held(&p->pi_lock) ||
620 lockdep_is_held(&task_rq(p)->lock));
620 tg = container_of(css, struct task_group, css); 621 tg = container_of(css, struct task_group, css);
621 622
622 return autogroup_task_group(p, tg); 623 return autogroup_task_group(p, tg);
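task_subsys_state_check() is built on rcu_dereference_check(), which accepts extra "or this lock is held" conditions, so either an RCU read-side critical section or one of the pinning locks legitimizes the dereference under lockdep. A minimal sketch of that idiom on a hypothetical structure:

#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct demo_state;

struct demo {
        struct demo_state __rcu *state;
        spinlock_t lock;        /* writers hold this while swapping ->state */
};

static struct demo_state *demo_state(struct demo *d)
{
        /* Legal under rcu_read_lock() OR while holding d->lock, the
         * same dual justification the task_group() hunk above adds
         * for p->pi_lock and rq->lock. */
        return rcu_dereference_check(d->state, lockdep_is_held(&d->lock));
}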
@@ -2200,6 +2201,16 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2200 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 2201 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2201 2202
2202#ifdef CONFIG_LOCKDEP 2203#ifdef CONFIG_LOCKDEP
2204 /*
2205 * The caller should hold either p->pi_lock or rq->lock, when changing
2206 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
2207 *
2208 * sched_move_task() holds both and thus holding either pins the cgroup,
2209 * see set_task_rq().
2210 *
2211 * Furthermore, all task_rq users should acquire both locks, see
2212 * task_rq_lock().
2213 */
2203 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || 2214 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2204 lockdep_is_held(&task_rq(p)->lock))); 2215 lockdep_is_held(&task_rq(p)->lock)));
2205#endif 2216#endif
@@ -2447,6 +2458,10 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2447 } 2458 }
2448 rcu_read_unlock(); 2459 rcu_read_unlock();
2449 } 2460 }
2461
2462 if (wake_flags & WF_MIGRATED)
2463 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2464
2450#endif /* CONFIG_SMP */ 2465#endif /* CONFIG_SMP */
2451 2466
2452 schedstat_inc(rq, ttwu_count); 2467 schedstat_inc(rq, ttwu_count);
@@ -2455,9 +2470,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2455 if (wake_flags & WF_SYNC) 2470 if (wake_flags & WF_SYNC)
2456 schedstat_inc(p, se.statistics.nr_wakeups_sync); 2471 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2457 2472
2458 if (cpu != task_cpu(p))
2459 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2460
2461#endif /* CONFIG_SCHEDSTATS */ 2473#endif /* CONFIG_SCHEDSTATS */
2462} 2474}
2463 2475
@@ -2573,7 +2585,26 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu)
2573 if (!next) 2585 if (!next)
2574 smp_send_reschedule(cpu); 2586 smp_send_reschedule(cpu);
2575} 2587}
2576#endif 2588
2589#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2590static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
2591{
2592 struct rq *rq;
2593 int ret = 0;
2594
2595 rq = __task_rq_lock(p);
2596 if (p->on_cpu) {
2597 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2598 ttwu_do_wakeup(rq, p, wake_flags);
2599 ret = 1;
2600 }
2601 __task_rq_unlock(rq);
2602
2603 return ret;
2604
2605}
2606#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2607#endif /* CONFIG_SMP */
2577 2608
2578static void ttwu_queue(struct task_struct *p, int cpu) 2609static void ttwu_queue(struct task_struct *p, int cpu)
2579{ 2610{
@@ -2581,6 +2612,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
2581 2612
2582#if defined(CONFIG_SMP) 2613#if defined(CONFIG_SMP)
2583 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { 2614 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
2615 sched_clock_cpu(cpu); /* sync clocks x-cpu */
2584 ttwu_queue_remote(p, cpu); 2616 ttwu_queue_remote(p, cpu);
2585 return; 2617 return;
2586 } 2618 }
@@ -2631,17 +2663,17 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2631 while (p->on_cpu) { 2663 while (p->on_cpu) {
2632#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 2664#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2633 /* 2665 /*
2634 * If called from interrupt context we could have landed in the 2666 * In case the architecture enables interrupts in
2635 * middle of schedule(), in this case we should take care not 2667 * context_switch(), we cannot busy wait, since that
2636 * to spin on ->on_cpu if p is current, since that would 2668 * would lead to deadlocks when an interrupt hits and
2637 * deadlock. 2669 * tries to wake up @prev. So bail and do a complete
2670 * remote wakeup.
2638 */ 2671 */
2639 if (p == current) { 2672 if (ttwu_activate_remote(p, wake_flags))
2640 ttwu_queue(p, cpu);
2641 goto stat; 2673 goto stat;
2642 } 2674#else
2643#endif
2644 cpu_relax(); 2675 cpu_relax();
2676#endif
2645 } 2677 }
2646 /* 2678 /*
2647 * Pairs with the smp_wmb() in finish_lock_switch(). 2679 * Pairs with the smp_wmb() in finish_lock_switch().
@@ -2655,8 +2687,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2655 p->sched_class->task_waking(p); 2687 p->sched_class->task_waking(p);
2656 2688
2657 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2689 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2658 if (task_cpu(p) != cpu) 2690 if (task_cpu(p) != cpu) {
2691 wake_flags |= WF_MIGRATED;
2659 set_task_cpu(p, cpu); 2692 set_task_cpu(p, cpu);
2693 }
2660#endif /* CONFIG_SMP */ 2694#endif /* CONFIG_SMP */
2661 2695
2662 ttwu_queue(p, cpu); 2696 ttwu_queue(p, cpu);
@@ -5841,7 +5875,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5841 idle->state = TASK_RUNNING; 5875 idle->state = TASK_RUNNING;
5842 idle->se.exec_start = sched_clock(); 5876 idle->se.exec_start = sched_clock();
5843 5877
5844 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 5878 do_set_cpus_allowed(idle, cpumask_of(cpu));
5845 /* 5879 /*
5846 * We're having a chicken and egg problem, even though we are 5880 * We're having a chicken and egg problem, even though we are
5847 * holding rq->lock, the cpu isn't yet set to this cpu so the 5881 * holding rq->lock, the cpu isn't yet set to this cpu so the
@@ -5929,6 +5963,16 @@ static inline void sched_init_granularity(void)
5929} 5963}
5930 5964
5931#ifdef CONFIG_SMP 5965#ifdef CONFIG_SMP
5966void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
5967{
5968 if (p->sched_class && p->sched_class->set_cpus_allowed)
5969 p->sched_class->set_cpus_allowed(p, new_mask);
5970 else {
5971 cpumask_copy(&p->cpus_allowed, new_mask);
5972 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
5973 }
5974}
5975
5932/* 5976/*
5933 * This is how migration works: 5977 * This is how migration works:
5934 * 5978 *
@@ -5974,12 +6018,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5974 goto out; 6018 goto out;
5975 } 6019 }
5976 6020
5977 if (p->sched_class->set_cpus_allowed) 6021 do_set_cpus_allowed(p, new_mask);
5978 p->sched_class->set_cpus_allowed(p, new_mask);
5979 else {
5980 cpumask_copy(&p->cpus_allowed, new_mask);
5981 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
5982 }
5983 6022
5984 /* Can the task run on the task's current CPU? If so, we're done */ 6023 /* Can the task run on the task's current CPU? If so, we're done */
5985 if (cpumask_test_cpu(task_cpu(p), new_mask)) 6024 if (cpumask_test_cpu(task_cpu(p), new_mask))
@@ -8764,42 +8803,10 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
8764 return 0; 8803 return 0;
8765} 8804}
8766 8805
8767static int
8768cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
8769 struct task_struct *tsk, bool threadgroup)
8770{
8771 int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
8772 if (retval)
8773 return retval;
8774 if (threadgroup) {
8775 struct task_struct *c;
8776 rcu_read_lock();
8777 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
8778 retval = cpu_cgroup_can_attach_task(cgrp, c);
8779 if (retval) {
8780 rcu_read_unlock();
8781 return retval;
8782 }
8783 }
8784 rcu_read_unlock();
8785 }
8786 return 0;
8787}
8788
8789static void 8806static void
8790cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 8807cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
8791 struct cgroup *old_cont, struct task_struct *tsk,
8792 bool threadgroup)
8793{ 8808{
8794 sched_move_task(tsk); 8809 sched_move_task(tsk);
8795 if (threadgroup) {
8796 struct task_struct *c;
8797 rcu_read_lock();
8798 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
8799 sched_move_task(c);
8800 }
8801 rcu_read_unlock();
8802 }
8803} 8810}
8804 8811
8805static void 8812static void
@@ -8887,8 +8894,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
8887 .name = "cpu", 8894 .name = "cpu",
8888 .create = cpu_cgroup_create, 8895 .create = cpu_cgroup_create,
8889 .destroy = cpu_cgroup_destroy, 8896 .destroy = cpu_cgroup_destroy,
8890 .can_attach = cpu_cgroup_can_attach, 8897 .can_attach_task = cpu_cgroup_can_attach_task,
8891 .attach = cpu_cgroup_attach, 8898 .attach_task = cpu_cgroup_attach_task,
8892 .exit = cpu_cgroup_exit, 8899 .exit = cpu_cgroup_exit,
8893 .populate = cpu_cgroup_populate, 8900 .populate = cpu_cgroup_populate,
8894 .subsys_id = cpu_cgroup_subsys_id, 8901 .subsys_id = cpu_cgroup_subsys_id,
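The conversion above drops the open-coded threadgroup loops: with the new .can_attach_task/.attach_task hooks, the cgroup core walks the threadgroup itself and invokes the subsystem once per task. A hedged sketch of a subsystem using the per-task hooks; the field names and signatures come from this hunk, while the demo_* names are hypothetical:

#include <linux/cgroup.h>

static int demo_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
        /* Veto a single task; no threadgroup iteration needed here. */
        return 0;
}

static void demo_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
        /* Move exactly one task; the core calls this once per thread. */
}

struct cgroup_subsys demo_subsys = {
        .name            = "demo",
        .can_attach_task = demo_can_attach_task,
        .attach_task     = demo_attach_task,
};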
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e32a9b70ee9c..433491c2dc8f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1076,8 +1076,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1076 se->on_rq = 0; 1076 se->on_rq = 0;
1077 update_cfs_load(cfs_rq, 0); 1077 update_cfs_load(cfs_rq, 0);
1078 account_entity_dequeue(cfs_rq, se); 1078 account_entity_dequeue(cfs_rq, se);
1079 update_min_vruntime(cfs_rq);
1080 update_cfs_shares(cfs_rq);
1081 1079
1082 /* 1080 /*
1083 * Normalize the entity after updating the min_vruntime because the 1081 * Normalize the entity after updating the min_vruntime because the
@@ -1086,6 +1084,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1086 */ 1084 */
1087 if (!(flags & DEQUEUE_SLEEP)) 1085 if (!(flags & DEQUEUE_SLEEP))
1088 se->vruntime -= cfs_rq->min_vruntime; 1086 se->vruntime -= cfs_rq->min_vruntime;
1087
1088 update_min_vruntime(cfs_rq);
1089 update_cfs_shares(cfs_rq);
1089} 1090}
1090 1091
1091/* 1092/*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 64b2a37c07d0..88725c939e0b 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1263,6 +1263,7 @@ static int find_lowest_rq(struct task_struct *task)
1263 if (!cpumask_test_cpu(this_cpu, lowest_mask)) 1263 if (!cpumask_test_cpu(this_cpu, lowest_mask))
1264 this_cpu = -1; /* Skip this_cpu opt if not among lowest */ 1264 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1265 1265
1266 rcu_read_lock();
1266 for_each_domain(cpu, sd) { 1267 for_each_domain(cpu, sd) {
1267 if (sd->flags & SD_WAKE_AFFINE) { 1268 if (sd->flags & SD_WAKE_AFFINE) {
1268 int best_cpu; 1269 int best_cpu;
@@ -1272,15 +1273,20 @@ static int find_lowest_rq(struct task_struct *task)
1272 * remote processor. 1273 * remote processor.
1273 */ 1274 */
1274 if (this_cpu != -1 && 1275 if (this_cpu != -1 &&
1275 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) 1276 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1277 rcu_read_unlock();
1276 return this_cpu; 1278 return this_cpu;
1279 }
1277 1280
1278 best_cpu = cpumask_first_and(lowest_mask, 1281 best_cpu = cpumask_first_and(lowest_mask,
1279 sched_domain_span(sd)); 1282 sched_domain_span(sd));
1280 if (best_cpu < nr_cpu_ids) 1283 if (best_cpu < nr_cpu_ids) {
1284 rcu_read_unlock();
1281 return best_cpu; 1285 return best_cpu;
1286 }
1282 } 1287 }
1283 } 1288 }
1289 rcu_read_unlock();
1284 1290
1285 /* 1291 /*
1286 * And finally, if there were no matches within the domains 1292 * And finally, if there were no matches within the domains
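The sched-domain tree is RCU-protected, so the lookup above must run inside rcu_read_lock(), and every early return has to drop the lock on its way out (sched_stats.h below gets the same conversion, replacing preempt_disable()). The shape of the pattern, sketched on a generic RCU-protected list:

#include <linux/types.h>
#include <linux/rcupdate.h>
#include <linux/rculist.h>

struct item {
        int key;
        struct list_head node;
};

static bool find_key(struct list_head *head, int key)
{
        struct item *it;

        rcu_read_lock();
        list_for_each_entry_rcu(it, head, node) {
                if (it->key == key) {
                        rcu_read_unlock();      /* pair on the early path too */
                        return true;
                }
        }
        rcu_read_unlock();
        return false;
}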
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 48ddf431db0e..331e01bcd026 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -37,7 +37,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
37 37
38#ifdef CONFIG_SMP 38#ifdef CONFIG_SMP
39 /* domain-specific stats */ 39 /* domain-specific stats */
40 preempt_disable(); 40 rcu_read_lock();
41 for_each_domain(cpu, sd) { 41 for_each_domain(cpu, sd) {
42 enum cpu_idle_type itype; 42 enum cpu_idle_type itype;
43 43
@@ -64,7 +64,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
64 sd->ttwu_wake_remote, sd->ttwu_move_affine, 64 sd->ttwu_wake_remote, sd->ttwu_move_affine,
65 sd->ttwu_move_balance); 65 sd->ttwu_move_balance);
66 } 66 }
67 preempt_enable(); 67 rcu_read_unlock();
68#endif 68#endif
69 } 69 }
70 kfree(mask_str); 70 kfree(mask_str);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4fc92445a29c..f175d98bd355 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -938,6 +938,12 @@ static struct ctl_table kern_table[] = {
938 }, 938 },
939#endif 939#endif
940#ifdef CONFIG_PERF_EVENTS 940#ifdef CONFIG_PERF_EVENTS
941 /*
942 * User-space scripts rely on the existence of this file
943 * as a feature check for perf_events being enabled.
944 *
945 * So it's an ABI, do not remove!
946 */
941 { 947 {
942 .procname = "perf_event_paranoid", 948 .procname = "perf_event_paranoid",
943 .data = &sysctl_perf_event_paranoid, 949 .data = &sysctl_perf_event_paranoid,
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index c027d4f602f1..e4c699dfa4e8 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -182,7 +182,10 @@ void clockevents_register_device(struct clock_event_device *dev)
182 unsigned long flags; 182 unsigned long flags;
183 183
184 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 184 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
185 BUG_ON(!dev->cpumask); 185 if (!dev->cpumask) {
186 WARN_ON(num_possible_cpus() > 1);
187 dev->cpumask = cpumask_of(smp_processor_id());
188 }
186 189
187 raw_spin_lock_irqsave(&clockevents_lock, flags); 190 raw_spin_lock_irqsave(&clockevents_lock, flags);
188 191
diff --git a/kernel/timer.c b/kernel/timer.c
index fd6198692b57..8cff36119e4d 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -749,16 +749,15 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
749 unsigned long expires_limit, mask; 749 unsigned long expires_limit, mask;
750 int bit; 750 int bit;
751 751
752 expires_limit = expires;
753
754 if (timer->slack >= 0) { 752 if (timer->slack >= 0) {
755 expires_limit = expires + timer->slack; 753 expires_limit = expires + timer->slack;
756 } else { 754 } else {
757 unsigned long now = jiffies; 755 long delta = expires - jiffies;
756
757 if (delta < 256)
758 return expires;
758 759
759 /* No slack, if already expired else auto slack 0.4% */ 760 expires_limit = expires + delta / 256;
760 if (time_after(expires, now))
761 expires_limit = expires + (expires - now)/256;
762 } 761 }
763 mask = expires ^ expires_limit; 762 mask = expires ^ expires_limit;
764 if (mask == 0) 763 if (mask == 0)
@@ -795,6 +794,8 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
795 */ 794 */
796int mod_timer(struct timer_list *timer, unsigned long expires) 795int mod_timer(struct timer_list *timer, unsigned long expires)
797{ 796{
797 expires = apply_slack(timer, expires);
798
798 /* 799 /*
799 * This is a common optimization triggered by the 800 * This is a common optimization triggered by the
800 * networking code - if the timer is re-modified 801 * networking code - if the timer is re-modified
@@ -803,8 +804,6 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
803 if (timer_pending(timer) && timer->expires == expires) 804 if (timer_pending(timer) && timer->expires == expires)
804 return 1; 805 return 1;
805 806
806 expires = apply_slack(timer, expires);
807
808 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); 807 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
809} 808}
810EXPORT_SYMBOL(mod_timer); 809EXPORT_SYMBOL(mod_timer);
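Two behavioral changes land here: timers closer than 256 jiffies now get no automatic slack at all, and mod_timer() applies slack before the pending-timer short-circuit, so a re-armed timer whose rounded deadline is unchanged returns early. A userspace model of the ~0.4% (delta/256) slack-and-round arithmetic, assuming 64-bit longs:

#include <stdio.h>

static unsigned long apply_slack_model(unsigned long expires, unsigned long now)
{
        long delta = (long)(expires - now);
        unsigned long expires_limit, mask;
        int bit;

        if (delta < 256)                /* too close: fire exactly on time */
                return expires;

        expires_limit = expires + delta / 256;  /* ~0.4% slack */
        mask = expires ^ expires_limit;         /* bits free to vary */
        if (mask == 0)
                return expires;

        bit = 63 - __builtin_clzl(mask);        /* highest differing bit */
        mask = (1UL << bit) - 1;
        return expires_limit & ~mask;           /* round down: coalesce */
}

int main(void)
{
        /* delta = 10000, limit = 100039, rounds down to the 64-jiffy
         * boundary 100032, where nearby timers can batch up. */
        printf("%lu\n", apply_slack_model(100000, 90000));
        return 0;
}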
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index d017c2c82c44..1ee417fcbfa5 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -109,12 +109,18 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip);
109static void ftrace_global_list_func(unsigned long ip, 109static void ftrace_global_list_func(unsigned long ip,
110 unsigned long parent_ip) 110 unsigned long parent_ip)
111{ 111{
112 struct ftrace_ops *op = rcu_dereference_raw(ftrace_global_list); /*see above*/ 112 struct ftrace_ops *op;
113
114 if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT)))
115 return;
113 116
117 trace_recursion_set(TRACE_GLOBAL_BIT);
118 op = rcu_dereference_raw(ftrace_global_list); /*see above*/
114 while (op != &ftrace_list_end) { 119 while (op != &ftrace_list_end) {
115 op->func(ip, parent_ip); 120 op->func(ip, parent_ip);
116 op = rcu_dereference_raw(op->next); /*see above*/ 121 op = rcu_dereference_raw(op->next); /*see above*/
117 }; 122 };
123 trace_recursion_clear(TRACE_GLOBAL_BIT);
118} 124}
119 125
120static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip) 126static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip)
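TRACE_GLOBAL_BIT guards ftrace_global_list_func() against self-recursion: if a callback's own code lands back in the tracer, the nested call sees the bit already set and bails instead of recursing until the stack overflows. A userspace model of the guard, with a __thread variable standing in for current->trace_recursion:

#include <stdio.h>

static __thread unsigned long trace_recursion;
#define GLOBAL_BIT      (1UL << 12)

static void traced_function(void);

static void tracer_list_func(void)
{
        if (trace_recursion & GLOBAL_BIT)       /* nested hit: drop it */
                return;
        trace_recursion |= GLOBAL_BIT;
        traced_function();      /* the callback may hit traced code itself */
        trace_recursion &= ~GLOBAL_BIT;
}

static void traced_function(void)
{
        tracer_list_func();     /* re-entry is refused, not recursed */
}

int main(void)
{
        tracer_list_func();
        puts("returned without unbounded recursion");
        return 0;
}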
@@ -1638,12 +1644,12 @@ static void ftrace_startup_enable(int command)
1638 ftrace_run_update_code(command); 1644 ftrace_run_update_code(command);
1639} 1645}
1640 1646
1641static void ftrace_startup(struct ftrace_ops *ops, int command) 1647static int ftrace_startup(struct ftrace_ops *ops, int command)
1642{ 1648{
1643 bool hash_enable = true; 1649 bool hash_enable = true;
1644 1650
1645 if (unlikely(ftrace_disabled)) 1651 if (unlikely(ftrace_disabled))
1646 return; 1652 return -ENODEV;
1647 1653
1648 ftrace_start_up++; 1654 ftrace_start_up++;
1649 command |= FTRACE_ENABLE_CALLS; 1655 command |= FTRACE_ENABLE_CALLS;
@@ -1662,6 +1668,8 @@ static void ftrace_startup(struct ftrace_ops *ops, int command)
1662 ftrace_hash_rec_enable(ops, 1); 1668 ftrace_hash_rec_enable(ops, 1);
1663 1669
1664 ftrace_startup_enable(command); 1670 ftrace_startup_enable(command);
1671
1672 return 0;
1665} 1673}
1666 1674
1667static void ftrace_shutdown(struct ftrace_ops *ops, int command) 1675static void ftrace_shutdown(struct ftrace_ops *ops, int command)
@@ -2501,7 +2509,7 @@ static void __enable_ftrace_function_probe(void)
2501 2509
2502 ret = __register_ftrace_function(&trace_probe_ops); 2510 ret = __register_ftrace_function(&trace_probe_ops);
2503 if (!ret) 2511 if (!ret)
2504 ftrace_startup(&trace_probe_ops, 0); 2512 ret = ftrace_startup(&trace_probe_ops, 0);
2505 2513
2506 ftrace_probe_registered = 1; 2514 ftrace_probe_registered = 1;
2507} 2515}
@@ -3466,7 +3474,11 @@ device_initcall(ftrace_nodyn_init);
3466static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } 3474static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
3467static inline void ftrace_startup_enable(int command) { } 3475static inline void ftrace_startup_enable(int command) { }
3468/* Keep as macros so we do not need to define the commands */ 3476/* Keep as macros so we do not need to define the commands */
3469# define ftrace_startup(ops, command) do { } while (0) 3477# define ftrace_startup(ops, command) \
3478 ({ \
3479 (ops)->flags |= FTRACE_OPS_FL_ENABLED; \
3480 0; \
3481 })
3470# define ftrace_shutdown(ops, command) do { } while (0) 3482# define ftrace_shutdown(ops, command) do { } while (0)
3471# define ftrace_startup_sysctl() do { } while (0) 3483# define ftrace_startup_sysctl() do { } while (0)
3472# define ftrace_shutdown_sysctl() do { } while (0) 3484# define ftrace_shutdown_sysctl() do { } while (0)
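Because ftrace_startup() now reports failure, the !CONFIG_DYNAMIC_FTRACE stub can no longer be a do/while(0) macro: callers assign its result. The replacement uses a GNU statement expression, whose value is its final expression. A small userspace demo of the same construct (demo_* names are hypothetical):

#include <stdio.h>

struct demo_ops {
        unsigned long flags;
};
#define DEMO_ENABLED    0x1UL

/* Evaluates to 0, like the stub above, while keeping the side effect. */
#define demo_startup(ops)                       \
        ({                                      \
                (ops)->flags |= DEMO_ENABLED;   \
                0;                              \
        })

int main(void)
{
        struct demo_ops ops = { 0 };
        int ret = demo_startup(&ops);   /* assignable, unlike do {} while (0) */

        printf("ret=%d flags=%#lx\n", ret, ops.flags);
        return 0;
}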
@@ -3484,6 +3496,10 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
3484{ 3496{
3485 struct ftrace_ops *op; 3497 struct ftrace_ops *op;
3486 3498
3499 if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT)))
3500 return;
3501
3502 trace_recursion_set(TRACE_INTERNAL_BIT);
3487 /* 3503 /*
3488 * Some of the ops may be dynamically allocated, 3504 * Some of the ops may be dynamically allocated,
3489 * they must be freed after a synchronize_sched(). 3505 * they must be freed after a synchronize_sched().
@@ -3496,6 +3512,7 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
3496 op = rcu_dereference_raw(op->next); 3512 op = rcu_dereference_raw(op->next);
3497 }; 3513 };
3498 preempt_enable_notrace(); 3514 preempt_enable_notrace();
3515 trace_recursion_clear(TRACE_INTERNAL_BIT);
3499} 3516}
3500 3517
3501static void clear_ftrace_swapper(void) 3518static void clear_ftrace_swapper(void)
@@ -3799,7 +3816,7 @@ int register_ftrace_function(struct ftrace_ops *ops)
3799 3816
3800 ret = __register_ftrace_function(ops); 3817 ret = __register_ftrace_function(ops);
3801 if (!ret) 3818 if (!ret)
3802 ftrace_startup(ops, 0); 3819 ret = ftrace_startup(ops, 0);
3803 3820
3804 3821
3805 out_unlock: 3822 out_unlock:
@@ -4045,7 +4062,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
4045 ftrace_graph_return = retfunc; 4062 ftrace_graph_return = retfunc;
4046 ftrace_graph_entry = entryfunc; 4063 ftrace_graph_entry = entryfunc;
4047 4064
4048 ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); 4065 ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
4049 4066
4050out: 4067out:
4051 mutex_unlock(&ftrace_lock); 4068 mutex_unlock(&ftrace_lock);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 0ef7b4b2a1f7..b0c7aa407943 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2216,7 +2216,7 @@ static noinline void trace_recursive_fail(void)
2216 2216
2217 printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" 2217 printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
2218 "HC[%lu]:SC[%lu]:NMI[%lu]\n", 2218 "HC[%lu]:SC[%lu]:NMI[%lu]\n",
2219 current->trace_recursion, 2219 trace_recursion_buffer(),
2220 hardirq_count() >> HARDIRQ_SHIFT, 2220 hardirq_count() >> HARDIRQ_SHIFT,
2221 softirq_count() >> SOFTIRQ_SHIFT, 2221 softirq_count() >> SOFTIRQ_SHIFT,
2222 in_nmi()); 2222 in_nmi());
@@ -2226,9 +2226,9 @@ static noinline void trace_recursive_fail(void)
2226 2226
2227static inline int trace_recursive_lock(void) 2227static inline int trace_recursive_lock(void)
2228{ 2228{
2229 current->trace_recursion++; 2229 trace_recursion_inc();
2230 2230
2231 if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) 2231 if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH))
2232 return 0; 2232 return 0;
2233 2233
2234 trace_recursive_fail(); 2234 trace_recursive_fail();
@@ -2238,9 +2238,9 @@ static inline int trace_recursive_lock(void)
2238 2238
2239static inline void trace_recursive_unlock(void) 2239static inline void trace_recursive_unlock(void)
2240{ 2240{
2241 WARN_ON_ONCE(!current->trace_recursion); 2241 WARN_ON_ONCE(!trace_recursion_buffer());
2242 2242
2243 current->trace_recursion--; 2243 trace_recursion_dec();
2244} 2244}
2245 2245
2246#else 2246#else
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6b69c4bd306f..229f8591f61d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -784,4 +784,19 @@ extern const char *__stop___trace_bprintk_fmt[];
784 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) 784 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
785#include "trace_entries.h" 785#include "trace_entries.h"
786 786
787/* Only current can touch trace_recursion */
788#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0)
789#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0)
790
791/* Ring buffer has the 10 LSB bits to count */
792#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff)
793
794/* for function tracing recursion */
795#define TRACE_INTERNAL_BIT (1<<11)
796#define TRACE_GLOBAL_BIT (1<<12)
797
798#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0)
799#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0)
800#define trace_recursion_test(bit) ((current)->trace_recursion & (bit))
801
787#endif /* _LINUX_KERNEL_TRACE_H */ 802#endif /* _LINUX_KERNEL_TRACE_H */
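The new layout packs everything into one per-task word: a 10-bit ring-buffer depth counter in the low bits, with the function-tracing flag bits above it, so incrementing one never disturbs the other. A quick userspace check of that invariant, using the constants from the hunk above:

#include <assert.h>

#define BUF_MASK        0x3ffUL         /* trace_recursion_buffer() */
#define INTERNAL_BIT    (1UL << 11)
#define GLOBAL_BIT      (1UL << 12)

int main(void)
{
        unsigned long rec = 0;

        rec++;                          /* trace_recursion_inc() */
        rec |= GLOBAL_BIT;              /* trace_recursion_set() */
        assert((rec & BUF_MASK) == 1);  /* counter untouched by the flag */
        assert(rec & GLOBAL_BIT);

        rec &= ~GLOBAL_BIT;             /* trace_recursion_clear() */
        rec--;                          /* trace_recursion_dec() */
        assert(rec == 0);
        return 0;
}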
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 2fe110341359..686ec399f2a8 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1657,7 +1657,12 @@ static struct ftrace_ops trace_ops __initdata =
1657 1657
1658static __init void event_trace_self_test_with_function(void) 1658static __init void event_trace_self_test_with_function(void)
1659{ 1659{
1660 register_ftrace_function(&trace_ops); 1660 int ret;
1661 ret = register_ftrace_function(&trace_ops);
1662 if (WARN_ON(ret < 0)) {
1663 pr_info("Failed to enable function tracer for event tests\n");
1664 return;
1665 }
1661 pr_info("Running tests again, along with the function tracer\n"); 1666 pr_info("Running tests again, along with the function tracer\n");
1662 event_trace_self_tests(); 1667 event_trace_self_tests();
1663 unregister_ftrace_function(&trace_ops); 1668 unregister_ftrace_function(&trace_ops);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index cf535ccedc86..e37de492a9e1 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -353,6 +353,33 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
353} 353}
354EXPORT_SYMBOL(ftrace_print_symbols_seq); 354EXPORT_SYMBOL(ftrace_print_symbols_seq);
355 355
356#if BITS_PER_LONG == 32
357const char *
358ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
359 const struct trace_print_flags_u64 *symbol_array)
360{
361 int i;
362 const char *ret = p->buffer + p->len;
363
364 for (i = 0; symbol_array[i].name; i++) {
365
366 if (val != symbol_array[i].mask)
367 continue;
368
369 trace_seq_puts(p, symbol_array[i].name);
370 break;
371 }
372
373 if (!p->len)
374 trace_seq_printf(p, "0x%llx", val);
375
376 trace_seq_putc(p, 0);
377
378 return ret;
379}
380EXPORT_SYMBOL(ftrace_print_symbols_seq_u64);
381#endif
382
356const char * 383const char *
357ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) 384ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
358{ 385{
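On 32-bit kernels an unsigned long cannot hold a 64-bit value, hence the _u64 variant above. A hedged usage sketch: the NULL .name terminator matches the loop's exit test in the function body, and the table is assumed to share the { mask, name } layout of struct trace_print_flags; the demo names are hypothetical:

#include <linux/ftrace_event.h>

static const struct trace_print_flags_u64 demo_symbols[] = {
        { 0x100000000ULL, "BIG" },
        { 0x300000000ULL, "BIGGER" },
        { 0, NULL }                     /* .name == NULL ends the scan */
};

/* Called from a print handler; falls back to "0x%llx" on no match. */
static const char *demo_show(struct trace_seq *p, unsigned long long val)
{
        return ftrace_print_symbols_seq_u64(p, val, demo_symbols);
}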
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 44646179eaba..bff131b9510a 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -15,6 +15,7 @@
15#include <linux/err.h> 15#include <linux/err.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/user_namespace.h> 17#include <linux/user_namespace.h>
18#include <linux/proc_fs.h>
18 19
19static struct uts_namespace *create_uts_ns(void) 20static struct uts_namespace *create_uts_ns(void)
20{ 21{
@@ -79,3 +80,41 @@ void free_uts_ns(struct kref *kref)
79 put_user_ns(ns->user_ns); 80 put_user_ns(ns->user_ns);
80 kfree(ns); 81 kfree(ns);
81} 82}
83
84static void *utsns_get(struct task_struct *task)
85{
86 struct uts_namespace *ns = NULL;
87 struct nsproxy *nsproxy;
88
89 rcu_read_lock();
90 nsproxy = task_nsproxy(task);
91 if (nsproxy) {
92 ns = nsproxy->uts_ns;
93 get_uts_ns(ns);
94 }
95 rcu_read_unlock();
96
97 return ns;
98}
99
100static void utsns_put(void *ns)
101{
102 put_uts_ns(ns);
103}
104
105static int utsns_install(struct nsproxy *nsproxy, void *ns)
106{
107 get_uts_ns(ns);
108 put_uts_ns(nsproxy->uts_ns);
109 nsproxy->uts_ns = ns;
110 return 0;
111}
112
113const struct proc_ns_operations utsns_operations = {
114 .name = "uts",
115 .type = CLONE_NEWUTS,
116 .get = utsns_get,
117 .put = utsns_put,
118 .install = utsns_install,
119};
120
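The utsns_operations table above implements a reference-counting contract: .get pins the namespace for an open /proc/<pid>/ns/uts file, .put drops that pin, and .install takes its own reference before releasing the nsproxy's old one. A hedged sketch of the same contract on a hypothetical kref-counted object:

#include <linux/kref.h>
#include <linux/slab.h>

struct demo_ns {
        struct kref kref;
};

static void demo_ns_free(struct kref *kref)
{
        kfree(container_of(kref, struct demo_ns, kref));
}

static void *demo_get(struct demo_ns *ns)
{
        kref_get(&ns->kref);    /* pin for the caller */
        return ns;
}

static void demo_put(void *ns)
{
        kref_put(&((struct demo_ns *)ns)->kref, demo_ns_free);
}

static int demo_install(struct demo_ns **slot, struct demo_ns *newns)
{
        demo_get(newns);        /* reference now owned by the slot */
        demo_put(*slot);        /* release the previous occupant */
        *slot = newns;
        return 0;
}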
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 7daa4b072e9f..3d0c56ad4792 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -415,15 +415,13 @@ static void watchdog_nmi_disable(int cpu) { return; }
415#endif /* CONFIG_HARDLOCKUP_DETECTOR */ 415#endif /* CONFIG_HARDLOCKUP_DETECTOR */
416 416
417/* prepare/enable/disable routines */ 417/* prepare/enable/disable routines */
418static int watchdog_prepare_cpu(int cpu) 418static void watchdog_prepare_cpu(int cpu)
419{ 419{
420 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); 420 struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
421 421
422 WARN_ON(per_cpu(softlockup_watchdog, cpu)); 422 WARN_ON(per_cpu(softlockup_watchdog, cpu));
423 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 423 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
424 hrtimer->function = watchdog_timer_fn; 424 hrtimer->function = watchdog_timer_fn;
425
426 return 0;
427} 425}
428 426
429static int watchdog_enable(int cpu) 427static int watchdog_enable(int cpu)
@@ -542,17 +540,16 @@ static int __cpuinit
542cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 540cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
543{ 541{
544 int hotcpu = (unsigned long)hcpu; 542 int hotcpu = (unsigned long)hcpu;
545 int err = 0;
546 543
547 switch (action) { 544 switch (action) {
548 case CPU_UP_PREPARE: 545 case CPU_UP_PREPARE:
549 case CPU_UP_PREPARE_FROZEN: 546 case CPU_UP_PREPARE_FROZEN:
550 err = watchdog_prepare_cpu(hotcpu); 547 watchdog_prepare_cpu(hotcpu);
551 break; 548 break;
552 case CPU_ONLINE: 549 case CPU_ONLINE:
553 case CPU_ONLINE_FROZEN: 550 case CPU_ONLINE_FROZEN:
554 if (watchdog_enabled) 551 if (watchdog_enabled)
555 err = watchdog_enable(hotcpu); 552 watchdog_enable(hotcpu);
556 break; 553 break;
557#ifdef CONFIG_HOTPLUG_CPU 554#ifdef CONFIG_HOTPLUG_CPU
558 case CPU_UP_CANCELED: 555 case CPU_UP_CANCELED: