path: root/kernel
author    Ingo Molnar <mingo@kernel.org>    2012-03-26 11:18:44 -0400
committer Ingo Molnar <mingo@kernel.org>    2012-03-26 11:19:03 -0400
commit    7fd52392c56361a40f0c630a82b36b95ca31eac6 (patch)
tree      14091de24c6b28ea4cae9826f98aeedb7be091f5 /kernel
parent    b01c3a0010aabadf745f3e7fdb9cab682e0a28a2 (diff)
parent    e22057c8599373e5caef0bc42bdb95d2a361ab0d (diff)
Merge branch 'linus' into perf/urgent
Merge reason: we need to fix a non-trivial merge conflict.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile                 |   1
-rw-r--r--  kernel/audit.c                  |   2
-rw-r--r--  kernel/cgroup.c                 | 354
-rw-r--r--  kernel/cgroup_freezer.c         |  11
-rw-r--r--  kernel/cpuset.c                 |  59
-rw-r--r--  kernel/cred.c                   |   1
-rw-r--r--  kernel/debug/debug_core.c       |  33
-rw-r--r--  kernel/debug/gdbstub.c          |  10
-rw-r--r--  kernel/debug/kdb/kdb_bp.c       |   7
-rw-r--r--  kernel/debug/kdb/kdb_io.c       |   2
-rw-r--r--  kernel/debug/kdb/kdb_keyboard.c |  95
-rw-r--r--  kernel/debug/kdb/kdb_main.c     |   3
-rw-r--r--  kernel/debug/kdb/kdb_private.h  |   7
-rw-r--r--  kernel/debug/kdb/kdb_support.c  |   4
-rw-r--r--  kernel/events/core.c            |  13
-rw-r--r--  kernel/exit.c                   |  64
-rw-r--r--  kernel/fork.c                   |  36
-rw-r--r--  kernel/freezer.c                |   6
-rw-r--r--  kernel/irq/irqdomain.c          | 828
-rw-r--r--  kernel/kexec.c                  |   8
-rw-r--r--  kernel/kmod.c                   |  84
-rw-r--r--  kernel/padata.c                 |  44
-rw-r--r--  kernel/params.c                 |   1
-rw-r--r--  kernel/pid_namespace.c          |   8
-rw-r--r--  kernel/power/Makefile           |   3
-rw-r--r--  kernel/power/hibernate.c        |  47
-rw-r--r--  kernel/power/main.c             |  20
-rw-r--r--  kernel/power/power.h            |   7
-rw-r--r--  kernel/power/process.c          |  24
-rw-r--r--  kernel/power/qos.c              |  23
-rw-r--r--  kernel/power/snapshot.c         |  35
-rw-r--r--  kernel/power/suspend.c          |  84
-rw-r--r--  kernel/power/user.c             |  12
-rw-r--r--  kernel/ptrace.c                 |  66
-rw-r--r--  kernel/resource.c               |   3
-rw-r--r--  kernel/sched/core.c             |  21
-rw-r--r--  kernel/signal.c                 |  27
-rw-r--r--  kernel/sys.c                    |   8
-rw-r--r--  kernel/sysctl.c                 | 502
-rw-r--r--  kernel/sysctl_check.c           | 160
-rw-r--r--  kernel/time/timekeeping.c       |   2
-rw-r--r--  kernel/trace/trace_output.c     |   2
-rw-r--r--  kernel/watchdog.c               |  27
-rw-r--r--  kernel/workqueue.c              |  22
44 files changed, 1391 insertions(+), 1385 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 2d9de86b7e76..cb41b9547c9f 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -27,7 +27,6 @@ obj-y += power/
 
 obj-$(CONFIG_FREEZER) += freezer.o
 obj-$(CONFIG_PROFILING) += profile.o
-obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
diff --git a/kernel/audit.c b/kernel/audit.c
index bb0eb5bb9a0a..1c7f2c61416b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1418,7 +1418,7 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
 
 /* This is a helper-function to print the escaped d_path */
 void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
-		      struct path *path)
+		      const struct path *path)
 {
 	char *p, *pathname;
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a5d3b5325f77..f4ea4b6f3cf1 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -818,7 +818,7 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
 
 	for_each_subsys(cgrp->root, ss)
 		if (ss->pre_destroy) {
-			ret = ss->pre_destroy(ss, cgrp);
+			ret = ss->pre_destroy(cgrp);
 			if (ret)
 				break;
 		}
@@ -846,7 +846,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 	 * Release the subsystem state objects.
 	 */
 	for_each_subsys(cgrp->root, ss)
-		ss->destroy(ss, cgrp);
+		ss->destroy(cgrp);
 
 	cgrp->root->number_of_cgroups--;
 	mutex_unlock(&cgroup_mutex);
@@ -1015,7 +1015,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			list_move(&ss->sibling, &root->subsys_list);
 			ss->root = root;
 			if (ss->bind)
-				ss->bind(ss, cgrp);
+				ss->bind(cgrp);
 			mutex_unlock(&ss->hierarchy_mutex);
 			/* refcount was already taken, and we're keeping it */
 		} else if (bit & removed_bits) {
@@ -1025,7 +1025,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
 			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
 			mutex_lock(&ss->hierarchy_mutex);
 			if (ss->bind)
-				ss->bind(ss, dummytop);
+				ss->bind(dummytop);
 			dummytop->subsys[i]->cgroup = dummytop;
 			cgrp->subsys[i] = NULL;
 			subsys[i]->root = &rootnode;
@@ -1472,7 +1472,6 @@ static int cgroup_get_rootdir(struct super_block *sb)
 
 	struct inode *inode =
 		cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
-	struct dentry *dentry;
 
 	if (!inode)
 		return -ENOMEM;
@@ -1481,12 +1480,9 @@ static int cgroup_get_rootdir(struct super_block *sb)
 	inode->i_op = &cgroup_dir_inode_operations;
 	/* directories start off with i_nlink == 2 (for "." entry) */
 	inc_nlink(inode);
-	dentry = d_alloc_root(inode);
-	if (!dentry) {
-		iput(inode);
+	sb->s_root = d_make_root(inode);
+	if (!sb->s_root)
 		return -ENOMEM;
-	}
-	sb->s_root = dentry;
 	/* for everything else we want ->d_op set */
 	sb->s_d_op = &cgroup_dops;
 	return 0;
@@ -1763,6 +1759,7 @@ EXPORT_SYMBOL_GPL(cgroup_path);
 struct task_and_cgroup {
 	struct task_struct *task;
 	struct cgroup *cgrp;
+	struct css_set *cg;
 };
 
 struct cgroup_taskset {
@@ -1843,11 +1840,10 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
  * will already exist. If not set, this function might sleep, and can fail with
  * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
  */
-static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
-			       struct task_struct *tsk, bool guarantee)
+static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
+				struct task_struct *tsk, struct css_set *newcg)
 {
 	struct css_set *oldcg;
-	struct css_set *newcg;
 
 	/*
 	 * We are synchronized through threadgroup_lock() against PF_EXITING
@@ -1857,23 +1853,6 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
 	WARN_ON_ONCE(tsk->flags & PF_EXITING);
 	oldcg = tsk->cgroups;
 
-	/* locate or allocate a new css_set for this task. */
-	if (guarantee) {
-		/* we know the css_set we want already exists. */
-		struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
-		read_lock(&css_set_lock);
-		newcg = find_existing_css_set(oldcg, cgrp, template);
-		BUG_ON(!newcg);
-		get_css_set(newcg);
-		read_unlock(&css_set_lock);
-	} else {
-		might_sleep();
-		/* find_css_set will give us newcg already referenced. */
-		newcg = find_css_set(oldcg, cgrp);
-		if (!newcg)
-			return -ENOMEM;
-	}
-
 	task_lock(tsk);
 	rcu_assign_pointer(tsk->cgroups, newcg);
 	task_unlock(tsk);
@@ -1892,7 +1871,6 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
 	put_css_set(oldcg);
 
 	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
-	return 0;
 }
 
 /**
@@ -1910,6 +1888,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	struct cgroup *oldcgrp;
 	struct cgroupfs_root *root = cgrp->root;
 	struct cgroup_taskset tset = { };
+	struct css_set *newcg;
 
 	/* @tsk either already exited or can't exit until the end */
 	if (tsk->flags & PF_EXITING)
@@ -1925,7 +1904,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 
 	for_each_subsys(root, ss) {
 		if (ss->can_attach) {
-			retval = ss->can_attach(ss, cgrp, &tset);
+			retval = ss->can_attach(cgrp, &tset);
 			if (retval) {
 				/*
 				 * Remember on which subsystem the can_attach()
@@ -1939,13 +1918,17 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 		}
 	}
 
-	retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
-	if (retval)
+	newcg = find_css_set(tsk->cgroups, cgrp);
+	if (!newcg) {
+		retval = -ENOMEM;
 		goto out;
+	}
+
+	cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg);
 
 	for_each_subsys(root, ss) {
 		if (ss->attach)
-			ss->attach(ss, cgrp, &tset);
+			ss->attach(cgrp, &tset);
 	}
 
 	synchronize_rcu();
@@ -1967,7 +1950,7 @@ out:
 				 */
 				break;
 			if (ss->cancel_attach)
-				ss->cancel_attach(ss, cgrp, &tset);
+				ss->cancel_attach(cgrp, &tset);
 		}
 	}
 	return retval;
@@ -1997,66 +1980,6 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 }
 EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
 
-/*
- * cgroup_attach_proc works in two stages, the first of which prefetches all
- * new css_sets needed (to make sure we have enough memory before committing
- * to the move) and stores them in a list of entries of the following type.
- * TODO: possible optimization: use css_set->rcu_head for chaining instead
- */
-struct cg_list_entry {
-	struct css_set *cg;
-	struct list_head links;
-};
-
-static bool css_set_check_fetched(struct cgroup *cgrp,
-				  struct task_struct *tsk, struct css_set *cg,
-				  struct list_head *newcg_list)
-{
-	struct css_set *newcg;
-	struct cg_list_entry *cg_entry;
-	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
-
-	read_lock(&css_set_lock);
-	newcg = find_existing_css_set(cg, cgrp, template);
-	read_unlock(&css_set_lock);
-
-	/* doesn't exist at all? */
-	if (!newcg)
-		return false;
-	/* see if it's already in the list */
-	list_for_each_entry(cg_entry, newcg_list, links)
-		if (cg_entry->cg == newcg)
-			return true;
-
-	/* not found */
-	return false;
-}
-
-/*
- * Find the new css_set and store it in the list in preparation for moving the
- * given task to the given cgroup. Returns 0 or -ENOMEM.
- */
-static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
-			    struct list_head *newcg_list)
-{
-	struct css_set *newcg;
-	struct cg_list_entry *cg_entry;
-
-	/* ensure a new css_set will exist for this thread */
-	newcg = find_css_set(cg, cgrp);
-	if (!newcg)
-		return -ENOMEM;
-	/* add it to the list */
-	cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
-	if (!cg_entry) {
-		put_css_set(newcg);
-		return -ENOMEM;
-	}
-	cg_entry->cg = newcg;
-	list_add(&cg_entry->links, newcg_list);
-	return 0;
-}
-
 /**
  * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
  * @cgrp: the cgroup to attach to
@@ -2070,20 +1993,12 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	int retval, i, group_size;
 	struct cgroup_subsys *ss, *failed_ss = NULL;
 	/* guaranteed to be initialized later, but the compiler needs this */
-	struct css_set *oldcg;
 	struct cgroupfs_root *root = cgrp->root;
 	/* threadgroup list cursor and array */
 	struct task_struct *tsk;
 	struct task_and_cgroup *tc;
 	struct flex_array *group;
 	struct cgroup_taskset tset = { };
-	/*
-	 * we need to make sure we have css_sets for all the tasks we're
-	 * going to move -before- we actually start moving them, so that in
-	 * case we get an ENOMEM we can bail out before making any changes.
-	 */
-	struct list_head newcg_list;
-	struct cg_list_entry *cg_entry, *temp_nobe;
 
 	/*
 	 * step 0: in order to do expensive, possibly blocking operations for
@@ -2102,23 +2017,14 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	if (retval)
 		goto out_free_group_list;
 
-	/* prevent changes to the threadgroup list while we take a snapshot. */
-	read_lock(&tasklist_lock);
-	if (!thread_group_leader(leader)) {
-		/*
-		 * a race with de_thread from another thread's exec() may strip
-		 * us of our leadership, making while_each_thread unsafe to use
-		 * on this task. if this happens, there is no choice but to
-		 * throw this task away and try again (from cgroup_procs_write);
-		 * this is "double-double-toil-and-trouble-check locking".
-		 */
-		read_unlock(&tasklist_lock);
-		retval = -EAGAIN;
-		goto out_free_group_list;
-	}
-
 	tsk = leader;
 	i = 0;
+	/*
+	 * Prevent freeing of tasks while we take a snapshot. Tasks that are
+	 * already PF_EXITING could be freed from underneath us unless we
+	 * take an rcu_read_lock.
+	 */
+	rcu_read_lock();
 	do {
 		struct task_and_cgroup ent;
 
@@ -2128,24 +2034,24 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 
 		/* as per above, nr_threads may decrease, but not increase. */
 		BUG_ON(i >= group_size);
-		/*
-		 * saying GFP_ATOMIC has no effect here because we did prealloc
-		 * earlier, but it's good form to communicate our expectations.
-		 */
 		ent.task = tsk;
 		ent.cgrp = task_cgroup_from_root(tsk, root);
 		/* nothing to do if this task is already in the cgroup */
 		if (ent.cgrp == cgrp)
 			continue;
+		/*
+		 * saying GFP_ATOMIC has no effect here because we did prealloc
+		 * earlier, but it's good form to communicate our expectations.
+		 */
 		retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
 		BUG_ON(retval != 0);
 		i++;
 	} while_each_thread(leader, tsk);
+	rcu_read_unlock();
 	/* remember the number of threads in the array for later. */
 	group_size = i;
 	tset.tc_array = group;
 	tset.tc_array_len = group_size;
-	read_unlock(&tasklist_lock);
 
 	/* methods shouldn't be called if no task is actually migrating */
 	retval = 0;
@@ -2157,7 +2063,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	 */
 	for_each_subsys(root, ss) {
 		if (ss->can_attach) {
-			retval = ss->can_attach(ss, cgrp, &tset);
+			retval = ss->can_attach(cgrp, &tset);
 			if (retval) {
 				failed_ss = ss;
 				goto out_cancel_attach;
@@ -2169,17 +2075,12 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	 * step 2: make sure css_sets exist for all threads to be migrated.
 	 * we use find_css_set, which allocates a new one if necessary.
 	 */
-	INIT_LIST_HEAD(&newcg_list);
 	for (i = 0; i < group_size; i++) {
 		tc = flex_array_get(group, i);
-		oldcg = tc->task->cgroups;
-
-		/* if we don't already have it in the list get a new one */
-		if (!css_set_check_fetched(cgrp, tc->task, oldcg,
-					   &newcg_list)) {
-			retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
-			if (retval)
-				goto out_list_teardown;
+		tc->cg = find_css_set(tc->task->cgroups, cgrp);
+		if (!tc->cg) {
+			retval = -ENOMEM;
+			goto out_put_css_set_refs;
 		}
 	}
 
@@ -2190,8 +2091,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	 */
 	for (i = 0; i < group_size; i++) {
 		tc = flex_array_get(group, i);
-		retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true);
-		BUG_ON(retval);
+		cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg);
 	}
 	/* nothing is sensitive to fork() after this point. */
 
@@ -2200,7 +2100,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	 */
 	for_each_subsys(root, ss) {
 		if (ss->attach)
-			ss->attach(ss, cgrp, &tset);
+			ss->attach(cgrp, &tset);
 	}
 
 	/*
@@ -2209,21 +2109,22 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
 	synchronize_rcu();
 	cgroup_wakeup_rmdir_waiter(cgrp);
 	retval = 0;
-out_list_teardown:
-	/* clean up the list of prefetched css_sets. */
-	list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
-		list_del(&cg_entry->links);
-		put_css_set(cg_entry->cg);
-		kfree(cg_entry);
+out_put_css_set_refs:
+	if (retval) {
+		for (i = 0; i < group_size; i++) {
+			tc = flex_array_get(group, i);
+			if (!tc->cg)
+				break;
+			put_css_set(tc->cg);
+		}
 	}
 out_cancel_attach:
-	/* same deal as in cgroup_attach_task */
 	if (retval) {
 		for_each_subsys(root, ss) {
 			if (ss == failed_ss)
 				break;
 			if (ss->cancel_attach)
-				ss->cancel_attach(ss, cgrp, &tset);
+				ss->cancel_attach(cgrp, &tset);
 		}
 	}
 out_free_group_list:
@@ -2245,22 +2146,14 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
 	if (!cgroup_lock_live_group(cgrp))
 		return -ENODEV;
 
+retry_find_task:
+	rcu_read_lock();
 	if (pid) {
-		rcu_read_lock();
 		tsk = find_task_by_vpid(pid);
 		if (!tsk) {
 			rcu_read_unlock();
-			cgroup_unlock();
-			return -ESRCH;
-		}
-		if (threadgroup) {
-			/*
-			 * RCU protects this access, since tsk was found in the
-			 * tid map. a race with de_thread may cause group_leader
-			 * to stop being the leader, but cgroup_attach_proc will
-			 * detect it later.
-			 */
-			tsk = tsk->group_leader;
+			ret = -ESRCH;
+			goto out_unlock_cgroup;
 		}
 		/*
 		 * even if we're attaching all tasks in the thread group, we
@@ -2271,29 +2164,38 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
 		    cred->euid != tcred->uid &&
 		    cred->euid != tcred->suid) {
 			rcu_read_unlock();
-			cgroup_unlock();
-			return -EACCES;
+			ret = -EACCES;
+			goto out_unlock_cgroup;
 		}
-		get_task_struct(tsk);
-		rcu_read_unlock();
-	} else {
-		if (threadgroup)
-			tsk = current->group_leader;
-		else
-			tsk = current;
-		get_task_struct(tsk);
-	}
-
-	threadgroup_lock(tsk);
+	} else
+		tsk = current;
 
 	if (threadgroup)
+		tsk = tsk->group_leader;
+	get_task_struct(tsk);
+	rcu_read_unlock();
+
+	threadgroup_lock(tsk);
+	if (threadgroup) {
+		if (!thread_group_leader(tsk)) {
+			/*
+			 * a race with de_thread from another thread's exec()
+			 * may strip us of our leadership, if this happens,
+			 * there is no choice but to throw this task away and
+			 * try again; this is
+			 * "double-double-toil-and-trouble-check locking".
+			 */
+			threadgroup_unlock(tsk);
+			put_task_struct(tsk);
+			goto retry_find_task;
+		}
 		ret = cgroup_attach_proc(cgrp, tsk);
-	else
+	} else
 		ret = cgroup_attach_task(cgrp, tsk);
-
 	threadgroup_unlock(tsk);
 
 	put_task_struct(tsk);
+out_unlock_cgroup:
 	cgroup_unlock();
 	return ret;
 }
@@ -2305,16 +2207,7 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
 
 static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
 {
-	int ret;
-	do {
-		/*
-		 * attach_proc fails with -EAGAIN if threadgroup leadership
-		 * changes in the middle of the operation, in which case we need
-		 * to find the task_struct for the new leader and start over.
-		 */
-		ret = attach_task_by_pid(cgrp, tgid, true);
-	} while (ret == -EAGAIN);
-	return ret;
+	return attach_task_by_pid(cgrp, tgid, true);
 }
 
 /**
@@ -2804,15 +2697,20 @@ static void cgroup_advance_iter(struct cgroup *cgrp,
  * using their cgroups capability, we don't maintain the lists running
  * through each css_set to its tasks until we see the list actually
  * used - in other words after the first call to cgroup_iter_start().
- *
- * The tasklist_lock is not held here, as do_each_thread() and
- * while_each_thread() are protected by RCU.
  */
 static void cgroup_enable_task_cg_lists(void)
 {
 	struct task_struct *p, *g;
 	write_lock(&css_set_lock);
 	use_task_css_set_links = 1;
+	/*
+	 * We need tasklist_lock because RCU is not safe against
+	 * while_each_thread(). Besides, a forking task that has passed
+	 * cgroup_post_fork() without seeing use_task_css_set_links = 1
+	 * is not guaranteed to have its child immediately visible in the
+	 * tasklist if we walk through it with RCU.
+	 */
+	read_lock(&tasklist_lock);
 	do_each_thread(g, p) {
 		task_lock(p);
 		/*
@@ -2824,6 +2722,7 @@ static void cgroup_enable_task_cg_lists(void)
 		list_add(&p->cg_list, &p->cgroups->tasks);
 		task_unlock(p);
 	} while_each_thread(g, p);
+	read_unlock(&tasklist_lock);
 	write_unlock(&css_set_lock);
 }
 
@@ -3043,6 +2942,38 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
  *
  */
 
+/* which pidlist file are we talking about? */
+enum cgroup_filetype {
+	CGROUP_FILE_PROCS,
+	CGROUP_FILE_TASKS,
+};
+
+/*
+ * A pidlist is a list of pids that virtually represents the contents of one
+ * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
+ * a pair (one each for procs, tasks) for each pid namespace that's relevant
+ * to the cgroup.
+ */
+struct cgroup_pidlist {
+	/*
+	 * used to find which pidlist is wanted. doesn't change as long as
+	 * this particular list stays in the list.
+	 */
+	struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
+	/* array of xids */
+	pid_t *list;
+	/* how many elements the above list has */
+	int length;
+	/* how many files are using the current array */
+	int use_count;
+	/* each of these stored in a list by its cgroup */
+	struct list_head links;
+	/* pointer to the cgroup we belong to, for list removal purposes */
+	struct cgroup *owner;
+	/* protects the other fields */
+	struct rw_semaphore mutex;
+};
+
 /*
  * The following two functions "fix" the issue where there are more pids
  * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
@@ -3827,7 +3758,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 		set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
 
 	for_each_subsys(root, ss) {
-		struct cgroup_subsys_state *css = ss->create(ss, cgrp);
+		struct cgroup_subsys_state *css = ss->create(cgrp);
 
 		if (IS_ERR(css)) {
 			err = PTR_ERR(css);
@@ -3841,7 +3772,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 		}
 		/* At error, ->destroy() callback has to free assigned ID. */
 		if (clone_children(parent) && ss->post_clone)
-			ss->post_clone(ss, cgrp);
+			ss->post_clone(cgrp);
 	}
 
 	cgroup_lock_hierarchy(root);
@@ -3875,7 +3806,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
 	for_each_subsys(root, ss) {
 		if (cgrp->subsys[ss->subsys_id])
-			ss->destroy(ss, cgrp);
+			ss->destroy(cgrp);
 	}
 
 	mutex_unlock(&cgroup_mutex);
@@ -4099,7 +4030,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 	/* Create the top cgroup state for this subsystem */
 	list_add(&ss->sibling, &rootnode.subsys_list);
 	ss->root = &rootnode;
-	css = ss->create(ss, dummytop);
+	css = ss->create(dummytop);
 	/* We don't handle early failures gracefully */
 	BUG_ON(IS_ERR(css));
 	init_cgroup_css(css, ss, dummytop);
@@ -4188,7 +4119,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 	 * no ss->create seems to need anything important in the ss struct, so
 	 * this can happen first (i.e. before the rootnode attachment).
 	 */
-	css = ss->create(ss, dummytop);
+	css = ss->create(dummytop);
 	if (IS_ERR(css)) {
 		/* failure case - need to deassign the subsys[] slot. */
 		subsys[i] = NULL;
@@ -4206,7 +4137,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 		int ret = cgroup_init_idr(ss, css);
 		if (ret) {
 			dummytop->subsys[ss->subsys_id] = NULL;
-			ss->destroy(ss, dummytop);
+			ss->destroy(dummytop);
 			subsys[i] = NULL;
 			mutex_unlock(&cgroup_mutex);
 			return ret;
@@ -4304,7 +4235,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
 	 * pointer to find their state. note that this also takes care of
 	 * freeing the css_id.
 	 */
-	ss->destroy(ss, dummytop);
+	ss->destroy(dummytop);
 	dummytop->subsys[ss->subsys_id] = NULL;
 
 	mutex_unlock(&cgroup_mutex);
@@ -4580,7 +4511,7 @@ void cgroup_fork_callbacks(struct task_struct *child)
 		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
 			struct cgroup_subsys *ss = subsys[i];
 			if (ss->fork)
-				ss->fork(ss, child);
+				ss->fork(child);
 		}
 	}
 }
@@ -4596,6 +4527,17 @@ void cgroup_fork_callbacks(struct task_struct *child)
  */
 void cgroup_post_fork(struct task_struct *child)
 {
+	/*
+	 * use_task_css_set_links is set to 1 before we walk the tasklist
+	 * under the tasklist_lock and we read it here after we added the child
+	 * to the tasklist under the tasklist_lock as well. If the child wasn't
+	 * yet in the tasklist when we walked through it from
+	 * cgroup_enable_task_cg_lists(), then use_task_css_set_links value
+	 * should be visible now due to the paired locking and barriers implied
+	 * by LOCK/UNLOCK: it is written before the tasklist_lock unlock
+	 * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock
+	 * lock on fork.
+	 */
 	if (use_task_css_set_links) {
 		write_lock(&css_set_lock);
 		if (list_empty(&child->cg_list)) {
@@ -4682,7 +4624,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 			struct cgroup *old_cgrp =
 				rcu_dereference_raw(cg->subsys[i])->cgroup;
 			struct cgroup *cgrp = task_cgroup(tsk, i);
-			ss->exit(ss, cgrp, old_cgrp, tsk);
+			ss->exit(cgrp, old_cgrp, tsk);
 		}
 	}
 }
@@ -4939,9 +4881,9 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
 
 	rcu_assign_pointer(id->css, NULL);
 	rcu_assign_pointer(css->id, NULL);
-	write_lock(&ss->id_lock);
+	spin_lock(&ss->id_lock);
 	idr_remove(&ss->idr, id->id);
-	write_unlock(&ss->id_lock);
+	spin_unlock(&ss->id_lock);
 	kfree_rcu(id, rcu_head);
 }
 EXPORT_SYMBOL_GPL(free_css_id);
@@ -4967,10 +4909,10 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
 		error = -ENOMEM;
 		goto err_out;
 	}
-	write_lock(&ss->id_lock);
+	spin_lock(&ss->id_lock);
 	/* Don't use 0. allocates an ID of 1-65535 */
 	error = idr_get_new_above(&ss->idr, newid, 1, &myid);
-	write_unlock(&ss->id_lock);
+	spin_unlock(&ss->id_lock);
 
 	/* Returns error when there are no free spaces for new ID.*/
 	if (error) {
@@ -4985,9 +4927,9 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
 	return newid;
 remove_idr:
 	error = -ENOSPC;
-	write_lock(&ss->id_lock);
+	spin_lock(&ss->id_lock);
 	idr_remove(&ss->idr, myid);
-	write_unlock(&ss->id_lock);
+	spin_unlock(&ss->id_lock);
 err_out:
 	kfree(newid);
 	return ERR_PTR(error);
@@ -4999,7 +4941,7 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
 {
 	struct css_id *newid;
 
-	rwlock_init(&ss->id_lock);
+	spin_lock_init(&ss->id_lock);
 	idr_init(&ss->idr);
 
 	newid = get_new_cssid(ss, 0);
@@ -5087,6 +5029,8 @@ css_get_next(struct cgroup_subsys *ss, int id,
 		return NULL;
 
 	BUG_ON(!ss->use_id);
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
 	/* fill start point for scan */
 	tmpid = id;
 	while (1) {
@@ -5094,10 +5038,7 @@ css_get_next(struct cgroup_subsys *ss, int id,
 		 * scan next entry from bitmap(tree), tmpid is updated after
 		 * idr_get_next().
 		 */
-		read_lock(&ss->id_lock);
 		tmp = idr_get_next(&ss->idr, &tmpid);
-		read_unlock(&ss->id_lock);
-
 		if (!tmp)
 			break;
 		if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
@@ -5137,8 +5078,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
 }
 
 #ifdef CONFIG_CGROUP_DEBUG
-static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
-						struct cgroup *cont)
+static struct cgroup_subsys_state *debug_create(struct cgroup *cont)
 {
 	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
 
@@ -5148,7 +5088,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
 	return css;
 }
 
-static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
+static void debug_destroy(struct cgroup *cont)
 {
 	kfree(cont->subsys[debug_subsys_id]);
 }
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index fc0646b78a64..f86e93920b62 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -128,8 +128,7 @@ struct cgroup_subsys freezer_subsys;
  * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator())
  * sighand->siglock
  */
-static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
-						  struct cgroup *cgroup)
+static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup)
 {
 	struct freezer *freezer;
 
@@ -142,8 +141,7 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
 	return &freezer->css;
 }
 
-static void freezer_destroy(struct cgroup_subsys *ss,
-			    struct cgroup *cgroup)
+static void freezer_destroy(struct cgroup *cgroup)
 {
 	struct freezer *freezer = cgroup_freezer(cgroup);
 
@@ -164,8 +162,7 @@ static bool is_task_frozen_enough(struct task_struct *task)
  * a write to that file racing against an attach, and hence the
  * can_attach() result will remain valid until the attach completes.
  */
-static int freezer_can_attach(struct cgroup_subsys *ss,
-			      struct cgroup *new_cgroup,
+static int freezer_can_attach(struct cgroup *new_cgroup,
 			      struct cgroup_taskset *tset)
 {
 	struct freezer *freezer;
@@ -185,7 +182,7 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
 	return 0;
 }
 
-static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
+static void freezer_fork(struct task_struct *task)
 {
 	struct freezer *freezer;
 
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index a09ac2b9a661..1010cc61931f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -964,7 +964,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
 {
 	bool need_loop;
 
-repeat:
 	/*
 	 * Allow tasks that have access to memory reserves because they have
 	 * been OOM killed to get memory anywhere.
@@ -983,45 +982,19 @@ repeat:
 	 */
 	need_loop = task_has_mempolicy(tsk) ||
 			!nodes_intersects(*newmems, tsk->mems_allowed);
-	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
-	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
 
-	/*
-	 * ensure checking ->mems_allowed_change_disable after setting all new
-	 * allowed nodes.
-	 *
-	 * the read-side task can see an nodemask with new allowed nodes and
-	 * old allowed nodes. and if it allocates page when cpuset clears newly
-	 * disallowed ones continuous, it can see the new allowed bits.
-	 *
-	 * And if setting all new allowed nodes is after the checking, setting
-	 * all new allowed nodes and clearing newly disallowed ones will be done
-	 * continuous, and the read-side task may find no node to alloc page.
-	 */
-	smp_mb();
+	if (need_loop)
+		write_seqcount_begin(&tsk->mems_allowed_seq);
 
-	/*
-	 * Allocation of memory is very fast, we needn't sleep when waiting
-	 * for the read-side.
-	 */
-	while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
-		task_unlock(tsk);
-		if (!task_curr(tsk))
-			yield();
-		goto repeat;
-	}
-
-	/*
-	 * ensure checking ->mems_allowed_change_disable before clearing all new
-	 * disallowed nodes.
-	 *
-	 * if clearing newly disallowed bits before the checking, the read-side
-	 * task may find no node to alloc page.
-	 */
-	smp_mb();
+	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
+	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
 
 	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
 	tsk->mems_allowed = *newmems;
+
+	if (need_loop)
+		write_seqcount_end(&tsk->mems_allowed_seq);
+
 	task_unlock(tsk);
 }
 
@@ -1399,8 +1372,7 @@ static nodemask_t cpuset_attach_nodemask_from;
 static nodemask_t cpuset_attach_nodemask_to;
 
 /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
-static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-			     struct cgroup_taskset *tset)
+static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 {
 	struct cpuset *cs = cgroup_cs(cgrp);
 	struct task_struct *task;
@@ -1436,8 +1408,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 	return 0;
 }
 
-static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
-			  struct cgroup_taskset *tset)
+static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 {
 	struct mm_struct *mm;
 	struct task_struct *task;
@@ -1833,8 +1804,7 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
  * (and likewise for mems) to the new cgroup. Called with cgroup_mutex
  * held.
  */
-static void cpuset_post_clone(struct cgroup_subsys *ss,
-			      struct cgroup *cgroup)
+static void cpuset_post_clone(struct cgroup *cgroup)
 {
 	struct cgroup *parent, *child;
 	struct cpuset *cs, *parent_cs;
@@ -1857,13 +1827,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
 
 /*
  * cpuset_create - create a cpuset
- * ss: cpuset cgroup subsystem
  * cont: control group that the new cpuset will be part of
  */
 
-static struct cgroup_subsys_state *cpuset_create(
-	struct cgroup_subsys *ss,
-	struct cgroup *cont)
+static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont)
 {
 	struct cpuset *cs;
 	struct cpuset *parent;
@@ -1902,7 +1869,7 @@ static struct cgroup_subsys_state *cpuset_create(
  * will call async_rebuild_sched_domains().
  */
 
-static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
+static void cpuset_destroy(struct cgroup *cont)
 {
 	struct cpuset *cs = cgroup_cs(cont);
 
diff --git a/kernel/cred.c b/kernel/cred.c
index 5791612a4045..97b36eeca4c9 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -16,6 +16,7 @@
 #include <linux/keyctl.h>
 #include <linux/init_task.h>
 #include <linux/security.h>
+#include <linux/binfmts.h>
 #include <linux/cn_proc.h>
 
 #if 0
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0d7c08784efb..3f88a45e6f0a 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -41,6 +41,7 @@
 #include <linux/delay.h>
 #include <linux/sched.h>
 #include <linux/sysrq.h>
+#include <linux/reboot.h>
 #include <linux/init.h>
 #include <linux/kgdb.h>
 #include <linux/kdb.h>
@@ -75,6 +76,8 @@ static int exception_level;
 struct kgdb_io *dbg_io_ops;
 static DEFINE_SPINLOCK(kgdb_registration_lock);
 
+/* Action for the reboot notifiter, a global allow kdb to change it */
+static int kgdbreboot;
 /* kgdb console driver is loaded */
 static int kgdb_con_registered;
 /* determine if kgdb console output should be used */
@@ -96,6 +99,7 @@ static int __init opt_kgdb_con(char *str)
 early_param("kgdbcon", opt_kgdb_con);
 
 module_param(kgdb_use_con, int, 0644);
+module_param(kgdbreboot, int, 0644);
 
 /*
  * Holds information about breakpoints in a kernel. These breakpoints are
@@ -784,6 +788,33 @@ void __init dbg_late_init(void)
 	kdb_init(KDB_INIT_FULL);
 }
 
+static int
+dbg_notify_reboot(struct notifier_block *this, unsigned long code, void *x)
+{
+	/*
+	 * Take the following action on reboot notify depending on value:
+	 *    1 == Enter debugger
+	 *    0 == [the default] detatch debug client
+	 *   -1 == Do nothing... and use this until the board resets
+	 */
+	switch (kgdbreboot) {
+	case 1:
+		kgdb_breakpoint();
+	case -1:
+		goto done;
+	}
+	if (!dbg_kdb_mode)
+		gdbstub_exit(code);
+done:
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block dbg_reboot_notifier = {
+	.notifier_call	= dbg_notify_reboot,
+	.next		= NULL,
+	.priority	= INT_MAX,
+};
+
 static void kgdb_register_callbacks(void)
 {
 	if (!kgdb_io_module_registered) {
@@ -791,6 +822,7 @@ static void kgdb_register_callbacks(void)
 		kgdb_arch_init();
 		if (!dbg_is_early)
 			kgdb_arch_late();
+		register_reboot_notifier(&dbg_reboot_notifier);
 		atomic_notifier_chain_register(&panic_notifier_list,
 					       &kgdb_panic_event_nb);
 #ifdef CONFIG_MAGIC_SYSRQ
@@ -812,6 +844,7 @@ static void kgdb_unregister_callbacks(void)
 	 */
 	if (kgdb_io_module_registered) {
 		kgdb_io_module_registered = 0;
+		unregister_reboot_notifier(&dbg_reboot_notifier);
 		atomic_notifier_chain_unregister(&panic_notifier_list,
 					       &kgdb_panic_event_nb);
 		kgdb_arch_exit();
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index c22d8c28ad84..ce615e064482 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -1111,6 +1111,13 @@ void gdbstub_exit(int status)
 	unsigned char checksum, ch, buffer[3];
 	int loop;
 
+	if (!kgdb_connected)
+		return;
+	kgdb_connected = 0;
+
+	if (!dbg_io_ops || dbg_kdb_mode)
+		return;
+
 	buffer[0] = 'W';
 	buffer[1] = hex_asc_hi(status);
 	buffer[2] = hex_asc_lo(status);
@@ -1129,5 +1136,6 @@ void gdbstub_exit(int status)
 	dbg_io_ops->write_char(hex_asc_lo(checksum));
 
 	/* make sure the output is flushed, lest the bootloader clobber it */
-	dbg_io_ops->flush();
+	if (dbg_io_ops->flush)
+		dbg_io_ops->flush();
 }
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 20059ef4459a..8418c2f8ec5d 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -153,6 +153,13 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)
 	} else {
 		kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
 			   __func__, bp->bp_addr);
+#ifdef CONFIG_DEBUG_RODATA
+		if (!bp->bp_type) {
+			kdb_printf("Software breakpoints are unavailable.\n"
+				   "  Change the kernel CONFIG_DEBUG_RODATA=n\n"
+				   "  OR use hw breaks: help bph\n");
+		}
+#endif
 		return 1;
 	}
 	return 0;
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 4802eb5840e1..9b5f17da1c56 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -689,7 +689,7 @@ kdb_printit:
 	if (!dbg_kdb_mode && kgdb_connected) {
 		gdbstub_msg_write(kdb_buffer, retlen);
 	} else {
-		if (!dbg_io_ops->is_console) {
+		if (dbg_io_ops && !dbg_io_ops->is_console) {
 			len = strlen(kdb_buffer);
 			cp = kdb_buffer;
 			while (len--) {
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
index 4bca634975c0..118527aa60ea 100644
--- a/kernel/debug/kdb/kdb_keyboard.c
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -25,6 +25,7 @@
 #define KBD_STAT_MOUSE_OBF	0x20	/* Mouse output buffer full */
 
 static int kbd_exists;
+static int kbd_last_ret;
 
 /*
  * Check if the keyboard controller has a keypress for us.
@@ -90,8 +91,11 @@ int kdb_get_kbd_char(void)
 		return -1;
 	}
 
-	if ((scancode & 0x80) != 0)
+	if ((scancode & 0x80) != 0) {
+		if (scancode == 0x9c)
+			kbd_last_ret = 0;
 		return -1;
+	}
 
 	scancode &= 0x7f;
 
@@ -178,35 +182,82 @@ int kdb_get_kbd_char(void)
 		return -1;	/* ignore unprintables */
 	}
 
-	if ((scancode & 0x7f) == 0x1c) {
-		/*
-		 * enter key.  All done.  Absorb the release scancode.
-		 */
+	if (scancode == 0x1c) {
+		kbd_last_ret = 1;
+		return 13;
+	}
+
+	return keychar & 0xff;
+}
+EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
+
+/*
+ * Best effort cleanup of ENTER break codes on leaving KDB. Called on
+ * exiting KDB, when we know we processed an ENTER or KP ENTER scan
+ * code.
+ */
+void kdb_kbd_cleanup_state(void)
+{
+	int scancode, scanstatus;
+
+	/*
+	 * Nothing to clean up, since either
+	 * ENTER was never pressed, or has already
+	 * gotten cleaned up.
+	 */
+	if (!kbd_last_ret)
+		return;
+
+	kbd_last_ret = 0;
+	/*
+	 * Enter key. Need to absorb the break code here, lest it gets
+	 * leaked out if we exit KDB as the result of processing 'g'.
+	 *
+	 * This has several interesting implications:
+	 * + Need to handle KP ENTER, which has break code 0xe0 0x9c.
+	 * + Need to handle repeat ENTER and repeat KP ENTER.  Repeats
+	 *   only get a break code at the end of the repeated
+	 *   sequence.  This means we can't propagate the repeated key
+	 *   press, and must swallow it away.
+	 * + Need to handle possible PS/2 mouse input.
+	 * + Need to handle mashed keys.
+	 */
+
+	while (1) {
 		while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
-			;
+			cpu_relax();
 
 		/*
-		 * Fetch the scancode
+		 * Fetch the scancode.
 		 */
 		scancode = inb(KBD_DATA_REG);
 		scanstatus = inb(KBD_STATUS_REG);
 
-		while (scanstatus & KBD_STAT_MOUSE_OBF) {
-			scancode = inb(KBD_DATA_REG);
-			scanstatus = inb(KBD_STATUS_REG);
-		}
+		/*
+		 * Skip mouse input.
+		 */
+		if (scanstatus & KBD_STAT_MOUSE_OBF)
+			continue;
 
-		if (scancode != 0x9c) {
-			/*
-			 * Wasn't an enter-release, why not?
-			 */
-			kdb_printf("kdb: expected enter got 0x%x status 0x%x\n",
-				   scancode, scanstatus);
-		}
+		/*
+		 * If we see 0xe0, this is either a break code for KP
+		 * ENTER, or a repeat make for KP ENTER.  Either way,
+		 * since the second byte is equivalent to an ENTER,
+		 * skip the 0xe0 and try again.
+		 *
+		 * If we see 0x1c, this must be a repeat ENTER or KP
+		 * ENTER (and we swallowed 0xe0 before).  Try again.
+		 *
+		 * We can also see make and break codes for other keys
+		 * mashed before or after pressing ENTER.  Thus, if we
+		 * see anything other than 0x9c, we have to try again.
+		 *
+		 * Note, if you held some key as ENTER was depressed,
+		 * that break code would get leaked out.
+		 */
+		if (scancode != 0x9c)
+			continue;
 
-		return 13;
+		return;
 	}
-
-	return keychar & 0xff;
 }
-EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index e2ae7349437f..67b847dfa2bb 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1400,6 +1400,9 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
 	if (KDB_STATE(DOING_SS))
 		KDB_STATE_CLEAR(SSBPT);
 
+	/* Clean up any keyboard devices before leaving */
+	kdb_kbd_cleanup_state();
+
 	return result;
 }
 
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index e381d105b40b..47c4e56e513b 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -246,6 +246,13 @@ extern void debug_kusage(void);
 
 extern void kdb_set_current_task(struct task_struct *);
 extern struct task_struct *kdb_current_task;
+
+#ifdef CONFIG_KDB_KEYBOARD
+extern void kdb_kbd_cleanup_state(void);
+#else /* ! CONFIG_KDB_KEYBOARD */
+#define kdb_kbd_cleanup_state()
+#endif /* ! CONFIG_KDB_KEYBOARD */
+
 #ifdef CONFIG_MODULES
 extern struct list_head *kdb_modules;
 #endif /* CONFIG_MODULES */
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 7d6fb40d2188..d35cc2d3a4cc 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -384,9 +384,9 @@ static int kdb_getphys(void *res, unsigned long addr, size_t size)
384 if (!pfn_valid(pfn)) 384 if (!pfn_valid(pfn))
385 return 1; 385 return 1;
386 page = pfn_to_page(pfn); 386 page = pfn_to_page(pfn);
387 vaddr = kmap_atomic(page, KM_KDB); 387 vaddr = kmap_atomic(page);
388 memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size); 388 memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size);
389 kunmap_atomic(vaddr, KM_KDB); 389 kunmap_atomic(vaddr);
390 390
391 return 0; 391 return 0;
392} 392}
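The hunk above drops the KM_KDB slot argument, following the kmap_atomic() interface change in which atomic mappings simply nest and are popped in reverse order. A minimal sketch of the updated usage (copy_from_pfn() is an illustrative helper, not a kernel API):

#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/string.h>

static int copy_from_pfn(void *dst, unsigned long pfn, size_t off, size_t len)
{
	void *vaddr;

	if (!pfn_valid(pfn))
		return -EFAULT;

	vaddr = kmap_atomic(pfn_to_page(pfn));	/* no KM_* slot anymore */
	memcpy(dst, vaddr + off, len);
	kunmap_atomic(vaddr);			/* pass the mapped address */

	return 0;
}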
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3f92a19aa11e..a6a9ec4cd8f5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7154,8 +7154,7 @@ unlock:
7154device_initcall(perf_event_sysfs_init); 7154device_initcall(perf_event_sysfs_init);
7155 7155
7156#ifdef CONFIG_CGROUP_PERF 7156#ifdef CONFIG_CGROUP_PERF
7157static struct cgroup_subsys_state *perf_cgroup_create( 7157static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont)
7158 struct cgroup_subsys *ss, struct cgroup *cont)
7159{ 7158{
7160 struct perf_cgroup *jc; 7159 struct perf_cgroup *jc;
7161 7160
@@ -7172,8 +7171,7 @@ static struct cgroup_subsys_state *perf_cgroup_create(
7172 return &jc->css; 7171 return &jc->css;
7173} 7172}
7174 7173
7175static void perf_cgroup_destroy(struct cgroup_subsys *ss, 7174static void perf_cgroup_destroy(struct cgroup *cont)
7176 struct cgroup *cont)
7177{ 7175{
7178 struct perf_cgroup *jc; 7176 struct perf_cgroup *jc;
7179 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), 7177 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
@@ -7189,8 +7187,7 @@ static int __perf_cgroup_move(void *info)
7189 return 0; 7187 return 0;
7190} 7188}
7191 7189
7192static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 7190static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
7193 struct cgroup_taskset *tset)
7194{ 7191{
7195 struct task_struct *task; 7192 struct task_struct *task;
7196 7193
@@ -7198,8 +7195,8 @@ static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7198 task_function_call(task, __perf_cgroup_move, task); 7195 task_function_call(task, __perf_cgroup_move, task);
7199} 7196}
7200 7197
7201static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, 7198static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7202 struct cgroup *old_cgrp, struct task_struct *task) 7199 struct task_struct *task)
7203{ 7200{
7204 /* 7201 /*
7205 * cgroup_exit() is called in the copy_process() failure path. 7202 * cgroup_exit() is called in the copy_process() failure path.
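The perf changes above are part of the tree-wide removal of the struct cgroup_subsys * argument from cgroup subsystem callbacks. After this series a controller's callbacks take the shapes below (shown for a hypothetical "foo" controller; the foo_* names are illustrative only):

static struct cgroup_subsys_state *foo_create(struct cgroup *cgrp);
static void foo_destroy(struct cgroup *cgrp);
static void foo_attach(struct cgroup *cgrp, struct cgroup_taskset *tset);
static void foo_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
		     struct task_struct *task);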
diff --git a/kernel/exit.c b/kernel/exit.c
index 752d2c0abd19..3db1909faed9 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -52,6 +52,7 @@
52#include <linux/hw_breakpoint.h> 52#include <linux/hw_breakpoint.h>
53#include <linux/oom.h> 53#include <linux/oom.h>
54#include <linux/writeback.h> 54#include <linux/writeback.h>
55#include <linux/shm.h>
55 56
56#include <asm/uaccess.h> 57#include <asm/uaccess.h>
57#include <asm/unistd.h> 58#include <asm/unistd.h>
@@ -424,7 +425,7 @@ void daemonize(const char *name, ...)
424 */ 425 */
425 exit_mm(current); 426 exit_mm(current);
426 /* 427 /*
427 * We don't want to have TIF_FREEZE set if the system-wide hibernation 428 * We don't want to get frozen, in case system-wide hibernation
428 * or suspend transition begins right now. 429 * or suspend transition begins right now.
429 */ 430 */
430 current->flags |= (PF_NOFREEZE | PF_KTHREAD); 431 current->flags |= (PF_NOFREEZE | PF_KTHREAD);
@@ -686,11 +687,11 @@ static void exit_mm(struct task_struct * tsk)
686} 687}
687 688
688/* 689/*
689 * When we die, we re-parent all our children. 690 * When we die, we re-parent all our children, and try to:
690 * Try to give them to another thread in our thread 691 * 1. give them to another thread in our thread group, if such a member exists
691 * group, and if no such member exists, give it to 692 * 2. give it to the first ancestor process which prctl'd itself as a
692 * the child reaper process (ie "init") in our pid 693 * child_subreaper for its children (like a service manager)
693 * space. 694 * 3. give it to the init process (PID 1) in our pid namespace
694 */ 695 */
695static struct task_struct *find_new_reaper(struct task_struct *father) 696static struct task_struct *find_new_reaper(struct task_struct *father)
696 __releases(&tasklist_lock) 697 __releases(&tasklist_lock)
@@ -710,8 +711,11 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
710 711
711 if (unlikely(pid_ns->child_reaper == father)) { 712 if (unlikely(pid_ns->child_reaper == father)) {
712 write_unlock_irq(&tasklist_lock); 713 write_unlock_irq(&tasklist_lock);
713 if (unlikely(pid_ns == &init_pid_ns)) 714 if (unlikely(pid_ns == &init_pid_ns)) {
714 panic("Attempted to kill init!"); 715 panic("Attempted to kill init! exitcode=0x%08x\n",
716 father->signal->group_exit_code ?:
717 father->exit_code);
718 }
715 719
716 zap_pid_ns_processes(pid_ns); 720 zap_pid_ns_processes(pid_ns);
717 write_lock_irq(&tasklist_lock); 721 write_lock_irq(&tasklist_lock);
@@ -721,6 +725,29 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
721 * forget_original_parent() must move them somewhere. 725 * forget_original_parent() must move them somewhere.
722 */ 726 */
723 pid_ns->child_reaper = init_pid_ns.child_reaper; 727 pid_ns->child_reaper = init_pid_ns.child_reaper;
728 } else if (father->signal->has_child_subreaper) {
729 struct task_struct *reaper;
730
731 /*
732 * Find the first ancestor marked as child_subreaper.
733 * Note that the code below checks same_thread_group(reaper,
734 * pid_ns->child_reaper). This is what we need to DTRT in a
735 * PID namespace. However we still need the check above, see
736 * http://marc.info/?l=linux-kernel&m=131385460420380
737 */
738 for (reaper = father->real_parent;
739 reaper != &init_task;
740 reaper = reaper->real_parent) {
741 if (same_thread_group(reaper, pid_ns->child_reaper))
742 break;
743 if (!reaper->signal->is_child_subreaper)
744 continue;
745 thread = reaper;
746 do {
747 if (!(thread->flags & PF_EXITING))
748 return reaper;
749 } while_each_thread(reaper, thread);
750 }
724 } 751 }
725 752
726 return pid_ns->child_reaper; 753 return pid_ns->child_reaper;
@@ -818,25 +845,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
818 if (group_dead) 845 if (group_dead)
819 kill_orphaned_pgrp(tsk->group_leader, NULL); 846 kill_orphaned_pgrp(tsk->group_leader, NULL);
820 847
821 /* Let father know we died
822 *
823 * Thread signals are configurable, but you aren't going to use
824 * that to send signals to arbitrary processes.
825 * That stops right now.
826 *
827 * If the parent exec id doesn't match the exec id we saved
828 * when we started then we know the parent has changed security
829 * domain.
830 *
831 * If our self_exec id doesn't match our parent_exec_id then
832 * we have changed execution domain as these two values started
833 * the same after a fork.
834 */
835 if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD &&
836 (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
837 tsk->self_exec_id != tsk->parent_exec_id))
838 tsk->exit_signal = SIGCHLD;
839
840 if (unlikely(tsk->ptrace)) { 848 if (unlikely(tsk->ptrace)) {
841 int sig = thread_group_leader(tsk) && 849 int sig = thread_group_leader(tsk) &&
842 thread_group_empty(tsk) && 850 thread_group_empty(tsk) &&
@@ -953,7 +961,7 @@ void do_exit(long code)
953 acct_update_integrals(tsk); 961 acct_update_integrals(tsk);
954 /* sync mm's RSS info before statistics gathering */ 962 /* sync mm's RSS info before statistics gathering */
955 if (tsk->mm) 963 if (tsk->mm)
956 sync_mm_rss(tsk, tsk->mm); 964 sync_mm_rss(tsk->mm);
957 group_dead = atomic_dec_and_test(&tsk->signal->live); 965 group_dead = atomic_dec_and_test(&tsk->signal->live);
958 if (group_dead) { 966 if (group_dead) {
959 hrtimer_cancel(&tsk->signal->real_timer); 967 hrtimer_cancel(&tsk->signal->real_timer);
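The new reaper-selection rules above pair with the PR_SET_CHILD_SUBREAPER prctl added elsewhere in this merge. A hedged userspace sketch of how a service manager would use it (error handling elided; the fallback #define is only needed with pre-3.4 headers):

#include <sys/prctl.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef PR_SET_CHILD_SUBREAPER
#define PR_SET_CHILD_SUBREAPER 36
#endif

int main(void)
{
	prctl(PR_SET_CHILD_SUBREAPER, 1);	/* adopt orphaned descendants */

	if (fork() == 0) {			/* child */
		if (fork() == 0) {		/* grandchild */
			sleep(1);		/* outlive its parent */
			_exit(0);
		}
		_exit(0);			/* orphan the grandchild */
	}

	/* Both the child and, because of the subreaper flag, the orphaned
	 * grandchild are reaped here rather than by init. */
	while (wait(NULL) > 0)
		;
	return 0;
}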
diff --git a/kernel/fork.c b/kernel/fork.c
index 26a7a6707fa7..b9372a0bff18 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -193,6 +193,7 @@ void __put_task_struct(struct task_struct *tsk)
193 WARN_ON(atomic_read(&tsk->usage)); 193 WARN_ON(atomic_read(&tsk->usage));
194 WARN_ON(tsk == current); 194 WARN_ON(tsk == current);
195 195
196 security_task_free(tsk);
196 exit_creds(tsk); 197 exit_creds(tsk);
197 delayacct_tsk_free(tsk); 198 delayacct_tsk_free(tsk);
198 put_signal_struct(tsk->signal); 199 put_signal_struct(tsk->signal);
@@ -355,7 +356,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
355 charge = 0; 356 charge = 0;
356 if (mpnt->vm_flags & VM_ACCOUNT) { 357 if (mpnt->vm_flags & VM_ACCOUNT) {
357 unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; 358 unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
358 if (security_vm_enough_memory(len)) 359 if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
359 goto fail_nomem; 360 goto fail_nomem;
360 charge = len; 361 charge = len;
361 } 362 }
@@ -511,6 +512,23 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
511 return NULL; 512 return NULL;
512} 513}
513 514
515static void check_mm(struct mm_struct *mm)
516{
517 int i;
518
519 for (i = 0; i < NR_MM_COUNTERS; i++) {
520 long x = atomic_long_read(&mm->rss_stat.count[i]);
521
522 if (unlikely(x))
523 printk(KERN_ALERT "BUG: Bad rss-counter state "
524 "mm:%p idx:%d val:%ld\n", mm, i, x);
525 }
526
527#ifdef CONFIG_TRANSPARENT_HUGEPAGE
528 VM_BUG_ON(mm->pmd_huge_pte);
529#endif
530}
531
514/* 532/*
515 * Allocate and initialize an mm_struct. 533 * Allocate and initialize an mm_struct.
516 */ 534 */
@@ -538,9 +556,7 @@ void __mmdrop(struct mm_struct *mm)
538 mm_free_pgd(mm); 556 mm_free_pgd(mm);
539 destroy_context(mm); 557 destroy_context(mm);
540 mmu_notifier_mm_destroy(mm); 558 mmu_notifier_mm_destroy(mm);
541#ifdef CONFIG_TRANSPARENT_HUGEPAGE 559 check_mm(mm);
542 VM_BUG_ON(mm->pmd_huge_pte);
543#endif
544 free_mm(mm); 560 free_mm(mm);
545} 561}
546EXPORT_SYMBOL_GPL(__mmdrop); 562EXPORT_SYMBOL_GPL(__mmdrop);
@@ -1035,6 +1051,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1035 sig->oom_score_adj = current->signal->oom_score_adj; 1051 sig->oom_score_adj = current->signal->oom_score_adj;
1036 sig->oom_score_adj_min = current->signal->oom_score_adj_min; 1052 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
1037 1053
1054 sig->has_child_subreaper = current->signal->has_child_subreaper ||
1055 current->signal->is_child_subreaper;
1056
1038 mutex_init(&sig->cred_guard_mutex); 1057 mutex_init(&sig->cred_guard_mutex);
1039 1058
1040 return 0; 1059 return 0;
@@ -1222,6 +1241,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1222#ifdef CONFIG_CPUSETS 1241#ifdef CONFIG_CPUSETS
1223 p->cpuset_mem_spread_rotor = NUMA_NO_NODE; 1242 p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
1224 p->cpuset_slab_spread_rotor = NUMA_NO_NODE; 1243 p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
1244 seqcount_init(&p->mems_allowed_seq);
1225#endif 1245#endif
1226#ifdef CONFIG_TRACE_IRQFLAGS 1246#ifdef CONFIG_TRACE_IRQFLAGS
1227 p->irq_events = 0; 1247 p->irq_events = 0;
@@ -1340,7 +1360,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1340 clear_all_latency_tracing(p); 1360 clear_all_latency_tracing(p);
1341 1361
1342 /* ok, now we should be set up.. */ 1362 /* ok, now we should be set up.. */
1343 p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); 1363 if (clone_flags & CLONE_THREAD)
1364 p->exit_signal = -1;
1365 else if (clone_flags & CLONE_PARENT)
1366 p->exit_signal = current->group_leader->exit_signal;
1367 else
1368 p->exit_signal = (clone_flags & CSIGNAL);
1369
1344 p->pdeath_signal = 0; 1370 p->pdeath_signal = 0;
1345 p->exit_state = 0; 1371 p->exit_state = 0;
1346 1372
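The new exit_signal selection in copy_process() can be read as a small pure helper (pick_exit_signal() is an illustrative restatement, not a kernel function): CLONE_PARENT children now inherit the group leader's exit_signal instead of using the raw CSIGNAL bits passed by the caller.

#include <linux/sched.h>

static int pick_exit_signal(unsigned long clone_flags, int leader_exit_signal)
{
	if (clone_flags & CLONE_THREAD)
		return -1;			/* threads don't signal the parent */
	if (clone_flags & CLONE_PARENT)
		return leader_exit_signal;	/* inherit from the group leader */
	return clone_flags & CSIGNAL;		/* e.g. SIGCHLD from fork() */
}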
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 9815b8d1eed5..11f82a4d4eae 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -99,9 +99,9 @@ static void fake_signal_wake_up(struct task_struct *p)
99 * freeze_task - send a freeze request to given task 99 * freeze_task - send a freeze request to given task
100 * @p: task to send the request to 100 * @p: task to send the request to
101 * 101 *
102 * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE 102 * If @p is freezing, the freeze request is sent either by sending a fake
103 * flag and either sending a fake signal to it or waking it up, depending 103 * signal (if it's not a kernel thread) or waking it up (if it's a kernel
104 * on whether it has %PF_FREEZER_NOSIG set. 104 * thread).
105 * 105 *
106 * RETURNS: 106 * RETURNS:
107 * %false, if @p is not freezing or already frozen; %true, otherwise 107 * %false, if @p is not freezing or already frozen; %true, otherwise
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 1f9e26526b69..af48e59bc2ff 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1,189 +1,793 @@
1#include <linux/debugfs.h>
2#include <linux/hardirq.h>
3#include <linux/interrupt.h>
1#include <linux/irq.h> 4#include <linux/irq.h>
5#include <linux/irqdesc.h>
2#include <linux/irqdomain.h> 6#include <linux/irqdomain.h>
3#include <linux/module.h> 7#include <linux/module.h>
4#include <linux/mutex.h> 8#include <linux/mutex.h>
5#include <linux/of.h> 9#include <linux/of.h>
6#include <linux/of_address.h> 10#include <linux/of_address.h>
11#include <linux/seq_file.h>
7#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/smp.h>
14#include <linux/fs.h>
15
16#define IRQ_DOMAIN_MAP_LEGACY 0 /* driver allocated fixed range of irqs.
17 * ie. legacy 8259, gets irqs 1..15 */
18#define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */
19#define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */
20#define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */
8 21
9static LIST_HEAD(irq_domain_list); 22static LIST_HEAD(irq_domain_list);
10static DEFINE_MUTEX(irq_domain_mutex); 23static DEFINE_MUTEX(irq_domain_mutex);
11 24
25static DEFINE_MUTEX(revmap_trees_mutex);
26static unsigned int irq_virq_count = NR_IRQS;
27static struct irq_domain *irq_default_domain;
28
12/** 29/**
13 * irq_domain_add() - Register an irq_domain 30 * irq_domain_alloc() - Allocate a new irq_domain data structure
14 * @domain: ptr to initialized irq_domain structure 31 * @of_node: optional device-tree node of the interrupt controller
32 * @revmap_type: type of reverse mapping to use
33 * @ops: map/unmap domain callbacks
34 * @host_data: Controller private data pointer
15 * 35 *
 16 * Registers an irq_domain structure. The irq_domain must at a minimum be  36 * Allocates and initializes an irq_domain structure. Caller is expected to
17 * initialized with an ops structure pointer, and either a ->to_irq hook or 37 * register allocated irq_domain with irq_domain_register(). Returns pointer
18 * a valid irq_base value. Everything else is optional. 38 * to IRQ domain, or NULL on failure.
19 */ 39 */
20void irq_domain_add(struct irq_domain *domain) 40static struct irq_domain *irq_domain_alloc(struct device_node *of_node,
41 unsigned int revmap_type,
42 const struct irq_domain_ops *ops,
43 void *host_data)
21{ 44{
22 struct irq_data *d; 45 struct irq_domain *domain;
23 int hwirq, irq;
24 46
25 /* 47 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
26 * This assumes that the irq_domain owner has already allocated 48 if (WARN_ON(!domain))
27 * the irq_descs. This block will be removed when support for dynamic 49 return NULL;
28 * allocation of irq_descs is added to irq_domain. 50
29 */ 51 /* Fill structure */
30 irq_domain_for_each_irq(domain, hwirq, irq) { 52 domain->revmap_type = revmap_type;
31 d = irq_get_irq_data(irq); 53 domain->ops = ops;
32 if (!d) { 54 domain->host_data = host_data;
33 WARN(1, "error: assigning domain to non existant irq_desc"); 55 domain->of_node = of_node_get(of_node);
34 return; 56
35 } 57 return domain;
36 if (d->domain) { 58}
37 /* things are broken; just report, don't clean up */ 59
38 WARN(1, "error: irq_desc already assigned to a domain"); 60static void irq_domain_add(struct irq_domain *domain)
39 return; 61{
62 mutex_lock(&irq_domain_mutex);
63 list_add(&domain->link, &irq_domain_list);
64 mutex_unlock(&irq_domain_mutex);
65 pr_debug("irq: Allocated domain of type %d @0x%p\n",
66 domain->revmap_type, domain);
67}
68
69static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,
70 irq_hw_number_t hwirq)
71{
72 irq_hw_number_t first_hwirq = domain->revmap_data.legacy.first_hwirq;
73 int size = domain->revmap_data.legacy.size;
74
75 if (WARN_ON(hwirq < first_hwirq || hwirq >= first_hwirq + size))
76 return 0;
77 return hwirq - first_hwirq + domain->revmap_data.legacy.first_irq;
78}
79
80/**
81 * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain.
82 * @of_node: pointer to interrupt controller's device tree node.
83 * @size: total number of irqs in legacy mapping
84 * @first_irq: first number of irq block assigned to the domain
85 * @first_hwirq: first hwirq number to use for the translation. Should normally
86 * be '0', but a positive integer can be used if the effective
87 * hwirqs numbering does not begin at zero.
88 * @ops: map/unmap domain callbacks
89 * @host_data: Controller private data pointer
90 *
91 * Note: the map() callback will be called before this function returns
92 * for all legacy interrupts except 0 (which is always the invalid irq for
93 * a legacy controller).
94 */
95struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
96 unsigned int size,
97 unsigned int first_irq,
98 irq_hw_number_t first_hwirq,
99 const struct irq_domain_ops *ops,
100 void *host_data)
101{
102 struct irq_domain *domain;
103 unsigned int i;
104
105 domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LEGACY, ops, host_data);
106 if (!domain)
107 return NULL;
108
109 domain->revmap_data.legacy.first_irq = first_irq;
110 domain->revmap_data.legacy.first_hwirq = first_hwirq;
111 domain->revmap_data.legacy.size = size;
112
113 mutex_lock(&irq_domain_mutex);
114 /* Verify that all the irqs are available */
115 for (i = 0; i < size; i++) {
116 int irq = first_irq + i;
117 struct irq_data *irq_data = irq_get_irq_data(irq);
118
119 if (WARN_ON(!irq_data || irq_data->domain)) {
120 mutex_unlock(&irq_domain_mutex);
121 of_node_put(domain->of_node);
122 kfree(domain);
123 return NULL;
40 } 124 }
41 d->domain = domain;
42 d->hwirq = hwirq;
43 } 125 }
44 126
45 mutex_lock(&irq_domain_mutex); 127 /* Claim all of the irqs before registering a legacy domain */
46 list_add(&domain->list, &irq_domain_list); 128 for (i = 0; i < size; i++) {
129 struct irq_data *irq_data = irq_get_irq_data(first_irq + i);
130 irq_data->hwirq = first_hwirq + i;
131 irq_data->domain = domain;
132 }
47 mutex_unlock(&irq_domain_mutex); 133 mutex_unlock(&irq_domain_mutex);
134
135 for (i = 0; i < size; i++) {
136 int irq = first_irq + i;
137 int hwirq = first_hwirq + i;
138
139 /* IRQ0 gets ignored */
140 if (!irq)
141 continue;
142
143 /* Legacy flags are left to default at this point,
144 * one can then use irq_create_mapping() to
145 * explicitly change them
146 */
147 ops->map(domain, irq, hwirq);
148
149 /* Clear norequest flags */
150 irq_clear_status_flags(irq, IRQ_NOREQUEST);
151 }
152
153 irq_domain_add(domain);
154 return domain;
155}
156
157/**
 158 * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain.
159 * @of_node: pointer to interrupt controller's device tree node.
160 * @ops: map/unmap domain callbacks
161 * @host_data: Controller private data pointer
162 */
163struct irq_domain *irq_domain_add_linear(struct device_node *of_node,
164 unsigned int size,
165 const struct irq_domain_ops *ops,
166 void *host_data)
167{
168 struct irq_domain *domain;
169 unsigned int *revmap;
170
171 revmap = kzalloc(sizeof(*revmap) * size, GFP_KERNEL);
172 if (WARN_ON(!revmap))
173 return NULL;
174
175 domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, ops, host_data);
176 if (!domain) {
177 kfree(revmap);
178 return NULL;
179 }
180 domain->revmap_data.linear.size = size;
181 domain->revmap_data.linear.revmap = revmap;
182 irq_domain_add(domain);
183 return domain;
184}
185
186struct irq_domain *irq_domain_add_nomap(struct device_node *of_node,
187 const struct irq_domain_ops *ops,
188 void *host_data)
189{
190 struct irq_domain *domain = irq_domain_alloc(of_node,
191 IRQ_DOMAIN_MAP_NOMAP, ops, host_data);
192 if (domain)
193 irq_domain_add(domain);
194 return domain;
195}
196
197/**
198 * irq_domain_add_tree()
199 * @of_node: pointer to interrupt controller's device tree node.
200 * @ops: map/unmap domain callbacks
201 *
202 * Note: The radix tree will be allocated later during boot automatically
203 * (the reverse mapping will use the slow path until that happens).
204 */
205struct irq_domain *irq_domain_add_tree(struct device_node *of_node,
206 const struct irq_domain_ops *ops,
207 void *host_data)
208{
209 struct irq_domain *domain = irq_domain_alloc(of_node,
210 IRQ_DOMAIN_MAP_TREE, ops, host_data);
211 if (domain) {
212 INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL);
213 irq_domain_add(domain);
214 }
215 return domain;
48} 216}
49 217
50/** 218/**
51 * irq_domain_del() - Unregister an irq_domain 219 * irq_find_host() - Locates a domain for a given device node
52 * @domain: ptr to registered irq_domain. 220 * @node: device-tree node of the interrupt controller
53 */ 221 */
54void irq_domain_del(struct irq_domain *domain) 222struct irq_domain *irq_find_host(struct device_node *node)
55{ 223{
56 struct irq_data *d; 224 struct irq_domain *h, *found = NULL;
57 int hwirq, irq; 225 int rc;
58 226
227 /* We might want to match the legacy controller last since
228 * it might potentially be set to match all interrupts in
229 * the absence of a device node. This isn't a problem so far
230 * yet though...
231 */
59 mutex_lock(&irq_domain_mutex); 232 mutex_lock(&irq_domain_mutex);
60 list_del(&domain->list); 233 list_for_each_entry(h, &irq_domain_list, link) {
234 if (h->ops->match)
235 rc = h->ops->match(h, node);
236 else
237 rc = (h->of_node != NULL) && (h->of_node == node);
238
239 if (rc) {
240 found = h;
241 break;
242 }
243 }
61 mutex_unlock(&irq_domain_mutex); 244 mutex_unlock(&irq_domain_mutex);
245 return found;
246}
247EXPORT_SYMBOL_GPL(irq_find_host);
248
249/**
250 * irq_set_default_host() - Set a "default" irq domain
251 * @domain: default domain pointer
252 *
253 * For convenience, it's possible to set a "default" domain that will be used
254 * whenever NULL is passed to irq_create_mapping(). It makes life easier for
255 * platforms that want to manipulate a few hard coded interrupt numbers that
256 * aren't properly represented in the device-tree.
257 */
258void irq_set_default_host(struct irq_domain *domain)
259{
260 pr_debug("irq: Default domain set to @0x%p\n", domain);
261
262 irq_default_domain = domain;
263}
264
265/**
266 * irq_set_virq_count() - Set the maximum number of linux irqs
267 * @count: number of linux irqs, capped with NR_IRQS
268 *
269 * This is mainly for use by platforms like iSeries who want to program
270 * the virtual irq number in the controller to avoid the reverse mapping
271 */
272void irq_set_virq_count(unsigned int count)
273{
274 pr_debug("irq: Trying to set virq count to %d\n", count);
62 275
63 /* Clear the irq_domain assignments */ 276 BUG_ON(count < NUM_ISA_INTERRUPTS);
64 irq_domain_for_each_irq(domain, hwirq, irq) { 277 if (count < NR_IRQS)
65 d = irq_get_irq_data(irq); 278 irq_virq_count = count;
66 d->domain = NULL; 279}
280
281static int irq_setup_virq(struct irq_domain *domain, unsigned int virq,
282 irq_hw_number_t hwirq)
283{
284 struct irq_data *irq_data = irq_get_irq_data(virq);
285
286 irq_data->hwirq = hwirq;
287 irq_data->domain = domain;
288 if (domain->ops->map(domain, virq, hwirq)) {
289 pr_debug("irq: -> mapping failed, freeing\n");
290 irq_data->domain = NULL;
291 irq_data->hwirq = 0;
292 return -1;
67 } 293 }
294
295 irq_clear_status_flags(virq, IRQ_NOREQUEST);
296
297 return 0;
68} 298}
69 299
70#if defined(CONFIG_OF_IRQ)
71/** 300/**
72 * irq_create_of_mapping() - Map a linux irq number from a DT interrupt spec 301 * irq_create_direct_mapping() - Allocate an irq for direct mapping
302 * @domain: domain to allocate the irq for or NULL for default domain
73 * 303 *
74 * Used by the device tree interrupt mapping code to translate a device tree 304 * This routine is used for irq controllers which can choose the hardware
75 * interrupt specifier to a valid linux irq number. Returns either a valid 305 * interrupt numbers they generate. In such a case it's simplest to use
76 * linux IRQ number or 0. 306 * the linux irq as the hardware interrupt number.
307 */
308unsigned int irq_create_direct_mapping(struct irq_domain *domain)
309{
310 unsigned int virq;
311
312 if (domain == NULL)
313 domain = irq_default_domain;
314
315 BUG_ON(domain == NULL);
316 WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP);
317
318 virq = irq_alloc_desc_from(1, 0);
319 if (!virq) {
320 pr_debug("irq: create_direct virq allocation failed\n");
321 return 0;
322 }
323 if (virq >= irq_virq_count) {
324 pr_err("ERROR: no free irqs available below %i maximum\n",
325 irq_virq_count);
326 irq_free_desc(virq);
327 return 0;
328 }
329
330 pr_debug("irq: create_direct obtained virq %d\n", virq);
331
332 if (irq_setup_virq(domain, virq, virq)) {
333 irq_free_desc(virq);
334 return 0;
335 }
336
337 return virq;
338}
339
340/**
341 * irq_create_mapping() - Map a hardware interrupt into linux irq space
342 * @domain: domain owning this hardware interrupt or NULL for default domain
343 * @hwirq: hardware irq number in that domain space
77 * 344 *
78 * When the caller no longer need the irq number returned by this function it 345 * Only one mapping per hardware interrupt is permitted. Returns a linux
79 * should arrange to call irq_dispose_mapping(). 346 * irq number.
347 * If the sense/trigger is to be specified, set_irq_type() should be called
348 * on the number returned from that call.
80 */ 349 */
350unsigned int irq_create_mapping(struct irq_domain *domain,
351 irq_hw_number_t hwirq)
352{
353 unsigned int virq, hint;
354
355 pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
356
 357 /* Look for default domain if necessary */
358 if (domain == NULL)
359 domain = irq_default_domain;
360 if (domain == NULL) {
361 printk(KERN_WARNING "irq_create_mapping called for"
362 " NULL domain, hwirq=%lx\n", hwirq);
363 WARN_ON(1);
364 return 0;
365 }
366 pr_debug("irq: -> using domain @%p\n", domain);
367
368 /* Check if mapping already exists */
369 virq = irq_find_mapping(domain, hwirq);
370 if (virq) {
371 pr_debug("irq: -> existing mapping on virq %d\n", virq);
372 return virq;
373 }
374
375 /* Get a virtual interrupt number */
376 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
377 return irq_domain_legacy_revmap(domain, hwirq);
378
379 /* Allocate a virtual interrupt number */
380 hint = hwirq % irq_virq_count;
381 if (hint == 0)
382 hint++;
383 virq = irq_alloc_desc_from(hint, 0);
384 if (!virq)
385 virq = irq_alloc_desc_from(1, 0);
386 if (!virq) {
387 pr_debug("irq: -> virq allocation failed\n");
388 return 0;
389 }
390
391 if (irq_setup_virq(domain, virq, hwirq)) {
392 if (domain->revmap_type != IRQ_DOMAIN_MAP_LEGACY)
393 irq_free_desc(virq);
394 return 0;
395 }
396
397 pr_debug("irq: irq %lu on domain %s mapped to virtual irq %u\n",
398 hwirq, domain->of_node ? domain->of_node->full_name : "null", virq);
399
400 return virq;
401}
402EXPORT_SYMBOL_GPL(irq_create_mapping);
403
81unsigned int irq_create_of_mapping(struct device_node *controller, 404unsigned int irq_create_of_mapping(struct device_node *controller,
82 const u32 *intspec, unsigned int intsize) 405 const u32 *intspec, unsigned int intsize)
83{ 406{
84 struct irq_domain *domain; 407 struct irq_domain *domain;
85 unsigned long hwirq; 408 irq_hw_number_t hwirq;
86 unsigned int irq, type; 409 unsigned int type = IRQ_TYPE_NONE;
87 int rc = -EINVAL; 410 unsigned int virq;
88 411
89 /* Find a domain which can translate the irq spec */ 412 domain = controller ? irq_find_host(controller) : irq_default_domain;
90 mutex_lock(&irq_domain_mutex); 413 if (!domain) {
91 list_for_each_entry(domain, &irq_domain_list, list) { 414#ifdef CONFIG_MIPS
92 if (!domain->ops->dt_translate) 415 /*
93 continue; 416 * Workaround to avoid breaking interrupt controller drivers
94 rc = domain->ops->dt_translate(domain, controller, 417 * that don't yet register an irq_domain. This is temporary
95 intspec, intsize, &hwirq, &type); 418 * code. ~~~gcl, Feb 24, 2012
96 if (rc == 0) 419 *
97 break; 420 * Scheduled for removal in Linux v3.6. That should be enough
421 * time.
422 */
423 if (intsize > 0)
424 return intspec[0];
425#endif
426 printk(KERN_WARNING "irq: no irq domain found for %s !\n",
427 controller->full_name);
428 return 0;
98 } 429 }
99 mutex_unlock(&irq_domain_mutex);
100 430
101 if (rc != 0) 431 /* If domain has no translation, then we assume interrupt line */
102 return 0; 432 if (domain->ops->xlate == NULL)
433 hwirq = intspec[0];
434 else {
435 if (domain->ops->xlate(domain, controller, intspec, intsize,
436 &hwirq, &type))
437 return 0;
438 }
439
440 /* Create mapping */
441 virq = irq_create_mapping(domain, hwirq);
442 if (!virq)
443 return virq;
103 444
104 irq = irq_domain_to_irq(domain, hwirq); 445 /* Set type if specified and different than the current one */
105 if (type != IRQ_TYPE_NONE) 446 if (type != IRQ_TYPE_NONE &&
106 irq_set_irq_type(irq, type); 447 type != (irqd_get_trigger_type(irq_get_irq_data(virq))))
107 pr_debug("%s: mapped hwirq=%i to irq=%i, flags=%x\n", 448 irq_set_irq_type(virq, type);
108 controller->full_name, (int)hwirq, irq, type); 449 return virq;
109 return irq;
110} 450}
111EXPORT_SYMBOL_GPL(irq_create_of_mapping); 451EXPORT_SYMBOL_GPL(irq_create_of_mapping);
112 452
113/** 453/**
114 * irq_dispose_mapping() - Discard a mapping created by irq_create_of_mapping() 454 * irq_dispose_mapping() - Unmap an interrupt
115 * @irq: linux irq number to be discarded 455 * @virq: linux irq number of the interrupt to unmap
456 */
457void irq_dispose_mapping(unsigned int virq)
458{
459 struct irq_data *irq_data = irq_get_irq_data(virq);
460 struct irq_domain *domain;
461 irq_hw_number_t hwirq;
462
463 if (!virq || !irq_data)
464 return;
465
466 domain = irq_data->domain;
467 if (WARN_ON(domain == NULL))
468 return;
469
470 /* Never unmap legacy interrupts */
471 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
472 return;
473
474 irq_set_status_flags(virq, IRQ_NOREQUEST);
475
476 /* remove chip and handler */
477 irq_set_chip_and_handler(virq, NULL, NULL);
478
479 /* Make sure it's completed */
480 synchronize_irq(virq);
481
482 /* Tell the PIC about it */
483 if (domain->ops->unmap)
484 domain->ops->unmap(domain, virq);
485 smp_mb();
486
487 /* Clear reverse map */
488 hwirq = irq_data->hwirq;
489 switch(domain->revmap_type) {
490 case IRQ_DOMAIN_MAP_LINEAR:
491 if (hwirq < domain->revmap_data.linear.size)
492 domain->revmap_data.linear.revmap[hwirq] = 0;
493 break;
494 case IRQ_DOMAIN_MAP_TREE:
495 mutex_lock(&revmap_trees_mutex);
496 radix_tree_delete(&domain->revmap_data.tree, hwirq);
497 mutex_unlock(&revmap_trees_mutex);
498 break;
499 }
500
501 irq_free_desc(virq);
502}
503EXPORT_SYMBOL_GPL(irq_dispose_mapping);
504
505/**
506 * irq_find_mapping() - Find a linux irq from an hw irq number.
507 * @domain: domain owning this hardware interrupt
508 * @hwirq: hardware irq number in that domain space
509 *
510 * This is a slow path, for use by generic code. It's expected that an
511 * irq controller implementation directly calls the appropriate low level
512 * mapping function.
513 */
514unsigned int irq_find_mapping(struct irq_domain *domain,
515 irq_hw_number_t hwirq)
516{
517 unsigned int i;
518 unsigned int hint = hwirq % irq_virq_count;
519
 520 /* Look for default domain if necessary */
521 if (domain == NULL)
522 domain = irq_default_domain;
523 if (domain == NULL)
524 return 0;
525
526 /* legacy -> bail early */
527 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
528 return irq_domain_legacy_revmap(domain, hwirq);
529
530 /* Slow path does a linear search of the map */
531 if (hint == 0)
532 hint = 1;
533 i = hint;
534 do {
535 struct irq_data *data = irq_get_irq_data(i);
536 if (data && (data->domain == domain) && (data->hwirq == hwirq))
537 return i;
538 i++;
539 if (i >= irq_virq_count)
540 i = 1;
541 } while(i != hint);
542 return 0;
543}
544EXPORT_SYMBOL_GPL(irq_find_mapping);
545
546/**
547 * irq_radix_revmap_lookup() - Find a linux irq from a hw irq number.
548 * @domain: domain owning this hardware interrupt
549 * @hwirq: hardware irq number in that domain space
116 * 550 *
117 * Calling this function indicates the caller no longer needs a reference to 551 * This is a fast path, for use by irq controller code that uses radix tree
118 * the linux irq number returned by a prior call to irq_create_of_mapping(). 552 * revmaps
119 */ 553 */
120void irq_dispose_mapping(unsigned int irq) 554unsigned int irq_radix_revmap_lookup(struct irq_domain *domain,
555 irq_hw_number_t hwirq)
121{ 556{
557 struct irq_data *irq_data;
558
559 if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_TREE))
560 return irq_find_mapping(domain, hwirq);
561
562 /*
563 * Freeing an irq can delete nodes along the path to
564 * do the lookup via call_rcu.
565 */
566 rcu_read_lock();
567 irq_data = radix_tree_lookup(&domain->revmap_data.tree, hwirq);
568 rcu_read_unlock();
569
122 /* 570 /*
123 * nothing yet; will be filled when support for dynamic allocation of 571 * If found in radix tree, then fine.
124 * irq_descs is added to irq_domain 572 * Else fallback to linear lookup - this should not happen in practice
573 * as it means that we failed to insert the node in the radix tree.
125 */ 574 */
575 return irq_data ? irq_data->irq : irq_find_mapping(domain, hwirq);
126} 576}
127EXPORT_SYMBOL_GPL(irq_dispose_mapping);
128 577
129int irq_domain_simple_dt_translate(struct irq_domain *d, 578/**
130 struct device_node *controller, 579 * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping.
131 const u32 *intspec, unsigned int intsize, 580 * @domain: domain owning this hardware interrupt
132 unsigned long *out_hwirq, unsigned int *out_type) 581 * @virq: linux irq number
582 * @hwirq: hardware irq number in that domain space
583 *
584 * This is for use by irq controllers that use a radix tree reverse
585 * mapping for fast lookup.
586 */
587void irq_radix_revmap_insert(struct irq_domain *domain, unsigned int virq,
588 irq_hw_number_t hwirq)
133{ 589{
134 if (d->of_node != controller) 590 struct irq_data *irq_data = irq_get_irq_data(virq);
135 return -EINVAL; 591
136 if (intsize < 1) 592 if (WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_TREE))
137 return -EINVAL; 593 return;
138 if (d->nr_irq && ((intspec[0] < d->hwirq_base) || 594
139 (intspec[0] >= d->hwirq_base + d->nr_irq))) 595 if (virq) {
140 return -EINVAL; 596 mutex_lock(&revmap_trees_mutex);
597 radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data);
598 mutex_unlock(&revmap_trees_mutex);
599 }
600}
601
602/**
603 * irq_linear_revmap() - Find a linux irq from a hw irq number.
604 * @domain: domain owning this hardware interrupt
605 * @hwirq: hardware irq number in that domain space
606 *
607 * This is a fast path, for use by irq controller code that uses linear
608 * revmaps. It does fallback to the slow path if the revmap doesn't exist
609 * yet and will create the revmap entry with appropriate locking
610 */
611unsigned int irq_linear_revmap(struct irq_domain *domain,
612 irq_hw_number_t hwirq)
613{
614 unsigned int *revmap;
615
616 if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR))
617 return irq_find_mapping(domain, hwirq);
618
619 /* Check revmap bounds */
620 if (unlikely(hwirq >= domain->revmap_data.linear.size))
621 return irq_find_mapping(domain, hwirq);
622
623 /* Check if revmap was allocated */
624 revmap = domain->revmap_data.linear.revmap;
625 if (unlikely(revmap == NULL))
626 return irq_find_mapping(domain, hwirq);
627
628 /* Fill up revmap with slow path if no mapping found */
629 if (unlikely(!revmap[hwirq]))
630 revmap[hwirq] = irq_find_mapping(domain, hwirq);
631
632 return revmap[hwirq];
633}
634
635#ifdef CONFIG_VIRQ_DEBUG
636static int virq_debug_show(struct seq_file *m, void *private)
637{
638 unsigned long flags;
639 struct irq_desc *desc;
640 const char *p;
641 static const char none[] = "none";
642 void *data;
643 int i;
644
645 seq_printf(m, "%-5s %-7s %-15s %-18s %s\n", "virq", "hwirq",
646 "chip name", "chip data", "domain name");
647
648 for (i = 1; i < nr_irqs; i++) {
649 desc = irq_to_desc(i);
650 if (!desc)
651 continue;
652
653 raw_spin_lock_irqsave(&desc->lock, flags);
654
655 if (desc->action && desc->action->handler) {
656 struct irq_chip *chip;
657
658 seq_printf(m, "%5d ", i);
659 seq_printf(m, "0x%05lx ", desc->irq_data.hwirq);
660
661 chip = irq_desc_get_chip(desc);
662 if (chip && chip->name)
663 p = chip->name;
664 else
665 p = none;
666 seq_printf(m, "%-15s ", p);
667
668 data = irq_desc_get_chip_data(desc);
669 seq_printf(m, "0x%16p ", data);
670
671 if (desc->irq_data.domain->of_node)
672 p = desc->irq_data.domain->of_node->full_name;
673 else
674 p = none;
675 seq_printf(m, "%s\n", p);
676 }
677
678 raw_spin_unlock_irqrestore(&desc->lock, flags);
679 }
680
681 return 0;
682}
141 683
684static int virq_debug_open(struct inode *inode, struct file *file)
685{
686 return single_open(file, virq_debug_show, inode->i_private);
687}
688
689static const struct file_operations virq_debug_fops = {
690 .open = virq_debug_open,
691 .read = seq_read,
692 .llseek = seq_lseek,
693 .release = single_release,
694};
695
696static int __init irq_debugfs_init(void)
697{
698 if (debugfs_create_file("virq_mapping", S_IRUGO, powerpc_debugfs_root,
699 NULL, &virq_debug_fops) == NULL)
700 return -ENOMEM;
701
702 return 0;
703}
704__initcall(irq_debugfs_init);
705#endif /* CONFIG_VIRQ_DEBUG */
706
707int irq_domain_simple_map(struct irq_domain *d, unsigned int irq,
708 irq_hw_number_t hwirq)
709{
710 return 0;
711}
712
713/**
714 * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings
715 *
716 * Device Tree IRQ specifier translation function which works with one cell
717 * bindings where the cell value maps directly to the hwirq number.
718 */
719int irq_domain_xlate_onecell(struct irq_domain *d, struct device_node *ctrlr,
720 const u32 *intspec, unsigned int intsize,
721 unsigned long *out_hwirq, unsigned int *out_type)
722{
723 if (WARN_ON(intsize < 1))
724 return -EINVAL;
142 *out_hwirq = intspec[0]; 725 *out_hwirq = intspec[0];
143 *out_type = IRQ_TYPE_NONE; 726 *out_type = IRQ_TYPE_NONE;
144 if (intsize > 1)
145 *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
146 return 0; 727 return 0;
147} 728}
729EXPORT_SYMBOL_GPL(irq_domain_xlate_onecell);
148 730
149/** 731/**
150 * irq_domain_create_simple() - Set up a 'simple' translation range 732 * irq_domain_xlate_twocell() - Generic xlate for direct two cell bindings
733 *
734 * Device Tree IRQ specifier translation function which works with two cell
735 * bindings where the cell values map directly to the hwirq number
736 * and linux irq flags.
151 */ 737 */
152void irq_domain_add_simple(struct device_node *controller, int irq_base) 738int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr,
739 const u32 *intspec, unsigned int intsize,
740 irq_hw_number_t *out_hwirq, unsigned int *out_type)
153{ 741{
154 struct irq_domain *domain; 742 if (WARN_ON(intsize < 2))
155 743 return -EINVAL;
156 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 744 *out_hwirq = intspec[0];
157 if (!domain) { 745 *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
158 WARN_ON(1); 746 return 0;
159 return; 747}
160 } 748EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell);
161 749
162 domain->irq_base = irq_base; 750/**
163 domain->of_node = of_node_get(controller); 751 * irq_domain_xlate_onetwocell() - Generic xlate for one or two cell bindings
164 domain->ops = &irq_domain_simple_ops; 752 *
165 irq_domain_add(domain); 753 * Device Tree IRQ specifier translation function which works with either one
754 * or two cell bindings where the cell values map directly to the hwirq number
755 * and linux irq flags.
756 *
757 * Note: don't use this function unless your interrupt controller explicitly
758 * supports both one and two cell bindings. For the majority of controllers
759 * the _onecell() or _twocell() variants above should be used.
760 */
761int irq_domain_xlate_onetwocell(struct irq_domain *d,
762 struct device_node *ctrlr,
763 const u32 *intspec, unsigned int intsize,
764 unsigned long *out_hwirq, unsigned int *out_type)
765{
766 if (WARN_ON(intsize < 1))
767 return -EINVAL;
768 *out_hwirq = intspec[0];
769 *out_type = (intsize > 1) ? intspec[1] : IRQ_TYPE_NONE;
770 return 0;
166} 771}
167EXPORT_SYMBOL_GPL(irq_domain_add_simple); 772EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell);
168 773
774const struct irq_domain_ops irq_domain_simple_ops = {
775 .map = irq_domain_simple_map,
776 .xlate = irq_domain_xlate_onetwocell,
777};
778EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
779
780#ifdef CONFIG_OF_IRQ
169void irq_domain_generate_simple(const struct of_device_id *match, 781void irq_domain_generate_simple(const struct of_device_id *match,
170 u64 phys_base, unsigned int irq_start) 782 u64 phys_base, unsigned int irq_start)
171{ 783{
172 struct device_node *node; 784 struct device_node *node;
173 pr_info("looking for phys_base=%llx, irq_start=%i\n", 785 pr_debug("looking for phys_base=%llx, irq_start=%i\n",
174 (unsigned long long) phys_base, (int) irq_start); 786 (unsigned long long) phys_base, (int) irq_start);
175 node = of_find_matching_node_by_address(NULL, match, phys_base); 787 node = of_find_matching_node_by_address(NULL, match, phys_base);
176 if (node) 788 if (node)
177 irq_domain_add_simple(node, irq_start); 789 irq_domain_add_legacy(node, 32, irq_start, 0,
178 else 790 &irq_domain_simple_ops, NULL);
179 pr_info("no node found\n");
180} 791}
181EXPORT_SYMBOL_GPL(irq_domain_generate_simple); 792EXPORT_SYMBOL_GPL(irq_domain_generate_simple);
182#endif /* CONFIG_OF_IRQ */ 793#endif
183
184struct irq_domain_ops irq_domain_simple_ops = {
185#ifdef CONFIG_OF_IRQ
186 .dt_translate = irq_domain_simple_dt_translate,
187#endif /* CONFIG_OF_IRQ */
188};
189EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
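With the rewritten irqdomain core above, a typical interrupt controller driver registers a domain once and then converts hardware irq numbers to Linux irq numbers on demand. A sketch under assumed names (the "foo" chip, foo_irq_map(), foo_init() and foo_handle_hwirq() are illustrative, not part of this patch):

#include <linux/irq.h>
#include <linux/irqdesc.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

#define FOO_NR_IRQS	32

static struct irq_chip foo_chip;	/* .irq_mask/.irq_unmask set up elsewhere */
static struct irq_domain *foo_domain;

static int foo_irq_map(struct irq_domain *d, unsigned int virq,
		       irq_hw_number_t hwirq)
{
	irq_set_chip_and_handler(virq, &foo_chip, handle_level_irq);
	return 0;
}

static const struct irq_domain_ops foo_domain_ops = {
	.map	= foo_irq_map,
	.xlate	= irq_domain_xlate_onetwocell,
};

static void foo_init(struct device_node *np)
{
	foo_domain = irq_domain_add_linear(np, FOO_NR_IRQS,
					   &foo_domain_ops, NULL);
}

/* In the flow handler: translate a pending hwirq and hand it off. */
static void foo_handle_hwirq(irq_hw_number_t hwirq)
{
	unsigned int virq = irq_linear_revmap(foo_domain, hwirq);

	if (virq)
		generic_handle_irq(virq);
}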
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 7b0886786701..a6a675cb9818 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1546,13 +1546,13 @@ int kernel_kexec(void)
1546 if (error) 1546 if (error)
1547 goto Resume_console; 1547 goto Resume_console;
1548 /* At this point, dpm_suspend_start() has been called, 1548 /* At this point, dpm_suspend_start() has been called,
1549 * but *not* dpm_suspend_noirq(). We *must* call 1549 * but *not* dpm_suspend_end(). We *must* call
1550 * dpm_suspend_noirq() now. Otherwise, drivers for 1550 * dpm_suspend_end() now. Otherwise, drivers for
1551 * some devices (e.g. interrupt controllers) become 1551 * some devices (e.g. interrupt controllers) become
1552 * desynchronized with the actual state of the 1552 * desynchronized with the actual state of the
1553 * hardware at resume time, and evil weirdness ensues. 1553 * hardware at resume time, and evil weirdness ensues.
1554 */ 1554 */
1555 error = dpm_suspend_noirq(PMSG_FREEZE); 1555 error = dpm_suspend_end(PMSG_FREEZE);
1556 if (error) 1556 if (error)
1557 goto Resume_devices; 1557 goto Resume_devices;
1558 error = disable_nonboot_cpus(); 1558 error = disable_nonboot_cpus();
@@ -1579,7 +1579,7 @@ int kernel_kexec(void)
1579 local_irq_enable(); 1579 local_irq_enable();
1580 Enable_cpus: 1580 Enable_cpus:
1581 enable_nonboot_cpus(); 1581 enable_nonboot_cpus();
1582 dpm_resume_noirq(PMSG_RESTORE); 1582 dpm_resume_start(PMSG_RESTORE);
1583 Resume_devices: 1583 Resume_devices:
1584 dpm_resume_end(PMSG_RESTORE); 1584 dpm_resume_end(PMSG_RESTORE);
1585 Resume_console: 1585 Resume_console:
diff --git a/kernel/kmod.c b/kernel/kmod.c
index a0a88543934e..957a7aab8ebc 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -60,6 +60,43 @@ static DECLARE_RWSEM(umhelper_sem);
60*/ 60*/
61char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; 61char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
62 62
63static void free_modprobe_argv(struct subprocess_info *info)
64{
65 kfree(info->argv[3]); /* check call_modprobe() */
66 kfree(info->argv);
67}
68
69static int call_modprobe(char *module_name, int wait)
70{
71 static char *envp[] = {
72 "HOME=/",
73 "TERM=linux",
74 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
75 NULL
76 };
77
78 char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL);
79 if (!argv)
80 goto out;
81
82 module_name = kstrdup(module_name, GFP_KERNEL);
83 if (!module_name)
84 goto free_argv;
85
86 argv[0] = modprobe_path;
87 argv[1] = "-q";
88 argv[2] = "--";
89 argv[3] = module_name; /* check free_modprobe_argv() */
90 argv[4] = NULL;
91
92 return call_usermodehelper_fns(modprobe_path, argv, envp,
93 wait | UMH_KILLABLE, NULL, free_modprobe_argv, NULL);
94free_argv:
95 kfree(argv);
96out:
97 return -ENOMEM;
98}
99
63/** 100/**
64 * __request_module - try to load a kernel module 101 * __request_module - try to load a kernel module
65 * @wait: wait (or not) for the operation to complete 102 * @wait: wait (or not) for the operation to complete
@@ -81,11 +118,6 @@ int __request_module(bool wait, const char *fmt, ...)
81 char module_name[MODULE_NAME_LEN]; 118 char module_name[MODULE_NAME_LEN];
82 unsigned int max_modprobes; 119 unsigned int max_modprobes;
83 int ret; 120 int ret;
84 char *argv[] = { modprobe_path, "-q", "--", module_name, NULL };
85 static char *envp[] = { "HOME=/",
86 "TERM=linux",
87 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
88 NULL };
89 static atomic_t kmod_concurrent = ATOMIC_INIT(0); 121 static atomic_t kmod_concurrent = ATOMIC_INIT(0);
90#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ 122#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
91 static int kmod_loop_msg; 123 static int kmod_loop_msg;
@@ -128,9 +160,7 @@ int __request_module(bool wait, const char *fmt, ...)
128 160
129 trace_module_request(module_name, wait, _RET_IP_); 161 trace_module_request(module_name, wait, _RET_IP_);
130 162
131 ret = call_usermodehelper_fns(modprobe_path, argv, envp, 163 ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
132 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC,
133 NULL, NULL, NULL);
134 164
135 atomic_dec(&kmod_concurrent); 165 atomic_dec(&kmod_concurrent);
136 return ret; 166 return ret;
@@ -188,7 +218,7 @@ static int ____call_usermodehelper(void *data)
188 /* Exec failed? */ 218 /* Exec failed? */
189fail: 219fail:
190 sub_info->retval = retval; 220 sub_info->retval = retval;
191 do_exit(0); 221 return 0;
192} 222}
193 223
194void call_usermodehelper_freeinfo(struct subprocess_info *info) 224void call_usermodehelper_freeinfo(struct subprocess_info *info)
@@ -199,6 +229,19 @@ void call_usermodehelper_freeinfo(struct subprocess_info *info)
199} 229}
200EXPORT_SYMBOL(call_usermodehelper_freeinfo); 230EXPORT_SYMBOL(call_usermodehelper_freeinfo);
201 231
232static void umh_complete(struct subprocess_info *sub_info)
233{
234 struct completion *comp = xchg(&sub_info->complete, NULL);
235 /*
236 * See call_usermodehelper_exec(). If xchg() returns NULL
237 * we own sub_info, the UMH_KILLABLE caller has gone away.
238 */
239 if (comp)
240 complete(comp);
241 else
242 call_usermodehelper_freeinfo(sub_info);
243}
244
202/* Keventd can't block, but this (a child) can. */ 245/* Keventd can't block, but this (a child) can. */
203static int wait_for_helper(void *data) 246static int wait_for_helper(void *data)
204{ 247{
@@ -235,7 +278,7 @@ static int wait_for_helper(void *data)
235 sub_info->retval = ret; 278 sub_info->retval = ret;
236 } 279 }
237 280
238 complete(sub_info->complete); 281 umh_complete(sub_info);
239 return 0; 282 return 0;
240} 283}
241 284
@@ -244,7 +287,7 @@ static void __call_usermodehelper(struct work_struct *work)
244{ 287{
245 struct subprocess_info *sub_info = 288 struct subprocess_info *sub_info =
246 container_of(work, struct subprocess_info, work); 289 container_of(work, struct subprocess_info, work);
247 enum umh_wait wait = sub_info->wait; 290 int wait = sub_info->wait & ~UMH_KILLABLE;
248 pid_t pid; 291 pid_t pid;
249 292
250 /* CLONE_VFORK: wait until the usermode helper has execve'd 293 /* CLONE_VFORK: wait until the usermode helper has execve'd
@@ -269,7 +312,7 @@ static void __call_usermodehelper(struct work_struct *work)
269 case UMH_WAIT_EXEC: 312 case UMH_WAIT_EXEC:
270 if (pid < 0) 313 if (pid < 0)
271 sub_info->retval = pid; 314 sub_info->retval = pid;
272 complete(sub_info->complete); 315 umh_complete(sub_info);
273 } 316 }
274} 317}
275 318
@@ -435,8 +478,7 @@ EXPORT_SYMBOL(call_usermodehelper_setfns);
435 * asynchronously if wait is not set, and runs as a child of keventd. 478 * asynchronously if wait is not set, and runs as a child of keventd.
436 * (ie. it runs with full root capabilities). 479 * (ie. it runs with full root capabilities).
437 */ 480 */
438int call_usermodehelper_exec(struct subprocess_info *sub_info, 481int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
439 enum umh_wait wait)
440{ 482{
441 DECLARE_COMPLETION_ONSTACK(done); 483 DECLARE_COMPLETION_ONSTACK(done);
442 int retval = 0; 484 int retval = 0;
@@ -456,9 +498,21 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
456 queue_work(khelper_wq, &sub_info->work); 498 queue_work(khelper_wq, &sub_info->work);
457 if (wait == UMH_NO_WAIT) /* task has freed sub_info */ 499 if (wait == UMH_NO_WAIT) /* task has freed sub_info */
458 goto unlock; 500 goto unlock;
501
502 if (wait & UMH_KILLABLE) {
503 retval = wait_for_completion_killable(&done);
504 if (!retval)
505 goto wait_done;
506
507 /* umh_complete() will see NULL and free sub_info */
508 if (xchg(&sub_info->complete, NULL))
509 goto unlock;
510 /* fallthrough, umh_complete() was already called */
511 }
512
459 wait_for_completion(&done); 513 wait_for_completion(&done);
514wait_done:
460 retval = sub_info->retval; 515 retval = sub_info->retval;
461
462out: 516out:
463 call_usermodehelper_freeinfo(sub_info); 517 call_usermodehelper_freeinfo(sub_info);
464unlock: 518unlock:
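The call_modprobe() pattern above generalizes to any killable usermode helper: hand ownership of argv to a cleanup callback and pass UMH_KILLABLE so a killed caller does not leak the subprocess_info. A sketch with assumed names (/sbin/foo-helper, call_foo_helper() and free_foo_argv() are illustrative):

#include <linux/errno.h>
#include <linux/kmod.h>
#include <linux/slab.h>

static void free_foo_argv(struct subprocess_info *info)
{
	kfree(info->argv);
}

static int call_foo_helper(char *arg)
{
	static char *envp[] = {
		"HOME=/",
		"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
		NULL
	};
	char **argv = kmalloc(sizeof(char *[3]), GFP_KERNEL);

	if (!argv)
		return -ENOMEM;

	argv[0] = "/sbin/foo-helper";
	argv[1] = arg;		/* caller's string; caller frees it */
	argv[2] = NULL;

	/* UMH_WAIT_PROC: synchronous, so arg need not be duplicated here. */
	return call_usermodehelper_fns(argv[0], argv, envp,
				       UMH_WAIT_PROC | UMH_KILLABLE,
				       NULL, free_foo_argv, NULL);
}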
diff --git a/kernel/padata.c b/kernel/padata.c
index b45259931512..6f10eb285ece 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -29,7 +29,6 @@
29#include <linux/sysfs.h> 29#include <linux/sysfs.h>
30#include <linux/rcupdate.h> 30#include <linux/rcupdate.h>
31 31
32#define MAX_SEQ_NR (INT_MAX - NR_CPUS)
33#define MAX_OBJ_NUM 1000 32#define MAX_OBJ_NUM 1000
34 33
35static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) 34static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
@@ -43,18 +42,19 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
43 return target_cpu; 42 return target_cpu;
44} 43}
45 44
46static int padata_cpu_hash(struct padata_priv *padata) 45static int padata_cpu_hash(struct parallel_data *pd)
47{ 46{
48 int cpu_index; 47 int cpu_index;
49 struct parallel_data *pd;
50
51 pd = padata->pd;
52 48
53 /* 49 /*
54 * Hash the sequence numbers to the cpus by taking 50 * Hash the sequence numbers to the cpus by taking
55 * seq_nr mod. number of cpus in use. 51 * seq_nr mod. number of cpus in use.
56 */ 52 */
57 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask.pcpu); 53
54 spin_lock(&pd->seq_lock);
55 cpu_index = pd->seq_nr % cpumask_weight(pd->cpumask.pcpu);
56 pd->seq_nr++;
57 spin_unlock(&pd->seq_lock);
58 58
59 return padata_index_to_cpu(pd, cpu_index); 59 return padata_index_to_cpu(pd, cpu_index);
60} 60}
@@ -132,12 +132,7 @@ int padata_do_parallel(struct padata_instance *pinst,
132 padata->pd = pd; 132 padata->pd = pd;
133 padata->cb_cpu = cb_cpu; 133 padata->cb_cpu = cb_cpu;
134 134
135 if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr)) 135 target_cpu = padata_cpu_hash(pd);
136 atomic_set(&pd->seq_nr, -1);
137
138 padata->seq_nr = atomic_inc_return(&pd->seq_nr);
139
140 target_cpu = padata_cpu_hash(padata);
141 queue = per_cpu_ptr(pd->pqueue, target_cpu); 136 queue = per_cpu_ptr(pd->pqueue, target_cpu);
142 137
143 spin_lock(&queue->parallel.lock); 138 spin_lock(&queue->parallel.lock);
@@ -173,7 +168,7 @@ EXPORT_SYMBOL(padata_do_parallel);
173static struct padata_priv *padata_get_next(struct parallel_data *pd) 168static struct padata_priv *padata_get_next(struct parallel_data *pd)
174{ 169{
175 int cpu, num_cpus; 170 int cpu, num_cpus;
176 int next_nr, next_index; 171 unsigned int next_nr, next_index;
177 struct padata_parallel_queue *queue, *next_queue; 172 struct padata_parallel_queue *queue, *next_queue;
178 struct padata_priv *padata; 173 struct padata_priv *padata;
179 struct padata_list *reorder; 174 struct padata_list *reorder;
@@ -189,14 +184,6 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
189 cpu = padata_index_to_cpu(pd, next_index); 184 cpu = padata_index_to_cpu(pd, next_index);
190 next_queue = per_cpu_ptr(pd->pqueue, cpu); 185 next_queue = per_cpu_ptr(pd->pqueue, cpu);
191 186
192 if (unlikely(next_nr > pd->max_seq_nr)) {
193 next_nr = next_nr - pd->max_seq_nr - 1;
194 next_index = next_nr % num_cpus;
195 cpu = padata_index_to_cpu(pd, next_index);
196 next_queue = per_cpu_ptr(pd->pqueue, cpu);
197 pd->processed = 0;
198 }
199
200 padata = NULL; 187 padata = NULL;
201 188
202 reorder = &next_queue->reorder; 189 reorder = &next_queue->reorder;
@@ -205,8 +192,6 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
205 padata = list_entry(reorder->list.next, 192 padata = list_entry(reorder->list.next,
206 struct padata_priv, list); 193 struct padata_priv, list);
207 194
208 BUG_ON(next_nr != padata->seq_nr);
209
210 spin_lock(&reorder->lock); 195 spin_lock(&reorder->lock);
211 list_del_init(&padata->list); 196 list_del_init(&padata->list);
212 atomic_dec(&pd->reorder_objects); 197 atomic_dec(&pd->reorder_objects);
@@ -230,6 +215,7 @@ out:
230 215
231static void padata_reorder(struct parallel_data *pd) 216static void padata_reorder(struct parallel_data *pd)
232{ 217{
218 int cb_cpu;
233 struct padata_priv *padata; 219 struct padata_priv *padata;
234 struct padata_serial_queue *squeue; 220 struct padata_serial_queue *squeue;
235 struct padata_instance *pinst = pd->pinst; 221 struct padata_instance *pinst = pd->pinst;
@@ -270,13 +256,14 @@ static void padata_reorder(struct parallel_data *pd)
270 return; 256 return;
271 } 257 }
272 258
273 squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu); 259 cb_cpu = padata->cb_cpu;
260 squeue = per_cpu_ptr(pd->squeue, cb_cpu);
274 261
275 spin_lock(&squeue->serial.lock); 262 spin_lock(&squeue->serial.lock);
276 list_add_tail(&padata->list, &squeue->serial.list); 263 list_add_tail(&padata->list, &squeue->serial.list);
277 spin_unlock(&squeue->serial.lock); 264 spin_unlock(&squeue->serial.lock);
278 265
279 queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work); 266 queue_work_on(cb_cpu, pinst->wq, &squeue->work);
280 } 267 }
281 268
282 spin_unlock_bh(&pd->lock); 269 spin_unlock_bh(&pd->lock);
@@ -400,7 +387,7 @@ static void padata_init_squeues(struct parallel_data *pd)
400/* Initialize all percpu queues used by parallel workers */ 387/* Initialize all percpu queues used by parallel workers */
401static void padata_init_pqueues(struct parallel_data *pd) 388static void padata_init_pqueues(struct parallel_data *pd)
402{ 389{
403 int cpu_index, num_cpus, cpu; 390 int cpu_index, cpu;
404 struct padata_parallel_queue *pqueue; 391 struct padata_parallel_queue *pqueue;
405 392
406 cpu_index = 0; 393 cpu_index = 0;
@@ -415,9 +402,6 @@ static void padata_init_pqueues(struct parallel_data *pd)
415 INIT_WORK(&pqueue->work, padata_parallel_worker); 402 INIT_WORK(&pqueue->work, padata_parallel_worker);
416 atomic_set(&pqueue->num_obj, 0); 403 atomic_set(&pqueue->num_obj, 0);
417 } 404 }
418
419 num_cpus = cpumask_weight(pd->cpumask.pcpu);
420 pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0;
421} 405}
422 406
 423/* Allocate and initialize the internal cpumask dependent resources. */ 407
@@ -444,7 +428,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
444 padata_init_pqueues(pd); 428 padata_init_pqueues(pd);
445 padata_init_squeues(pd); 429 padata_init_squeues(pd);
446 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); 430 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
447 atomic_set(&pd->seq_nr, -1); 431 pd->seq_nr = 0;
448 atomic_set(&pd->reorder_objects, 0); 432 atomic_set(&pd->reorder_objects, 0);
449 atomic_set(&pd->refcnt, 0); 433 atomic_set(&pd->refcnt, 0);
450 pd->pinst = pinst; 434 pd->pinst = pinst;
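A note on the padata hunks above: the wrap handling (max_seq_nr), the BUG_ON cross-check and the atomic counter are gone, and pd->seq_nr becomes a plain integer starting at 0. The producer side is not visible in this diff; the sketch below only illustrates the indexing idea, and the helper name plus the caller-side locking are assumptions rather than the exact upstream code.

    /*
     * Illustrative only: pick the parallel CPU for an object from a plain
     * sequence number.  The caller is assumed to serialize updates to
     * pd->seq_nr (e.g. under a spinlock), so no wrap fixup is done here.
     */
    static int padata_cpu_hash(struct parallel_data *pd, unsigned int seq_nr)
    {
            unsigned int num_cpus = cpumask_weight(pd->cpumask.pcpu);

            /* simple round-robin over the parallel cpumask */
            return padata_index_to_cpu(pd, seq_nr % num_cpus);
    }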
diff --git a/kernel/params.c b/kernel/params.c
index 4bc965d8a1fe..47f5bf12434a 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -15,7 +15,6 @@
15 along with this program; if not, write to the Free Software 15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/ 17*/
18#include <linux/module.h>
19#include <linux/kernel.h> 18#include <linux/kernel.h>
20#include <linux/string.h> 19#include <linux/string.h>
21#include <linux/errno.h> 20#include <linux/errno.h>
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a8968396046d..17b232869a04 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -168,13 +168,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
168 while (nr > 0) { 168 while (nr > 0) {
169 rcu_read_lock(); 169 rcu_read_lock();
170 170
171 /*
172 * Any nested-container's init processes won't ignore the
173 * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser().
174 */
175 task = pid_task(find_vpid(nr), PIDTYPE_PID); 171 task = pid_task(find_vpid(nr), PIDTYPE_PID);
176 if (task) 172 if (task && !__fatal_signal_pending(task))
177 send_sig_info(SIGKILL, SEND_SIG_NOINFO, task); 173 send_sig_info(SIGKILL, SEND_SIG_FORCED, task);
178 174
179 rcu_read_unlock(); 175 rcu_read_unlock();
180 176
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 07e0e28ffba7..66d808ec5252 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,7 +1,8 @@
1 1
2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG 2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
3 3
4obj-$(CONFIG_PM) += main.o qos.o 4obj-y += qos.o
5obj-$(CONFIG_PM) += main.o
5obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o 6obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o
6obj-$(CONFIG_FREEZER) += process.o 7obj-$(CONFIG_FREEZER) += process.o
7obj-$(CONFIG_SUSPEND) += suspend.o 8obj-$(CONFIG_SUSPEND) += suspend.o
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 6d6d28870335..0a186cfde788 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -245,8 +245,8 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop,
245 * create_image - Create a hibernation image. 245 * create_image - Create a hibernation image.
246 * @platform_mode: Whether or not to use the platform driver. 246 * @platform_mode: Whether or not to use the platform driver.
247 * 247 *
248 * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image 248 * Execute device drivers' "late" and "noirq" freeze callbacks, create a
249 * and execute the drivers' .thaw_noirq() callbacks. 249 * hibernation image and run the drivers' "noirq" and "early" thaw callbacks.
250 * 250 *
251 * Control reappears in this routine after the subsequent restore. 251 * Control reappears in this routine after the subsequent restore.
252 */ 252 */
@@ -254,7 +254,7 @@ static int create_image(int platform_mode)
254{ 254{
255 int error; 255 int error;
256 256
257 error = dpm_suspend_noirq(PMSG_FREEZE); 257 error = dpm_suspend_end(PMSG_FREEZE);
258 if (error) { 258 if (error) {
259 printk(KERN_ERR "PM: Some devices failed to power down, " 259 printk(KERN_ERR "PM: Some devices failed to power down, "
260 "aborting hibernation\n"); 260 "aborting hibernation\n");
@@ -306,7 +306,7 @@ static int create_image(int platform_mode)
306 Platform_finish: 306 Platform_finish:
307 platform_finish(platform_mode); 307 platform_finish(platform_mode);
308 308
309 dpm_resume_noirq(in_suspend ? 309 dpm_resume_start(in_suspend ?
310 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 310 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
311 311
312 return error; 312 return error;
@@ -343,13 +343,13 @@ int hibernation_snapshot(int platform_mode)
343 * successful freezer test. 343 * successful freezer test.
344 */ 344 */
345 freezer_test_done = true; 345 freezer_test_done = true;
346 goto Cleanup; 346 goto Thaw;
347 } 347 }
348 348
349 error = dpm_prepare(PMSG_FREEZE); 349 error = dpm_prepare(PMSG_FREEZE);
350 if (error) { 350 if (error) {
351 dpm_complete(PMSG_RECOVER); 351 dpm_complete(PMSG_RECOVER);
352 goto Cleanup; 352 goto Thaw;
353 } 353 }
354 354
355 suspend_console(); 355 suspend_console();
@@ -385,6 +385,8 @@ int hibernation_snapshot(int platform_mode)
385 platform_end(platform_mode); 385 platform_end(platform_mode);
386 return error; 386 return error;
387 387
388 Thaw:
389 thaw_kernel_threads();
388 Cleanup: 390 Cleanup:
389 swsusp_free(); 391 swsusp_free();
390 goto Close; 392 goto Close;
@@ -394,16 +396,16 @@ int hibernation_snapshot(int platform_mode)
394 * resume_target_kernel - Restore system state from a hibernation image. 396 * resume_target_kernel - Restore system state from a hibernation image.
395 * @platform_mode: Whether or not to use the platform driver. 397 * @platform_mode: Whether or not to use the platform driver.
396 * 398 *
397 * Execute device drivers' .freeze_noirq() callbacks, restore the contents of 399 * Execute device drivers' "noirq" and "late" freeze callbacks, restore the
398 * highmem that have not been restored yet from the image and run the low-level 400 * contents of highmem that have not been restored yet from the image and run
399 * code that will restore the remaining contents of memory and switch to the 401 * the low-level code that will restore the remaining contents of memory and
400 * just restored target kernel. 402 * switch to the just restored target kernel.
401 */ 403 */
402static int resume_target_kernel(bool platform_mode) 404static int resume_target_kernel(bool platform_mode)
403{ 405{
404 int error; 406 int error;
405 407
406 error = dpm_suspend_noirq(PMSG_QUIESCE); 408 error = dpm_suspend_end(PMSG_QUIESCE);
407 if (error) { 409 if (error) {
408 printk(KERN_ERR "PM: Some devices failed to power down, " 410 printk(KERN_ERR "PM: Some devices failed to power down, "
409 "aborting resume\n"); 411 "aborting resume\n");
@@ -460,7 +462,7 @@ static int resume_target_kernel(bool platform_mode)
460 Cleanup: 462 Cleanup:
461 platform_restore_cleanup(platform_mode); 463 platform_restore_cleanup(platform_mode);
462 464
463 dpm_resume_noirq(PMSG_RECOVER); 465 dpm_resume_start(PMSG_RECOVER);
464 466
465 return error; 467 return error;
466} 468}
@@ -518,7 +520,7 @@ int hibernation_platform_enter(void)
518 goto Resume_devices; 520 goto Resume_devices;
519 } 521 }
520 522
521 error = dpm_suspend_noirq(PMSG_HIBERNATE); 523 error = dpm_suspend_end(PMSG_HIBERNATE);
522 if (error) 524 if (error)
523 goto Resume_devices; 525 goto Resume_devices;
524 526
@@ -549,7 +551,7 @@ int hibernation_platform_enter(void)
549 Platform_finish: 551 Platform_finish:
550 hibernation_ops->finish(); 552 hibernation_ops->finish();
551 553
552 dpm_resume_noirq(PMSG_RESTORE); 554 dpm_resume_start(PMSG_RESTORE);
553 555
554 Resume_devices: 556 Resume_devices:
555 entering_platform_hibernation = false; 557 entering_platform_hibernation = false;
@@ -616,7 +618,7 @@ int hibernate(void)
616 /* Allocate memory management structures */ 618 /* Allocate memory management structures */
617 error = create_basic_memory_bitmaps(); 619 error = create_basic_memory_bitmaps();
618 if (error) 620 if (error)
619 goto Exit; 621 goto Enable_umh;
620 622
621 printk(KERN_INFO "PM: Syncing filesystems ... "); 623 printk(KERN_INFO "PM: Syncing filesystems ... ");
622 sys_sync(); 624 sys_sync();
@@ -624,15 +626,11 @@ int hibernate(void)
624 626
625 error = freeze_processes(); 627 error = freeze_processes();
626 if (error) 628 if (error)
627 goto Finish; 629 goto Free_bitmaps;
628 630
629 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); 631 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
630 if (error) 632 if (error || freezer_test_done)
631 goto Thaw;
632 if (freezer_test_done) {
633 freezer_test_done = false;
634 goto Thaw; 633 goto Thaw;
635 }
636 634
637 if (in_suspend) { 635 if (in_suspend) {
638 unsigned int flags = 0; 636 unsigned int flags = 0;
@@ -657,8 +655,13 @@ int hibernate(void)
657 655
658 Thaw: 656 Thaw:
659 thaw_processes(); 657 thaw_processes();
660 Finish: 658
659 /* Don't bother checking whether freezer_test_done is true */
660 freezer_test_done = false;
661
662 Free_bitmaps:
661 free_basic_memory_bitmaps(); 663 free_basic_memory_bitmaps();
664 Enable_umh:
662 usermodehelper_enable(); 665 usermodehelper_enable();
663 Exit: 666 Exit:
664 pm_notifier_call_chain(PM_POST_HIBERNATION); 667 pm_notifier_call_chain(PM_POST_HIBERNATION);
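On the dpm_suspend_end()/dpm_resume_start() calls introduced above: per the updated kernel-doc in this hunk they cover the "late" plus "noirq" device callback phases. A sketch of the assumed composition of the new suspend-side helper; the real implementation lives in the device PM core, outside this diff.

    /* Assumed composition; not part of this diff. */
    int dpm_suspend_end(pm_message_t state)
    {
            int error = dpm_suspend_late(state);

            if (error)
                    return error;

            return dpm_suspend_noirq(state);
    }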
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 9824b41e5a18..1c12581f1c62 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -165,16 +165,20 @@ static int suspend_stats_show(struct seq_file *s, void *unused)
165 last_errno %= REC_FAILED_NUM; 165 last_errno %= REC_FAILED_NUM;
166 last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; 166 last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1;
167 last_step %= REC_FAILED_NUM; 167 last_step %= REC_FAILED_NUM;
168 seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n" 168 seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n"
169 "%s: %d\n%s: %d\n%s: %d\n%s: %d\n", 169 "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n",
170 "success", suspend_stats.success, 170 "success", suspend_stats.success,
171 "fail", suspend_stats.fail, 171 "fail", suspend_stats.fail,
172 "failed_freeze", suspend_stats.failed_freeze, 172 "failed_freeze", suspend_stats.failed_freeze,
173 "failed_prepare", suspend_stats.failed_prepare, 173 "failed_prepare", suspend_stats.failed_prepare,
174 "failed_suspend", suspend_stats.failed_suspend, 174 "failed_suspend", suspend_stats.failed_suspend,
175 "failed_suspend_late",
176 suspend_stats.failed_suspend_late,
175 "failed_suspend_noirq", 177 "failed_suspend_noirq",
176 suspend_stats.failed_suspend_noirq, 178 suspend_stats.failed_suspend_noirq,
177 "failed_resume", suspend_stats.failed_resume, 179 "failed_resume", suspend_stats.failed_resume,
180 "failed_resume_early",
181 suspend_stats.failed_resume_early,
178 "failed_resume_noirq", 182 "failed_resume_noirq",
179 suspend_stats.failed_resume_noirq); 183 suspend_stats.failed_resume_noirq);
180 seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", 184 seq_printf(s, "failures:\n last_failed_dev:\t%-s\n",
@@ -287,16 +291,10 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
287 291
288#ifdef CONFIG_SUSPEND 292#ifdef CONFIG_SUSPEND
289 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { 293 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {
290 if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) 294 if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) {
295 error = pm_suspend(state);
291 break; 296 break;
292 } 297 }
293 if (state < PM_SUSPEND_MAX && *s) {
294 error = enter_state(state);
295 if (error) {
296 suspend_stats.fail++;
297 dpm_save_failed_errno(error);
298 } else
299 suspend_stats.success++;
300 } 298 }
301#endif 299#endif
302 300
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 21724eee5206..98f3622d7407 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -177,13 +177,11 @@ extern const char *const pm_states[];
177 177
178extern bool valid_state(suspend_state_t state); 178extern bool valid_state(suspend_state_t state);
179extern int suspend_devices_and_enter(suspend_state_t state); 179extern int suspend_devices_and_enter(suspend_state_t state);
180extern int enter_state(suspend_state_t state);
181#else /* !CONFIG_SUSPEND */ 180#else /* !CONFIG_SUSPEND */
182static inline int suspend_devices_and_enter(suspend_state_t state) 181static inline int suspend_devices_and_enter(suspend_state_t state)
183{ 182{
184 return -ENOSYS; 183 return -ENOSYS;
185} 184}
186static inline int enter_state(suspend_state_t state) { return -ENOSYS; }
187static inline bool valid_state(suspend_state_t state) { return false; } 185static inline bool valid_state(suspend_state_t state) { return false; }
188#endif /* !CONFIG_SUSPEND */ 186#endif /* !CONFIG_SUSPEND */
189 187
@@ -234,16 +232,14 @@ static inline int suspend_freeze_processes(void)
234 int error; 232 int error;
235 233
236 error = freeze_processes(); 234 error = freeze_processes();
237
238 /* 235 /*
239 * freeze_processes() automatically thaws every task if freezing 236 * freeze_processes() automatically thaws every task if freezing
240 * fails. So we need not do anything extra upon error. 237 * fails. So we need not do anything extra upon error.
241 */ 238 */
242 if (error) 239 if (error)
243 goto Finish; 240 return error;
244 241
245 error = freeze_kernel_threads(); 242 error = freeze_kernel_threads();
246
247 /* 243 /*
248 * freeze_kernel_threads() thaws only kernel threads upon freezing 244 * freeze_kernel_threads() thaws only kernel threads upon freezing
249 * failure. So we have to thaw the userspace tasks ourselves. 245 * failure. So we have to thaw the userspace tasks ourselves.
@@ -251,7 +247,6 @@ static inline int suspend_freeze_processes(void)
251 if (error) 247 if (error)
252 thaw_processes(); 248 thaw_processes();
253 249
254 Finish:
255 return error; 250 return error;
256} 251}
257 252
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 7e426459e60a..0d2aeb226108 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -53,11 +53,9 @@ static int try_to_freeze_tasks(bool user_only)
53 * It is "frozen enough". If the task does wake 53 * It is "frozen enough". If the task does wake
54 * up, it will immediately call try_to_freeze. 54 * up, it will immediately call try_to_freeze.
55 * 55 *
56 * Because freeze_task() goes through p's 56 * Because freeze_task() goes through p's scheduler lock, it's
57 * scheduler lock after setting TIF_FREEZE, it's 57 * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING
58 * guaranteed that either we see TASK_RUNNING or 58 * transition can't race with task state testing here.
59 * try_to_stop() after schedule() in ptrace/signal
60 * stop sees TIF_FREEZE.
61 */ 59 */
62 if (!task_is_stopped_or_traced(p) && 60 if (!task_is_stopped_or_traced(p) &&
63 !freezer_should_skip(p)) 61 !freezer_should_skip(p))
@@ -98,13 +96,15 @@ static int try_to_freeze_tasks(bool user_only)
98 elapsed_csecs / 100, elapsed_csecs % 100, 96 elapsed_csecs / 100, elapsed_csecs % 100,
99 todo - wq_busy, wq_busy); 97 todo - wq_busy, wq_busy);
100 98
101 read_lock(&tasklist_lock); 99 if (!wakeup) {
102 do_each_thread(g, p) { 100 read_lock(&tasklist_lock);
103 if (!wakeup && !freezer_should_skip(p) && 101 do_each_thread(g, p) {
104 p != current && freezing(p) && !frozen(p)) 102 if (p != current && !freezer_should_skip(p)
105 sched_show_task(p); 103 && freezing(p) && !frozen(p))
106 } while_each_thread(g, p); 104 sched_show_task(p);
107 read_unlock(&tasklist_lock); 105 } while_each_thread(g, p);
106 read_unlock(&tasklist_lock);
107 }
108 } else { 108 } else {
109 printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, 109 printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100,
110 elapsed_csecs % 100); 110 elapsed_csecs % 100);
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 995e3bd3417b..d6d6dbd1ecc0 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -469,21 +469,18 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
469static int __init pm_qos_power_init(void) 469static int __init pm_qos_power_init(void)
470{ 470{
471 int ret = 0; 471 int ret = 0;
472 int i;
472 473
473 ret = register_pm_qos_misc(&cpu_dma_pm_qos); 474 BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES);
474 if (ret < 0) { 475
475 printk(KERN_ERR "pm_qos_param: cpu_dma_latency setup failed\n"); 476 for (i = 1; i < PM_QOS_NUM_CLASSES; i++) {
476 return ret; 477 ret = register_pm_qos_misc(pm_qos_array[i]);
477 } 478 if (ret < 0) {
478 ret = register_pm_qos_misc(&network_lat_pm_qos); 479 printk(KERN_ERR "pm_qos_param: %s setup failed\n",
479 if (ret < 0) { 480 pm_qos_array[i]->name);
480 printk(KERN_ERR "pm_qos_param: network_latency setup failed\n"); 481 return ret;
481 return ret; 482 }
482 } 483 }
483 ret = register_pm_qos_misc(&network_throughput_pm_qos);
484 if (ret < 0)
485 printk(KERN_ERR
486 "pm_qos_param: network_throughput setup failed\n");
487 484
488 return ret; 485 return ret;
489} 486}
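The rewritten pm_qos_power_init() above registers every class from a table instead of three hand-written calls, and the loop starts at index 1. A sketch of the assumed shape of pm_qos_array: the three *_pm_qos objects appear in the removed lines, while null_pm_qos and the exact slot 0 entry are assumptions.

    static struct pm_qos_object *pm_qos_array[] = {
            &null_pm_qos,           /* slot 0 reserved, hence i = 1 above */
            &cpu_dma_pm_qos,
            &network_lat_pm_qos,
            &network_throughput_pm_qos,
    };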
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 6a768e537001..0de28576807d 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -711,9 +711,10 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
711 list_for_each_entry(region, &nosave_regions, list) { 711 list_for_each_entry(region, &nosave_regions, list) {
712 unsigned long pfn; 712 unsigned long pfn;
713 713
714 pr_debug("PM: Marking nosave pages: %016lx - %016lx\n", 714 pr_debug("PM: Marking nosave pages: [mem %#010llx-%#010llx]\n",
715 region->start_pfn << PAGE_SHIFT, 715 (unsigned long long) region->start_pfn << PAGE_SHIFT,
716 region->end_pfn << PAGE_SHIFT); 716 ((unsigned long long) region->end_pfn << PAGE_SHIFT)
717 - 1);
717 718
718 for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) 719 for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
719 if (pfn_valid(pfn)) { 720 if (pfn_valid(pfn)) {
@@ -1000,20 +1001,20 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
1000 s_page = pfn_to_page(src_pfn); 1001 s_page = pfn_to_page(src_pfn);
1001 d_page = pfn_to_page(dst_pfn); 1002 d_page = pfn_to_page(dst_pfn);
1002 if (PageHighMem(s_page)) { 1003 if (PageHighMem(s_page)) {
1003 src = kmap_atomic(s_page, KM_USER0); 1004 src = kmap_atomic(s_page);
1004 dst = kmap_atomic(d_page, KM_USER1); 1005 dst = kmap_atomic(d_page);
1005 do_copy_page(dst, src); 1006 do_copy_page(dst, src);
1006 kunmap_atomic(dst, KM_USER1); 1007 kunmap_atomic(dst);
1007 kunmap_atomic(src, KM_USER0); 1008 kunmap_atomic(src);
1008 } else { 1009 } else {
1009 if (PageHighMem(d_page)) { 1010 if (PageHighMem(d_page)) {
1010 /* Page pointed to by src may contain some kernel 1011 /* Page pointed to by src may contain some kernel
1011 * data modified by kmap_atomic() 1012 * data modified by kmap_atomic()
1012 */ 1013 */
1013 safe_copy_page(buffer, s_page); 1014 safe_copy_page(buffer, s_page);
1014 dst = kmap_atomic(d_page, KM_USER0); 1015 dst = kmap_atomic(d_page);
1015 copy_page(dst, buffer); 1016 copy_page(dst, buffer);
1016 kunmap_atomic(dst, KM_USER0); 1017 kunmap_atomic(dst);
1017 } else { 1018 } else {
1018 safe_copy_page(page_address(d_page), s_page); 1019 safe_copy_page(page_address(d_page), s_page);
1019 } 1020 }
@@ -1728,9 +1729,9 @@ int snapshot_read_next(struct snapshot_handle *handle)
1728 */ 1729 */
1729 void *kaddr; 1730 void *kaddr;
1730 1731
1731 kaddr = kmap_atomic(page, KM_USER0); 1732 kaddr = kmap_atomic(page);
1732 copy_page(buffer, kaddr); 1733 copy_page(buffer, kaddr);
1733 kunmap_atomic(kaddr, KM_USER0); 1734 kunmap_atomic(kaddr);
1734 handle->buffer = buffer; 1735 handle->buffer = buffer;
1735 } else { 1736 } else {
1736 handle->buffer = page_address(page); 1737 handle->buffer = page_address(page);
@@ -2014,9 +2015,9 @@ static void copy_last_highmem_page(void)
2014 if (last_highmem_page) { 2015 if (last_highmem_page) {
2015 void *dst; 2016 void *dst;
2016 2017
2017 dst = kmap_atomic(last_highmem_page, KM_USER0); 2018 dst = kmap_atomic(last_highmem_page);
2018 copy_page(dst, buffer); 2019 copy_page(dst, buffer);
2019 kunmap_atomic(dst, KM_USER0); 2020 kunmap_atomic(dst);
2020 last_highmem_page = NULL; 2021 last_highmem_page = NULL;
2021 } 2022 }
2022} 2023}
@@ -2309,13 +2310,13 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
2309{ 2310{
2310 void *kaddr1, *kaddr2; 2311 void *kaddr1, *kaddr2;
2311 2312
2312 kaddr1 = kmap_atomic(p1, KM_USER0); 2313 kaddr1 = kmap_atomic(p1);
2313 kaddr2 = kmap_atomic(p2, KM_USER1); 2314 kaddr2 = kmap_atomic(p2);
2314 copy_page(buf, kaddr1); 2315 copy_page(buf, kaddr1);
2315 copy_page(kaddr1, kaddr2); 2316 copy_page(kaddr1, kaddr2);
2316 copy_page(kaddr2, buf); 2317 copy_page(kaddr2, buf);
2317 kunmap_atomic(kaddr2, KM_USER1); 2318 kunmap_atomic(kaddr2);
2318 kunmap_atomic(kaddr1, KM_USER0); 2319 kunmap_atomic(kaddr1);
2319} 2320}
2320 2321
2321/** 2322/**
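The snapshot.c hunks above track the kmap_atomic() interface change: the KM_USER0/KM_USER1 slot argument is gone because atomic kmaps are now managed as a small per-CPU stack. A short sketch of the resulting calling convention, mirroring swap_two_pages_data():

    void *kaddr1 = kmap_atomic(p1);
    void *kaddr2 = kmap_atomic(p2);

    copy_page(buf, kaddr1);

    /* stack discipline: release the most recent mapping first */
    kunmap_atomic(kaddr2);
    kunmap_atomic(kaddr1);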
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4fd51beed879..88e5c967370d 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -37,8 +37,8 @@ const char *const pm_states[PM_SUSPEND_MAX] = {
37static const struct platform_suspend_ops *suspend_ops; 37static const struct platform_suspend_ops *suspend_ops;
38 38
39/** 39/**
40 * suspend_set_ops - Set the global suspend method table. 40 * suspend_set_ops - Set the global suspend method table.
41 * @ops: Pointer to ops structure. 41 * @ops: Suspend operations to use.
42 */ 42 */
43void suspend_set_ops(const struct platform_suspend_ops *ops) 43void suspend_set_ops(const struct platform_suspend_ops *ops)
44{ 44{
@@ -58,11 +58,11 @@ bool valid_state(suspend_state_t state)
58} 58}
59 59
60/** 60/**
61 * suspend_valid_only_mem - generic memory-only valid callback 61 * suspend_valid_only_mem - Generic memory-only valid callback.
62 * 62 *
63 * Platform drivers that implement mem suspend only and only need 63 * Platform drivers that implement mem suspend only and only need to check for
64 * to check for that in their .valid callback can use this instead 64 * that in their .valid() callback can use this instead of rolling their own
65 * of rolling their own .valid callback. 65 * .valid() callback.
66 */ 66 */
67int suspend_valid_only_mem(suspend_state_t state) 67int suspend_valid_only_mem(suspend_state_t state)
68{ 68{
@@ -83,10 +83,11 @@ static int suspend_test(int level)
83} 83}
84 84
85/** 85/**
86 * suspend_prepare - Do prep work before entering low-power state. 86 * suspend_prepare - Prepare for entering system sleep state.
87 * 87 *
88 * This is common code that is called for each state that we're entering. 88 * Common code run for every system sleep state that can be entered (except for
89 * Run suspend notifiers, allocate a console and stop all processes. 89 * hibernation). Run suspend notifiers, allocate the "suspend" console and
90 * freeze processes.
90 */ 91 */
91static int suspend_prepare(void) 92static int suspend_prepare(void)
92{ 93{
@@ -131,9 +132,9 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
131} 132}
132 133
133/** 134/**
134 * suspend_enter - enter the desired system sleep state. 135 * suspend_enter - Make the system enter the given sleep state.
135 * @state: State to enter 136 * @state: System sleep state to enter.
136 * @wakeup: Returns information that suspend should not be entered again. 137 * @wakeup: Returns information that the sleep state should not be re-entered.
137 * 138 *
138 * This function should be called after devices have been suspended. 139 * This function should be called after devices have been suspended.
139 */ 140 */
@@ -147,7 +148,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
147 goto Platform_finish; 148 goto Platform_finish;
148 } 149 }
149 150
150 error = dpm_suspend_noirq(PMSG_SUSPEND); 151 error = dpm_suspend_end(PMSG_SUSPEND);
151 if (error) { 152 if (error) {
152 printk(KERN_ERR "PM: Some devices failed to power down\n"); 153 printk(KERN_ERR "PM: Some devices failed to power down\n");
153 goto Platform_finish; 154 goto Platform_finish;
@@ -189,7 +190,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
189 if (suspend_ops->wake) 190 if (suspend_ops->wake)
190 suspend_ops->wake(); 191 suspend_ops->wake();
191 192
192 dpm_resume_noirq(PMSG_RESUME); 193 dpm_resume_start(PMSG_RESUME);
193 194
194 Platform_finish: 195 Platform_finish:
195 if (suspend_ops->finish) 196 if (suspend_ops->finish)
@@ -199,9 +200,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
199} 200}
200 201
201/** 202/**
202 * suspend_devices_and_enter - suspend devices and enter the desired system 203 * suspend_devices_and_enter - Suspend devices and enter system sleep state.
203 * sleep state. 204 * @state: System sleep state to enter.
204 * @state: state to enter
205 */ 205 */
206int suspend_devices_and_enter(suspend_state_t state) 206int suspend_devices_and_enter(suspend_state_t state)
207{ 207{
@@ -251,10 +251,10 @@ int suspend_devices_and_enter(suspend_state_t state)
251} 251}
252 252
253/** 253/**
254 * suspend_finish - Do final work before exiting suspend sequence. 254 * suspend_finish - Clean up before finishing the suspend sequence.
255 * 255 *
256 * Call platform code to clean up, restart processes, and free the 256 * Call platform code to clean up, restart processes, and free the console that
257 * console that we've allocated. This is not called for suspend-to-disk. 257 * we've allocated. This routine is not called for hibernation.
258 */ 258 */
259static void suspend_finish(void) 259static void suspend_finish(void)
260{ 260{
@@ -265,16 +265,14 @@ static void suspend_finish(void)
265} 265}
266 266
267/** 267/**
268 * enter_state - Do common work of entering low-power state. 268 * enter_state - Do common work needed to enter system sleep state.
269 * @state: pm_state structure for state we're entering. 269 * @state: System sleep state to enter.
270 * 270 *
271 * Make sure we're the only ones trying to enter a sleep state. Fail 271 * Make sure that no one else is trying to put the system into a sleep state.
272 * if someone has beat us to it, since we don't want anything weird to 272 * Fail if that's not the case. Otherwise, prepare for system suspend, make the
273 * happen when we wake up. 273 * system enter the given sleep state and clean up after wakeup.
274 * Then, do the setup for suspend, enter the state, and cleaup (after
275 * we've woken up).
276 */ 274 */
277int enter_state(suspend_state_t state) 275static int enter_state(suspend_state_t state)
278{ 276{
279 int error; 277 int error;
280 278
@@ -310,24 +308,26 @@ int enter_state(suspend_state_t state)
310} 308}
311 309
312/** 310/**
313 * pm_suspend - Externally visible function for suspending system. 311 * pm_suspend - Externally visible function for suspending the system.
314 * @state: Enumerated value of state to enter. 312 * @state: System sleep state to enter.
315 * 313 *
316 * Determine whether or not value is within range, get state 314 * Check if the value of @state represents one of the supported states,
317 * structure, and enter (above). 315 * execute enter_state() and update system suspend statistics.
318 */ 316 */
319int pm_suspend(suspend_state_t state) 317int pm_suspend(suspend_state_t state)
320{ 318{
321 int ret; 319 int error;
322 if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) { 320
323 ret = enter_state(state); 321 if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
324 if (ret) { 322 return -EINVAL;
325 suspend_stats.fail++; 323
326 dpm_save_failed_errno(ret); 324 error = enter_state(state);
327 } else 325 if (error) {
328 suspend_stats.success++; 326 suspend_stats.fail++;
329 return ret; 327 dpm_save_failed_errno(error);
328 } else {
329 suspend_stats.success++;
330 } 330 }
331 return -EINVAL; 331 return error;
332} 332}
333EXPORT_SYMBOL(pm_suspend); 333EXPORT_SYMBOL(pm_suspend);
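With the rework above, pm_suspend() itself rejects out-of-range states and maintains suspend_stats, while enter_state() becomes static. A minimal caller sketch; the error handling shown is illustrative only.

    int error = pm_suspend(PM_SUSPEND_MEM);

    if (error)
            pr_err("PM: suspend to RAM failed: %d\n", error);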
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 3e100075b13c..33c4329205af 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -249,16 +249,10 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
249 } 249 }
250 pm_restore_gfp_mask(); 250 pm_restore_gfp_mask();
251 error = hibernation_snapshot(data->platform_support); 251 error = hibernation_snapshot(data->platform_support);
252 if (error) { 252 if (!error) {
253 thaw_kernel_threads();
254 } else {
255 error = put_user(in_suspend, (int __user *)arg); 253 error = put_user(in_suspend, (int __user *)arg);
256 if (!error && !freezer_test_done) 254 data->ready = !freezer_test_done && !error;
257 data->ready = 1; 255 freezer_test_done = false;
258 if (freezer_test_done) {
259 freezer_test_done = false;
260 thaw_kernel_threads();
261 }
262 } 256 }
263 break; 257 break;
264 258
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 00ab2ca5ed11..ee8d49b9c309 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -231,26 +231,22 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
231} 231}
232 232
233static int ptrace_attach(struct task_struct *task, long request, 233static int ptrace_attach(struct task_struct *task, long request,
234 unsigned long addr,
234 unsigned long flags) 235 unsigned long flags)
235{ 236{
236 bool seize = (request == PTRACE_SEIZE); 237 bool seize = (request == PTRACE_SEIZE);
237 int retval; 238 int retval;
238 239
239 /*
240 * SEIZE will enable new ptrace behaviors which will be implemented
241 * gradually. SEIZE_DEVEL is used to prevent applications
242 * expecting full SEIZE behaviors trapping on kernel commits which
243 * are still in the process of implementing them.
244 *
245 * Only test programs for new ptrace behaviors being implemented
246 * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO.
247 *
248 * Once SEIZE behaviors are completely implemented, this flag and
249 * the following test will be removed.
250 */
251 retval = -EIO; 240 retval = -EIO;
252 if (seize && !(flags & PTRACE_SEIZE_DEVEL)) 241 if (seize) {
253 goto out; 242 if (addr != 0)
243 goto out;
244 if (flags & ~(unsigned long)PTRACE_O_MASK)
245 goto out;
246 flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT);
247 } else {
248 flags = PT_PTRACED;
249 }
254 250
255 audit_ptrace(task); 251 audit_ptrace(task);
256 252
@@ -262,7 +258,7 @@ static int ptrace_attach(struct task_struct *task, long request,
262 258
263 /* 259 /*
264 * Protect exec's credential calculations against our interference; 260 * Protect exec's credential calculations against our interference;
265 * interference; SUID, SGID and LSM creds get determined differently 261 * SUID, SGID and LSM creds get determined differently
266 * under ptrace. 262 * under ptrace.
267 */ 263 */
268 retval = -ERESTARTNOINTR; 264 retval = -ERESTARTNOINTR;
@@ -282,11 +278,11 @@ static int ptrace_attach(struct task_struct *task, long request,
282 if (task->ptrace) 278 if (task->ptrace)
283 goto unlock_tasklist; 279 goto unlock_tasklist;
284 280
285 task->ptrace = PT_PTRACED;
286 if (seize) 281 if (seize)
287 task->ptrace |= PT_SEIZED; 282 flags |= PT_SEIZED;
288 if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) 283 if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE))
289 task->ptrace |= PT_PTRACE_CAP; 284 flags |= PT_PTRACE_CAP;
285 task->ptrace = flags;
290 286
291 __ptrace_link(task, current); 287 __ptrace_link(task, current);
292 288
@@ -528,30 +524,18 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
528 524
529static int ptrace_setoptions(struct task_struct *child, unsigned long data) 525static int ptrace_setoptions(struct task_struct *child, unsigned long data)
530{ 526{
531 child->ptrace &= ~PT_TRACE_MASK; 527 unsigned flags;
532 528
533 if (data & PTRACE_O_TRACESYSGOOD) 529 if (data & ~(unsigned long)PTRACE_O_MASK)
534 child->ptrace |= PT_TRACESYSGOOD; 530 return -EINVAL;
535
536 if (data & PTRACE_O_TRACEFORK)
537 child->ptrace |= PT_TRACE_FORK;
538
539 if (data & PTRACE_O_TRACEVFORK)
540 child->ptrace |= PT_TRACE_VFORK;
541
542 if (data & PTRACE_O_TRACECLONE)
543 child->ptrace |= PT_TRACE_CLONE;
544
545 if (data & PTRACE_O_TRACEEXEC)
546 child->ptrace |= PT_TRACE_EXEC;
547
548 if (data & PTRACE_O_TRACEVFORKDONE)
549 child->ptrace |= PT_TRACE_VFORK_DONE;
550 531
551 if (data & PTRACE_O_TRACEEXIT) 532 /* Avoid intermediate state when all opts are cleared */
552 child->ptrace |= PT_TRACE_EXIT; 533 flags = child->ptrace;
534 flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT);
535 flags |= (data << PT_OPT_FLAG_SHIFT);
536 child->ptrace = flags;
553 537
554 return (data & ~PTRACE_O_MASK) ? -EINVAL : 0; 538 return 0;
555} 539}
556 540
557static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) 541static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info)
@@ -891,7 +875,7 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
891 } 875 }
892 876
893 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { 877 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
894 ret = ptrace_attach(child, request, data); 878 ret = ptrace_attach(child, request, addr, data);
895 /* 879 /*
896 * Some architectures need to do book-keeping after 880 * Some architectures need to do book-keeping after
897 * a ptrace attach. 881 * a ptrace attach.
@@ -1034,7 +1018,7 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
1034 } 1018 }
1035 1019
1036 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { 1020 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
1037 ret = ptrace_attach(child, request, data); 1021 ret = ptrace_attach(child, request, addr, data);
1038 /* 1022 /*
1039 * Some architectures need to do book-keeping after 1023 * Some architectures need to do book-keeping after
1040 * a ptrace attach. 1024 * a ptrace attach.
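Both ptrace hunks above rely on the same encoding: each PTRACE_O_* option bit is stored in task->ptrace shifted up by PT_OPT_FLAG_SHIFT, next to the low PT_PTRACED/PT_SEIZED state bits. That is what lets PTRACE_SEIZE accept options in its data argument and lets ptrace_setoptions() replace all options in a single assignment. A hypothetical helper illustrating the mapping, not taken from this diff:

    /* Hypothetical: test whether a PTRACE_O_* option is enabled. */
    static inline bool ptrace_option_set(struct task_struct *child,
                                         unsigned long opt)
    {
            return child->ptrace & (opt << PT_OPT_FLAG_SHIFT);
    }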
diff --git a/kernel/resource.c b/kernel/resource.c
index 7640b3a947d0..7e8ea66a8c01 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -749,6 +749,7 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
749 write_unlock(&resource_lock); 749 write_unlock(&resource_lock);
750 return result; 750 return result;
751} 751}
752EXPORT_SYMBOL(adjust_resource);
752 753
753static void __init __reserve_region_with_split(struct resource *root, 754static void __init __reserve_region_with_split(struct resource *root,
754 resource_size_t start, resource_size_t end, 755 resource_size_t start, resource_size_t end,
@@ -792,8 +793,6 @@ void __init reserve_region_with_split(struct resource *root,
792 write_unlock(&resource_lock); 793 write_unlock(&resource_lock);
793} 794}
794 795
795EXPORT_SYMBOL(adjust_resource);
796
797/** 796/**
798 * resource_alignment - calculate resource's alignment 797 * resource_alignment - calculate resource's alignment
799 * @res: resource pointer 798 * @res: resource pointer
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d2bd4647586c..503d6426126d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -71,6 +71,7 @@
71#include <linux/ftrace.h> 71#include <linux/ftrace.h>
72#include <linux/slab.h> 72#include <linux/slab.h>
73#include <linux/init_task.h> 73#include <linux/init_task.h>
74#include <linux/binfmts.h>
74 75
75#include <asm/tlb.h> 76#include <asm/tlb.h>
76#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
@@ -7571,8 +7572,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7571 struct task_group, css); 7572 struct task_group, css);
7572} 7573}
7573 7574
7574static struct cgroup_subsys_state * 7575static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)
7575cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
7576{ 7576{
7577 struct task_group *tg, *parent; 7577 struct task_group *tg, *parent;
7578 7578
@@ -7589,15 +7589,14 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
7589 return &tg->css; 7589 return &tg->css;
7590} 7590}
7591 7591
7592static void 7592static void cpu_cgroup_destroy(struct cgroup *cgrp)
7593cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
7594{ 7593{
7595 struct task_group *tg = cgroup_tg(cgrp); 7594 struct task_group *tg = cgroup_tg(cgrp);
7596 7595
7597 sched_destroy_group(tg); 7596 sched_destroy_group(tg);
7598} 7597}
7599 7598
7600static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 7599static int cpu_cgroup_can_attach(struct cgroup *cgrp,
7601 struct cgroup_taskset *tset) 7600 struct cgroup_taskset *tset)
7602{ 7601{
7603 struct task_struct *task; 7602 struct task_struct *task;
@@ -7615,7 +7614,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7615 return 0; 7614 return 0;
7616} 7615}
7617 7616
7618static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 7617static void cpu_cgroup_attach(struct cgroup *cgrp,
7619 struct cgroup_taskset *tset) 7618 struct cgroup_taskset *tset)
7620{ 7619{
7621 struct task_struct *task; 7620 struct task_struct *task;
@@ -7625,8 +7624,8 @@ static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7625} 7624}
7626 7625
7627static void 7626static void
7628cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, 7627cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7629 struct cgroup *old_cgrp, struct task_struct *task) 7628 struct task_struct *task)
7630{ 7629{
7631 /* 7630 /*
7632 * cgroup_exit() is called in the copy_process() failure path. 7631 * cgroup_exit() is called in the copy_process() failure path.
@@ -7976,8 +7975,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7976 */ 7975 */
7977 7976
7978/* create a new cpu accounting group */ 7977/* create a new cpu accounting group */
7979static struct cgroup_subsys_state *cpuacct_create( 7978static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp)
7980 struct cgroup_subsys *ss, struct cgroup *cgrp)
7981{ 7979{
7982 struct cpuacct *ca; 7980 struct cpuacct *ca;
7983 7981
@@ -8007,8 +8005,7 @@ out:
8007} 8005}
8008 8006
8009/* destroy an existing cpu accounting group */ 8007/* destroy an existing cpu accounting group */
8010static void 8008static void cpuacct_destroy(struct cgroup *cgrp)
8011cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
8012{ 8009{
8013 struct cpuacct *ca = cgroup_ca(cgrp); 8010 struct cpuacct *ca = cgroup_ca(cgrp);
8014 8011
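All of the sched/core.c hunks above are the same mechanical change: the cgroup subsystem callbacks lose their struct cgroup_subsys * argument. The resulting prototype shapes, collected from the hunks for reference:

    struct cgroup_subsys_state *(*create)(struct cgroup *cgrp);
    void (*destroy)(struct cgroup *cgrp);
    int  (*can_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset);
    void (*attach)(struct cgroup *cgrp, struct cgroup_taskset *tset);
    void (*exit)(struct cgroup *cgrp, struct cgroup *old_cgrp,
                 struct task_struct *task);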
diff --git a/kernel/signal.c b/kernel/signal.c
index 8511e39813c7..d523da02dd14 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -58,21 +58,20 @@ static int sig_handler_ignored(void __user *handler, int sig)
58 (handler == SIG_DFL && sig_kernel_ignore(sig)); 58 (handler == SIG_DFL && sig_kernel_ignore(sig));
59} 59}
60 60
61static int sig_task_ignored(struct task_struct *t, int sig, 61static int sig_task_ignored(struct task_struct *t, int sig, bool force)
62 int from_ancestor_ns)
63{ 62{
64 void __user *handler; 63 void __user *handler;
65 64
66 handler = sig_handler(t, sig); 65 handler = sig_handler(t, sig);
67 66
68 if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && 67 if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
69 handler == SIG_DFL && !from_ancestor_ns) 68 handler == SIG_DFL && !force)
70 return 1; 69 return 1;
71 70
72 return sig_handler_ignored(handler, sig); 71 return sig_handler_ignored(handler, sig);
73} 72}
74 73
75static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns) 74static int sig_ignored(struct task_struct *t, int sig, bool force)
76{ 75{
77 /* 76 /*
78 * Blocked signals are never ignored, since the 77 * Blocked signals are never ignored, since the
@@ -82,7 +81,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
82 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) 81 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
83 return 0; 82 return 0;
84 83
85 if (!sig_task_ignored(t, sig, from_ancestor_ns)) 84 if (!sig_task_ignored(t, sig, force))
86 return 0; 85 return 0;
87 86
88 /* 87 /*
@@ -855,7 +854,7 @@ static void ptrace_trap_notify(struct task_struct *t)
855 * Returns true if the signal should be actually delivered, otherwise 854 * Returns true if the signal should be actually delivered, otherwise
856 * it should be dropped. 855 * it should be dropped.
857 */ 856 */
858static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) 857static int prepare_signal(int sig, struct task_struct *p, bool force)
859{ 858{
860 struct signal_struct *signal = p->signal; 859 struct signal_struct *signal = p->signal;
861 struct task_struct *t; 860 struct task_struct *t;
@@ -915,7 +914,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
915 } 914 }
916 } 915 }
917 916
918 return !sig_ignored(p, sig, from_ancestor_ns); 917 return !sig_ignored(p, sig, force);
919} 918}
920 919
921/* 920/*
@@ -1059,7 +1058,8 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1059 assert_spin_locked(&t->sighand->siglock); 1058 assert_spin_locked(&t->sighand->siglock);
1060 1059
1061 result = TRACE_SIGNAL_IGNORED; 1060 result = TRACE_SIGNAL_IGNORED;
1062 if (!prepare_signal(sig, t, from_ancestor_ns)) 1061 if (!prepare_signal(sig, t,
1062 from_ancestor_ns || (info == SEND_SIG_FORCED)))
1063 goto ret; 1063 goto ret;
1064 1064
1065 pending = group ? &t->signal->shared_pending : &t->pending; 1065 pending = group ? &t->signal->shared_pending : &t->pending;
@@ -1601,7 +1601,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1601 1601
1602 ret = 1; /* the signal is ignored */ 1602 ret = 1; /* the signal is ignored */
1603 result = TRACE_SIGNAL_IGNORED; 1603 result = TRACE_SIGNAL_IGNORED;
1604 if (!prepare_signal(sig, t, 0)) 1604 if (!prepare_signal(sig, t, false))
1605 goto out; 1605 goto out;
1606 1606
1607 ret = 0; 1607 ret = 0;
@@ -1652,6 +1652,15 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1652 BUG_ON(!tsk->ptrace && 1652 BUG_ON(!tsk->ptrace &&
1653 (tsk->group_leader != tsk || !thread_group_empty(tsk))); 1653 (tsk->group_leader != tsk || !thread_group_empty(tsk)));
1654 1654
1655 if (sig != SIGCHLD) {
1656 /*
1657 * This is only possible if parent == real_parent.
1658 * Check if it has changed security domain.
1659 */
1660 if (tsk->parent_exec_id != tsk->parent->self_exec_id)
1661 sig = SIGCHLD;
1662 }
1663
1655 info.si_signo = sig; 1664 info.si_signo = sig;
1656 info.si_errno = 0; 1665 info.si_errno = 0;
1657 /* 1666 /*
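The signal.c hunks convert from_ancestor_ns into a force flag and treat SEND_SIG_FORCED as forcing delivery, which is what the zap_pid_ns_processes() change earlier in this diff depends on: a namespace init marked SIGNAL_UNKILLABLE still receives the SIGKILL. Caller side, as in that hunk:

    /* force delivery even to an "unkillable" namespace init */
    if (task && !__fatal_signal_pending(task))
            send_sig_info(SIGKILL, SEND_SIG_FORCED, task);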
diff --git a/kernel/sys.c b/kernel/sys.c
index 888d227fd195..9eb7fcab8df6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1962,6 +1962,14 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1962 case PR_SET_MM: 1962 case PR_SET_MM:
1963 error = prctl_set_mm(arg2, arg3, arg4, arg5); 1963 error = prctl_set_mm(arg2, arg3, arg4, arg5);
1964 break; 1964 break;
1965 case PR_SET_CHILD_SUBREAPER:
1966 me->signal->is_child_subreaper = !!arg2;
1967 error = 0;
1968 break;
1969 case PR_GET_CHILD_SUBREAPER:
1970 error = put_user(me->signal->is_child_subreaper,
1971 (int __user *) arg2);
1972 break;
1965 default: 1973 default:
1966 error = -EINVAL; 1974 error = -EINVAL;
1967 break; 1975 break;
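The new PR_SET_CHILD_SUBREAPER / PR_GET_CHILD_SUBREAPER prctl pair above lets a process such as a session or service manager mark itself as a child subreaper, so that orphaned descendants are reparented to it rather than to init. A userspace usage sketch; it assumes the two constants are already exposed through <sys/prctl.h> (otherwise they come from <linux/prctl.h>):

    #include <stdio.h>
    #include <sys/prctl.h>

    int main(void)
    {
            int is_subreaper = 0;

            if (prctl(PR_SET_CHILD_SUBREAPER, 1UL, 0UL, 0UL, 0UL) != 0)
                    perror("PR_SET_CHILD_SUBREAPER");

            if (prctl(PR_GET_CHILD_SUBREAPER,
                      (unsigned long)&is_subreaper, 0UL, 0UL, 0UL) == 0)
                    printf("child subreaper: %d\n", is_subreaper);

            return 0;
    }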
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f487f257e05e..d48ff4fd44c3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -58,6 +58,7 @@
58#include <linux/oom.h> 58#include <linux/oom.h>
59#include <linux/kmod.h> 59#include <linux/kmod.h>
60#include <linux/capability.h> 60#include <linux/capability.h>
61#include <linux/binfmts.h>
61 62
62#include <asm/uaccess.h> 63#include <asm/uaccess.h>
63#include <asm/processor.h> 64#include <asm/processor.h>
@@ -192,20 +193,6 @@ static int sysrq_sysctl_handler(ctl_table *table, int write,
192 193
193#endif 194#endif
194 195
195static struct ctl_table root_table[];
196static struct ctl_table_root sysctl_table_root;
197static struct ctl_table_header root_table_header = {
198 {{.count = 1,
199 .ctl_table = root_table,
200 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}},
201 .root = &sysctl_table_root,
202 .set = &sysctl_table_root.default_set,
203};
204static struct ctl_table_root sysctl_table_root = {
205 .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list),
206 .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry),
207};
208
209static struct ctl_table kern_table[]; 196static struct ctl_table kern_table[];
210static struct ctl_table vm_table[]; 197static struct ctl_table vm_table[];
211static struct ctl_table fs_table[]; 198static struct ctl_table fs_table[];
@@ -222,7 +209,7 @@ int sysctl_legacy_va_layout;
222 209
223/* The default sysctl tables: */ 210/* The default sysctl tables: */
224 211
225static struct ctl_table root_table[] = { 212static struct ctl_table sysctl_base_table[] = {
226 { 213 {
227 .procname = "kernel", 214 .procname = "kernel",
228 .mode = 0555, 215 .mode = 0555,
@@ -1559,490 +1546,12 @@ static struct ctl_table dev_table[] = {
1559 { } 1546 { }
1560}; 1547};
1561 1548
1562static DEFINE_SPINLOCK(sysctl_lock); 1549int __init sysctl_init(void)
1563
1564/* called under sysctl_lock */
1565static int use_table(struct ctl_table_header *p)
1566{
1567 if (unlikely(p->unregistering))
1568 return 0;
1569 p->used++;
1570 return 1;
1571}
1572
1573/* called under sysctl_lock */
1574static void unuse_table(struct ctl_table_header *p)
1575{
1576 if (!--p->used)
1577 if (unlikely(p->unregistering))
1578 complete(p->unregistering);
1579}
1580
1581/* called under sysctl_lock, will reacquire if has to wait */
1582static void start_unregistering(struct ctl_table_header *p)
1583{
1584 /*
1585 * if p->used is 0, nobody will ever touch that entry again;
1586 * we'll eliminate all paths to it before dropping sysctl_lock
1587 */
1588 if (unlikely(p->used)) {
1589 struct completion wait;
1590 init_completion(&wait);
1591 p->unregistering = &wait;
1592 spin_unlock(&sysctl_lock);
1593 wait_for_completion(&wait);
1594 spin_lock(&sysctl_lock);
1595 } else {
1596 /* anything non-NULL; we'll never dereference it */
1597 p->unregistering = ERR_PTR(-EINVAL);
1598 }
1599 /*
1600 * do not remove from the list until nobody holds it; walking the
1601 * list in do_sysctl() relies on that.
1602 */
1603 list_del_init(&p->ctl_entry);
1604}
1605
1606void sysctl_head_get(struct ctl_table_header *head)
1607{
1608 spin_lock(&sysctl_lock);
1609 head->count++;
1610 spin_unlock(&sysctl_lock);
1611}
1612
1613void sysctl_head_put(struct ctl_table_header *head)
1614{
1615 spin_lock(&sysctl_lock);
1616 if (!--head->count)
1617 kfree_rcu(head, rcu);
1618 spin_unlock(&sysctl_lock);
1619}
1620
1621struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
1622{
1623 if (!head)
1624 BUG();
1625 spin_lock(&sysctl_lock);
1626 if (!use_table(head))
1627 head = ERR_PTR(-ENOENT);
1628 spin_unlock(&sysctl_lock);
1629 return head;
1630}
1631
1632void sysctl_head_finish(struct ctl_table_header *head)
1633{
1634 if (!head)
1635 return;
1636 spin_lock(&sysctl_lock);
1637 unuse_table(head);
1638 spin_unlock(&sysctl_lock);
1639}
1640
1641static struct ctl_table_set *
1642lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
1643{
1644 struct ctl_table_set *set = &root->default_set;
1645 if (root->lookup)
1646 set = root->lookup(root, namespaces);
1647 return set;
1648}
1649
1650static struct list_head *
1651lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces)
1652{
1653 struct ctl_table_set *set = lookup_header_set(root, namespaces);
1654 return &set->list;
1655}
1656
1657struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces,
1658 struct ctl_table_header *prev)
1659{
1660 struct ctl_table_root *root;
1661 struct list_head *header_list;
1662 struct ctl_table_header *head;
1663 struct list_head *tmp;
1664
1665 spin_lock(&sysctl_lock);
1666 if (prev) {
1667 head = prev;
1668 tmp = &prev->ctl_entry;
1669 unuse_table(prev);
1670 goto next;
1671 }
1672 tmp = &root_table_header.ctl_entry;
1673 for (;;) {
1674 head = list_entry(tmp, struct ctl_table_header, ctl_entry);
1675
1676 if (!use_table(head))
1677 goto next;
1678 spin_unlock(&sysctl_lock);
1679 return head;
1680 next:
1681 root = head->root;
1682 tmp = tmp->next;
1683 header_list = lookup_header_list(root, namespaces);
1684 if (tmp != header_list)
1685 continue;
1686
1687 do {
1688 root = list_entry(root->root_list.next,
1689 struct ctl_table_root, root_list);
1690 if (root == &sysctl_table_root)
1691 goto out;
1692 header_list = lookup_header_list(root, namespaces);
1693 } while (list_empty(header_list));
1694 tmp = header_list->next;
1695 }
1696out:
1697 spin_unlock(&sysctl_lock);
1698 return NULL;
1699}
1700
1701struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev)
1702{
1703 return __sysctl_head_next(current->nsproxy, prev);
1704}
1705
1706void register_sysctl_root(struct ctl_table_root *root)
1707{
1708 spin_lock(&sysctl_lock);
1709 list_add_tail(&root->root_list, &sysctl_table_root.root_list);
1710 spin_unlock(&sysctl_lock);
1711}
1712
1713/*
1714 * sysctl_perm does NOT grant the superuser all rights automatically, because
1715 * some sysctl variables are readonly even to root.
1716 */
1717
1718static int test_perm(int mode, int op)
1719{
1720 if (!current_euid())
1721 mode >>= 6;
1722 else if (in_egroup_p(0))
1723 mode >>= 3;
1724 if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0)
1725 return 0;
1726 return -EACCES;
1727}
1728
1729int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
1730{
1731 int mode;
1732
1733 if (root->permissions)
1734 mode = root->permissions(root, current->nsproxy, table);
1735 else
1736 mode = table->mode;
1737
1738 return test_perm(mode, op);
1739}
1740
1741static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
1742{
1743 for (; table->procname; table++) {
1744 table->parent = parent;
1745 if (table->child)
1746 sysctl_set_parent(table, table->child);
1747 }
1748}
1749
1750static __init int sysctl_init(void)
1751{ 1550{
1752 sysctl_set_parent(NULL, root_table); 1551 register_sysctl_table(sysctl_base_table);
1753#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1754 sysctl_check_table(current->nsproxy, root_table);
1755#endif
1756 return 0; 1552 return 0;
1757} 1553}
1758 1554
1759core_initcall(sysctl_init);
1760
1761static struct ctl_table *is_branch_in(struct ctl_table *branch,
1762 struct ctl_table *table)
1763{
1764 struct ctl_table *p;
1765 const char *s = branch->procname;
1766
1767 /* branch should have named subdirectory as its first element */
1768 if (!s || !branch->child)
1769 return NULL;
1770
1771 /* ... and nothing else */
1772 if (branch[1].procname)
1773 return NULL;
1774
1775 /* table should contain subdirectory with the same name */
1776 for (p = table; p->procname; p++) {
1777 if (!p->child)
1778 continue;
1779 if (p->procname && strcmp(p->procname, s) == 0)
1780 return p;
1781 }
1782 return NULL;
1783}
1784
1785/* see if attaching q to p would be an improvement */
1786static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
1787{
1788 struct ctl_table *to = p->ctl_table, *by = q->ctl_table;
1789 struct ctl_table *next;
1790 int is_better = 0;
1791 int not_in_parent = !p->attached_by;
1792
1793 while ((next = is_branch_in(by, to)) != NULL) {
1794 if (by == q->attached_by)
1795 is_better = 1;
1796 if (to == p->attached_by)
1797 not_in_parent = 1;
1798 by = by->child;
1799 to = next->child;
1800 }
1801
1802 if (is_better && not_in_parent) {
1803 q->attached_by = by;
1804 q->attached_to = to;
1805 q->parent = p;
1806 }
1807}
1808
1809/**
1810 * __register_sysctl_paths - register a sysctl hierarchy
1811 * @root: List of sysctl headers to register on
1812 * @namespaces: Data to compute which lists of sysctl entries are visible
1813 * @path: The path to the directory the sysctl table is in.
1814 * @table: the top-level table structure
1815 *
1816 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1817 * array. A completely 0 filled entry terminates the table.
1818 *
1819 * The members of the &struct ctl_table structure are used as follows:
1820 *
1821 * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
1822 * enter a sysctl file
1823 *
1824 * data - a pointer to data for use by proc_handler
1825 *
1826 * maxlen - the maximum size in bytes of the data
1827 *
1828 * mode - the file permissions for the /proc/sys file, and for sysctl(2)
1829 *
1830 * child - a pointer to the child sysctl table if this entry is a directory, or
1831 * %NULL.
1832 *
1833 * proc_handler - the text handler routine (described below)
1834 *
1835 * de - for internal use by the sysctl routines
1836 *
1837 * extra1, extra2 - extra pointers usable by the proc handler routines
1838 *
1839 * Leaf nodes in the sysctl tree will be represented by a single file
1840 * under /proc; non-leaf nodes will be represented by directories.
1841 *
1842 * sysctl(2) can automatically manage read and write requests through
1843 * the sysctl table. The data and maxlen fields of the ctl_table
1844 * struct enable minimal validation of the values being written to be
1845 * performed, and the mode field allows minimal authentication.
1846 *
1847 * There must be a proc_handler routine for any terminal nodes
1848 * mirrored under /proc/sys (non-terminals are handled by a built-in
1849 * directory handler). Several default handlers are available to
1850 * cover common cases -
1851 *
1852 * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(),
1853 * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(),
1854 * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax()
1855 *
1856 * It is the handler's job to read the input buffer from user memory
1857 * and process it. The handler should return 0 on success.
1858 *
1859 * This routine returns %NULL on a failure to register, and a pointer
1860 * to the table header on success.
1861 */
1862struct ctl_table_header *__register_sysctl_paths(
1863 struct ctl_table_root *root,
1864 struct nsproxy *namespaces,
1865 const struct ctl_path *path, struct ctl_table *table)
1866{
1867 struct ctl_table_header *header;
1868 struct ctl_table *new, **prevp;
1869 unsigned int n, npath;
1870 struct ctl_table_set *set;
1871
1872 /* Count the path components */
1873 for (npath = 0; path[npath].procname; ++npath)
1874 ;
1875
1876 /*
1877 * For each path component, allocate a 2-element ctl_table array.
1878 * The first array element will be filled with the sysctl entry
1879 * for this, the second will be the sentinel (procname == 0).
1880 *
1881 * We allocate everything in one go so that we don't have to
1882 * worry about freeing additional memory in unregister_sysctl_table.
1883 */
1884 header = kzalloc(sizeof(struct ctl_table_header) +
1885 (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL);
1886 if (!header)
1887 return NULL;
1888
1889 new = (struct ctl_table *) (header + 1);
1890
1891 /* Now connect the dots */
1892 prevp = &header->ctl_table;
1893 for (n = 0; n < npath; ++n, ++path) {
1894 /* Copy the procname */
1895 new->procname = path->procname;
1896 new->mode = 0555;
1897
1898 *prevp = new;
1899 prevp = &new->child;
1900
1901 new += 2;
1902 }
1903 *prevp = table;
1904 header->ctl_table_arg = table;
1905
1906 INIT_LIST_HEAD(&header->ctl_entry);
1907 header->used = 0;
1908 header->unregistering = NULL;
1909 header->root = root;
1910 sysctl_set_parent(NULL, header->ctl_table);
1911 header->count = 1;
1912#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1913 if (sysctl_check_table(namespaces, header->ctl_table)) {
1914 kfree(header);
1915 return NULL;
1916 }
1917#endif
1918 spin_lock(&sysctl_lock);
1919 header->set = lookup_header_set(root, namespaces);
1920 header->attached_by = header->ctl_table;
1921 header->attached_to = root_table;
1922 header->parent = &root_table_header;
1923 for (set = header->set; set; set = set->parent) {
1924 struct ctl_table_header *p;
1925 list_for_each_entry(p, &set->list, ctl_entry) {
1926 if (p->unregistering)
1927 continue;
1928 try_attach(p, header);
1929 }
1930 }
1931 header->parent->count++;
1932 list_add_tail(&header->ctl_entry, &header->set->list);
1933 spin_unlock(&sysctl_lock);
1934
1935 return header;
1936}
1937
1938/**
1939 * register_sysctl_table_path - register a sysctl table hierarchy
1940 * @path: The path to the directory the sysctl table is in.
1941 * @table: the top-level table structure
1942 *
1943 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1944 * array. A completely 0 filled entry terminates the table.
1945 *
1946 * See __register_sysctl_paths for more details.
1947 */
1948struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
1949 struct ctl_table *table)
1950{
1951 return __register_sysctl_paths(&sysctl_table_root, current->nsproxy,
1952 path, table);
1953}
1954
1955/**
1956 * register_sysctl_table - register a sysctl table hierarchy
1957 * @table: the top-level table structure
1958 *
1959 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1960 * array. A completely 0 filled entry terminates the table.
1961 *
1962 * See register_sysctl_paths for more details.
1963 */
1964struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
1965{
1966 static const struct ctl_path null_path[] = { {} };
1967
1968 return register_sysctl_paths(null_path, table);
1969}
1970
1971/**
1972 * unregister_sysctl_table - unregister a sysctl table hierarchy
1973 * @header: the header returned from register_sysctl_table
1974 *
1975 * Unregisters the sysctl table and all children. The /proc entries may not
1976 * actually be removed until they are no longer in use.
1977 */
1978void unregister_sysctl_table(struct ctl_table_header * header)
1979{
1980 might_sleep();
1981
1982 if (header == NULL)
1983 return;
1984
1985 spin_lock(&sysctl_lock);
1986 start_unregistering(header);
1987 if (!--header->parent->count) {
1988 WARN_ON(1);
1989 kfree_rcu(header->parent, rcu);
1990 }
1991 if (!--header->count)
1992 kfree_rcu(header, rcu);
1993 spin_unlock(&sysctl_lock);
1994}
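
Registration and unregistration pair naturally across module init and exit; a short sketch assuming the hypothetical my_header from the registration sketch above:

static void __exit my_sysctl_exit(void)
{
	/* a NULL header is tolerated, so this is safe even if registration failed */
	unregister_sysctl_table(my_header);
	my_header = NULL;
}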
1995
1996int sysctl_is_seen(struct ctl_table_header *p)
1997{
1998 struct ctl_table_set *set = p->set;
1999 int res;
2000 spin_lock(&sysctl_lock);
2001 if (p->unregistering)
2002 res = 0;
2003 else if (!set->is_seen)
2004 res = 1;
2005 else
2006 res = set->is_seen(set);
2007 spin_unlock(&sysctl_lock);
2008 return res;
2009}
2010
2011void setup_sysctl_set(struct ctl_table_set *p,
2012 struct ctl_table_set *parent,
2013 int (*is_seen)(struct ctl_table_set *))
2014{
2015 INIT_LIST_HEAD(&p->list);
2016 p->parent = parent ? parent : &sysctl_table_root.default_set;
2017 p->is_seen = is_seen;
2018}
2019
2020#else /* !CONFIG_SYSCTL */
2021struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
2022{
2023 return NULL;
2024}
2025
2026struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
2027 struct ctl_table *table)
2028{
2029 return NULL;
2030}
2031
2032void unregister_sysctl_table(struct ctl_table_header * table)
2033{
2034}
2035
2036void setup_sysctl_set(struct ctl_table_set *p,
2037 struct ctl_table_set *parent,
2038 int (*is_seen)(struct ctl_table_set *))
2039{
2040}
2041
2042void sysctl_head_put(struct ctl_table_header *head)
2043{
2044}
2045
2046#endif /* CONFIG_SYSCTL */ 1555#endif /* CONFIG_SYSCTL */
2047 1556
2048/* 1557/*
@@ -3008,6 +2517,3 @@ EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
3008EXPORT_SYMBOL(proc_dostring); 2517EXPORT_SYMBOL(proc_dostring);
3009EXPORT_SYMBOL(proc_doulongvec_minmax); 2518EXPORT_SYMBOL(proc_doulongvec_minmax);
3010EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); 2519EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
3011EXPORT_SYMBOL(register_sysctl_table);
3012EXPORT_SYMBOL(register_sysctl_paths);
3013EXPORT_SYMBOL(unregister_sysctl_table);
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
deleted file mode 100644
index 362da653813d..000000000000
--- a/kernel/sysctl_check.c
+++ /dev/null
@@ -1,160 +0,0 @@
1#include <linux/stat.h>
2#include <linux/sysctl.h>
3#include "../fs/xfs/xfs_sysctl.h"
4#include <linux/sunrpc/debug.h>
5#include <linux/string.h>
6#include <net/ip_vs.h>
7
8
9static int sysctl_depth(struct ctl_table *table)
10{
11 struct ctl_table *tmp;
12 int depth;
13
14 depth = 0;
15 for (tmp = table; tmp->parent; tmp = tmp->parent)
16 depth++;
17
18 return depth;
19}
20
21static struct ctl_table *sysctl_parent(struct ctl_table *table, int n)
22{
23 int i;
24
25 for (i = 0; table && i < n; i++)
26 table = table->parent;
27
28 return table;
29}
30
31
32static void sysctl_print_path(struct ctl_table *table)
33{
34 struct ctl_table *tmp;
35 int depth, i;
36 depth = sysctl_depth(table);
37 if (table->procname) {
38 for (i = depth; i >= 0; i--) {
39 tmp = sysctl_parent(table, i);
40 printk("/%s", tmp->procname?tmp->procname:"");
41 }
42 }
43 printk(" ");
44}
45
46static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
47 struct ctl_table *table)
48{
49 struct ctl_table_header *head;
50 struct ctl_table *ref, *test;
51 int depth, cur_depth;
52
53 depth = sysctl_depth(table);
54
55 for (head = __sysctl_head_next(namespaces, NULL); head;
56 head = __sysctl_head_next(namespaces, head)) {
57 cur_depth = depth;
58 ref = head->ctl_table;
59repeat:
60 test = sysctl_parent(table, cur_depth);
61 for (; ref->procname; ref++) {
62 int match = 0;
63 if (cur_depth && !ref->child)
64 continue;
65
66 if (test->procname && ref->procname &&
67 (strcmp(test->procname, ref->procname) == 0))
68 match++;
69
70 if (match) {
71 if (cur_depth != 0) {
72 cur_depth--;
73 ref = ref->child;
74 goto repeat;
75 }
76 goto out;
77 }
78 }
79 }
80 ref = NULL;
81out:
82 sysctl_head_finish(head);
83 return ref;
84}
85
86static void set_fail(const char **fail, struct ctl_table *table, const char *str)
87{
88 if (*fail) {
89 printk(KERN_ERR "sysctl table check failed: ");
90 sysctl_print_path(table);
91 printk(" %s\n", *fail);
92 dump_stack();
93 }
94 *fail = str;
95}
96
97static void sysctl_check_leaf(struct nsproxy *namespaces,
98 struct ctl_table *table, const char **fail)
99{
100 struct ctl_table *ref;
101
102 ref = sysctl_check_lookup(namespaces, table);
103 if (ref && (ref != table))
104 set_fail(fail, table, "Sysctl already exists");
105}
106
107int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
108{
109 int error = 0;
110 for (; table->procname; table++) {
111 const char *fail = NULL;
112
113 if (table->parent) {
114 if (!table->parent->procname)
115 set_fail(&fail, table, "Parent without procname");
116 }
117 if (table->child) {
118 if (table->data)
119 set_fail(&fail, table, "Directory with data?");
120 if (table->maxlen)
121 set_fail(&fail, table, "Directory with maxlen?");
122 if ((table->mode & (S_IRUGO|S_IXUGO)) != table->mode)
123 set_fail(&fail, table, "Writable sysctl directory");
124 if (table->proc_handler)
125 set_fail(&fail, table, "Directory with proc_handler");
126 if (table->extra1)
127 set_fail(&fail, table, "Directory with extra1");
128 if (table->extra2)
129 set_fail(&fail, table, "Directory with extra2");
130 } else {
131 if ((table->proc_handler == proc_dostring) ||
132 (table->proc_handler == proc_dointvec) ||
133 (table->proc_handler == proc_dointvec_minmax) ||
134 (table->proc_handler == proc_dointvec_jiffies) ||
135 (table->proc_handler == proc_dointvec_userhz_jiffies) ||
136 (table->proc_handler == proc_dointvec_ms_jiffies) ||
137 (table->proc_handler == proc_doulongvec_minmax) ||
138 (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
139 if (!table->data)
140 set_fail(&fail, table, "No data");
141 if (!table->maxlen)
142 set_fail(&fail, table, "No maxlen");
143 }
144#ifdef CONFIG_PROC_SYSCTL
145 if (!table->proc_handler)
146 set_fail(&fail, table, "No proc_handler");
147#endif
148 sysctl_check_leaf(namespaces, table, &fail);
149 }
150 if (table->mode > 0777)
151 set_fail(&fail, table, "bogus .mode");
152 if (fail) {
153 set_fail(&fail, table, NULL);
154 error = -EINVAL;
155 }
156 if (table->child)
157 error |= sysctl_check_table(namespaces, table->child);
158 }
159 return error;
160}
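
To make the removed checks concrete, a hedged sketch (hypothetical names) of a table sysctl_check_table() would have rejected — a directory entry that also carries leaf fields and write permission:

#include <linux/sysctl.h>

static int some_value;
static struct ctl_table child_table[] = { { } };	/* empty, sentinel only */

static struct ctl_table bad_dir[] = {
	{
		.procname	= "mydir",
		.mode		= 0755,			/* flagged: "Writable sysctl directory" */
		.child		= child_table,
		.data		= &some_value,		/* flagged: "Directory with data?" */
	},
	{ }
};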
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 403c2a092830..15be32e19c6e 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1260,6 +1260,8 @@ ktime_t ktime_get_monotonic_offset(void)
1260 1260
1261 return timespec_to_ktime(wtom); 1261 return timespec_to_ktime(wtom);
1262} 1262}
1263EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
1264
1263 1265
1264/** 1266/**
1265 * xtime_update() - advances the timekeeping infrastructure 1267 * xtime_update() - advances the timekeeping infrastructure
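
With the accessor now exported to GPL modules, a module can translate a monotonic timestamp into wall-clock time; a hedged sketch (mono_to_real is hypothetical), relying on CLOCK_MONOTONIC being wall time plus wall_to_monotonic:

#include <linux/ktime.h>

static ktime_t mono_to_real(ktime_t mono)
{
	/* wall = monotonic - wall_to_monotonic */
	return ktime_sub(mono, ktime_get_monotonic_offset());
}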
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index c5a01873567d..859fae6b1825 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -264,7 +264,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
264 return ret; 264 return ret;
265} 265}
266 266
267int trace_seq_path(struct trace_seq *s, struct path *path) 267int trace_seq_path(struct trace_seq *s, const struct path *path)
268{ 268{
269 unsigned char *p; 269 unsigned char *p;
270 270
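
The const-qualified parameter lets callers hand in paths reached through const pointers without casting; a hedged sketch with a hypothetical helper (the header choice is also an assumption):

#include <linux/fs.h>
#include <linux/trace_seq.h>

static void report_file(struct trace_seq *s, const struct file *filp)
{
	/* f_path is embedded in struct file, so &filp->f_path is const-qualified here */
	trace_seq_path(s, &filp->f_path);
}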
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 14bc092fb12c..df30ee08bdd4 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -9,6 +9,8 @@
9 * to those contributors as well. 9 * to those contributors as well.
10 */ 10 */
11 11
12#define pr_fmt(fmt) "NMI watchdog: " fmt
13
12#include <linux/mm.h> 14#include <linux/mm.h>
13#include <linux/cpu.h> 15#include <linux/cpu.h>
14#include <linux/nmi.h> 16#include <linux/nmi.h>
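
The pr_fmt() definition, placed before any printk-related include, prefixes every pr_*() call in the file; a small sketch of the effect (announce is a hypothetical function):

#define pr_fmt(fmt) "NMI watchdog: " fmt
#include <linux/printk.h>

static void announce(void)
{
	pr_info("enabled, takes one hw-pmu counter.\n");
	/* logged as: "NMI watchdog: enabled, takes one hw-pmu counter." */
}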
@@ -319,11 +321,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
319 */ 321 */
320static int watchdog(void *unused) 322static int watchdog(void *unused)
321{ 323{
322 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 324 struct sched_param param = { .sched_priority = 0 };
323 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 325 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
324 326
325 sched_setscheduler(current, SCHED_FIFO, &param);
326
327 /* initialize timestamp */ 327 /* initialize timestamp */
328 __touch_watchdog(); 328 __touch_watchdog();
329 329
@@ -349,8 +349,11 @@ static int watchdog(void *unused)
349 349
350 set_current_state(TASK_INTERRUPTIBLE); 350 set_current_state(TASK_INTERRUPTIBLE);
351 } 351 }
352 /*
353 * Drop the policy/priority elevation during thread exit to avoid a
354 * scheduling latency spike.
355 */
352 __set_current_state(TASK_RUNNING); 356 __set_current_state(TASK_RUNNING);
353 param.sched_priority = 0;
354 sched_setscheduler(current, SCHED_NORMAL, &param); 357 sched_setscheduler(current, SCHED_NORMAL, &param);
355 return 0; 358 return 0;
356} 359}
@@ -376,18 +379,20 @@ static int watchdog_nmi_enable(int cpu)
376 /* Try to register using hardware perf events */ 379 /* Try to register using hardware perf events */
377 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); 380 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
378 if (!IS_ERR(event)) { 381 if (!IS_ERR(event)) {
379 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); 382 pr_info("enabled, takes one hw-pmu counter.\n");
380 goto out_save; 383 goto out_save;
381 } 384 }
382 385
383 386
384 /* vary the KERN level based on the returned errno */ 387 /* vary the KERN level based on the returned errno */
385 if (PTR_ERR(event) == -EOPNOTSUPP) 388 if (PTR_ERR(event) == -EOPNOTSUPP)
386 printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu); 389 pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
387 else if (PTR_ERR(event) == -ENOENT) 390 else if (PTR_ERR(event) == -ENOENT)
388 printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu); 391 pr_warning("disabled (cpu%i): hardware events not enabled\n",
392 cpu);
389 else 393 else
390 printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event)); 394 pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
395 cpu, PTR_ERR(event));
391 return PTR_ERR(event); 396 return PTR_ERR(event);
392 397
393 /* success path */ 398 /* success path */
@@ -439,9 +444,10 @@ static int watchdog_enable(int cpu)
439 444
440 /* create the watchdog thread */ 445 /* create the watchdog thread */
441 if (!p) { 446 if (!p) {
447 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
442 p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu); 448 p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu);
443 if (IS_ERR(p)) { 449 if (IS_ERR(p)) {
444 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); 450 pr_err("softlockup watchdog for %i failed\n", cpu);
445 if (!err) { 451 if (!err) {
446 /* if hardlockup hasn't already set this */ 452 /* if hardlockup hasn't already set this */
447 err = PTR_ERR(p); 453 err = PTR_ERR(p);
@@ -450,6 +456,7 @@ static int watchdog_enable(int cpu)
450 } 456 }
451 goto out; 457 goto out;
452 } 458 }
459 sched_setscheduler(p, SCHED_FIFO, &param);
453 kthread_bind(p, cpu); 460 kthread_bind(p, cpu);
454 per_cpu(watchdog_touch_ts, cpu) = 0; 461 per_cpu(watchdog_touch_ts, cpu) = 0;
455 per_cpu(softlockup_watchdog, cpu) = p; 462 per_cpu(softlockup_watchdog, cpu) = p;
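
This hunk moves the SCHED_FIFO elevation from the watchdog thread's own body to its creator; a hedged sketch of that creator-side pattern in general form (start_rt_kthread, my_thread_fn and the thread name are hypothetical):

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/topology.h>

static struct task_struct *start_rt_kthread(int cpu, int (*my_thread_fn)(void *))
{
	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
	struct task_struct *p;

	p = kthread_create_on_node(my_thread_fn, NULL, cpu_to_node(cpu),
				   "my_rt_thread/%d", cpu);
	if (IS_ERR(p))
		return p;

	/* raise the priority before the thread ever runs */
	sched_setscheduler(p, SCHED_FIFO, &param);
	kthread_bind(p, cpu);
	wake_up_process(p);
	return p;
}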
@@ -496,7 +503,7 @@ static void watchdog_enable_all_cpus(void)
496 watchdog_enabled = 1; 503 watchdog_enabled = 1;
497 504
498 if (!watchdog_enabled) 505 if (!watchdog_enabled)
499 printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); 506 pr_err("failed to be enabled on some cpus\n");
500 507
501} 508}
502 509
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f2c5638bb5ab..5abf42f63c08 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -476,13 +476,8 @@ static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
476 struct workqueue_struct *wq) 476 struct workqueue_struct *wq)
477{ 477{
478 if (!(wq->flags & WQ_UNBOUND)) { 478 if (!(wq->flags & WQ_UNBOUND)) {
479 if (likely(cpu < nr_cpu_ids)) { 479 if (likely(cpu < nr_cpu_ids))
480#ifdef CONFIG_SMP
481 return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); 480 return per_cpu_ptr(wq->cpu_wq.pcpu, cpu);
482#else
483 return wq->cpu_wq.single;
484#endif
485 }
486 } else if (likely(cpu == WORK_CPU_UNBOUND)) 481 } else if (likely(cpu == WORK_CPU_UNBOUND))
487 return wq->cpu_wq.single; 482 return wq->cpu_wq.single;
488 return NULL; 483 return NULL;
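
The CONFIG_SMP special case can go because the per-cpu allocator and accessors are also defined for uniprocessor builds; a hedged sketch (my_pcpu_data and the helpers are hypothetical) of the unconditional form:

#include <linux/percpu.h>

struct my_pcpu_data { int counter; };

static struct my_pcpu_data __percpu *alloc_data(void)
{
	return __alloc_percpu(sizeof(struct my_pcpu_data),
			      __alignof__(struct my_pcpu_data));
}

static void bump(struct my_pcpu_data __percpu *data, int cpu)
{
	/* compiles and works on both SMP and UP; no #ifdef needed */
	per_cpu_ptr(data, cpu)->counter++;
}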
@@ -2899,13 +2894,8 @@ static int alloc_cwqs(struct workqueue_struct *wq)
2899 const size_t size = sizeof(struct cpu_workqueue_struct); 2894 const size_t size = sizeof(struct cpu_workqueue_struct);
2900 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, 2895 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
2901 __alignof__(unsigned long long)); 2896 __alignof__(unsigned long long));
2902#ifdef CONFIG_SMP
2903 bool percpu = !(wq->flags & WQ_UNBOUND);
2904#else
2905 bool percpu = false;
2906#endif
2907 2897
2908 if (percpu) 2898 if (!(wq->flags & WQ_UNBOUND))
2909 wq->cpu_wq.pcpu = __alloc_percpu(size, align); 2899 wq->cpu_wq.pcpu = __alloc_percpu(size, align);
2910 else { 2900 else {
2911 void *ptr; 2901 void *ptr;
@@ -2929,13 +2919,7 @@ static int alloc_cwqs(struct workqueue_struct *wq)
2929 2919
2930static void free_cwqs(struct workqueue_struct *wq) 2920static void free_cwqs(struct workqueue_struct *wq)
2931{ 2921{
2932#ifdef CONFIG_SMP 2922 if (!(wq->flags & WQ_UNBOUND))
2933 bool percpu = !(wq->flags & WQ_UNBOUND);
2934#else
2935 bool percpu = false;
2936#endif
2937
2938 if (percpu)
2939 free_percpu(wq->cpu_wq.pcpu); 2923 free_percpu(wq->cpu_wq.pcpu);
2940 else if (wq->cpu_wq.single) { 2924 else if (wq->cpu_wq.single) {
2941 /* the pointer to free is stored right after the cwq */ 2925 /* the pointer to free is stored right after the cwq */