Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks | 4
-rw-r--r--  kernel/Kconfig.preempt | 1
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/audit.c | 2
-rw-r--r--  kernel/cgroup.c | 354
-rw-r--r--  kernel/cgroup_freezer.c | 11
-rw-r--r--  kernel/cpuset.c | 59
-rw-r--r--  kernel/cred.c | 1
-rw-r--r--  kernel/debug/debug_core.c | 34
-rw-r--r--  kernel/debug/gdbstub.c | 10
-rw-r--r--  kernel/debug/kdb/kdb_bp.c | 7
-rw-r--r--  kernel/debug/kdb/kdb_bt.c | 1
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_keyboard.c | 95
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 3
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 7
-rw-r--r--  kernel/debug/kdb/kdb_support.c | 4
-rw-r--r--  kernel/dma.c | 1
-rw-r--r--  kernel/events/core.c | 259
-rw-r--r--  kernel/events/hw_breakpoint.c | 13
-rw-r--r--  kernel/exit.c | 68
-rw-r--r--  kernel/fork.c | 36
-rw-r--r--  kernel/freezer.c | 6
-rw-r--r--  kernel/futex.c | 51
-rw-r--r--  kernel/irq/Kconfig | 10
-rw-r--r--  kernel/irq/chip.c | 5
-rw-r--r--  kernel/irq/handle.c | 14
-rw-r--r--  kernel/irq/internals.h | 2
-rw-r--r--  kernel/irq/irqdomain.c | 828
-rw-r--r--  kernel/irq/manage.c | 86
-rw-r--r--  kernel/jump_label.c | 135
-rw-r--r--  kernel/kexec.c | 15
-rw-r--r--  kernel/kmod.c | 84
-rw-r--r--  kernel/lockdep.c | 8
-rw-r--r--  kernel/module.c | 37
-rw-r--r--  kernel/padata.c | 44
-rw-r--r--  kernel/params.c | 40
-rw-r--r--  kernel/pid_namespace.c | 41
-rw-r--r--  kernel/power/Makefile | 3
-rw-r--r--  kernel/power/hibernate.c | 47
-rw-r--r--  kernel/power/main.c | 20
-rw-r--r--  kernel/power/power.h | 7
-rw-r--r--  kernel/power/process.c | 24
-rw-r--r--  kernel/power/qos.c | 23
-rw-r--r--  kernel/power/snapshot.c | 35
-rw-r--r--  kernel/power/suspend.c | 84
-rw-r--r--  kernel/power/user.c | 12
-rw-r--r--  kernel/printk.c | 5
-rw-r--r--  kernel/ptrace.c | 66
-rw-r--r--  kernel/rcu.h | 26
-rw-r--r--  kernel/rcupdate.c | 5
-rw-r--r--  kernel/rcutiny.c | 26
-rw-r--r--  kernel/rcutiny_plugin.h | 77
-rw-r--r--  kernel/rcutorture.c | 91
-rw-r--r--  kernel/rcutree.c | 507
-rw-r--r--  kernel/rcutree.h | 27
-rw-r--r--  kernel/rcutree_plugin.h | 450
-rw-r--r--  kernel/rcutree_trace.c | 12
-rw-r--r--  kernel/resource.c | 3
-rw-r--r--  kernel/rwsem.c | 1
-rw-r--r--  kernel/sched/core.c | 40
-rw-r--r--  kernel/sched/fair.c | 8
-rw-r--r--  kernel/sched/sched.h | 14
-rw-r--r--  kernel/signal.c | 56
-rw-r--r--  kernel/smp.c | 90
-rw-r--r--  kernel/softirq.c | 26
-rw-r--r--  kernel/spinlock.c | 2
-rw-r--r--  kernel/srcu.c | 33
-rw-r--r--  kernel/sys.c | 19
-rw-r--r--  kernel/sysctl.c | 514
-rw-r--r--  kernel/sysctl_check.c | 160
-rw-r--r--  kernel/time.c | 6
-rw-r--r--  kernel/time/alarmtimer.c | 8
-rw-r--r--  kernel/time/clocksource.c | 2
-rw-r--r--  kernel/time/ntp.c | 191
-rw-r--r--  kernel/time/tick-broadcast.c | 4
-rw-r--r--  kernel/time/tick-sched.c | 17
-rw-r--r--  kernel/time/timekeeping.c | 373
-rw-r--r--  kernel/trace/ftrace.c | 134
-rw-r--r--  kernel/trace/trace.c | 6
-rw-r--r--  kernel/trace/trace.h | 38
-rw-r--r--  kernel/trace/trace_entries.h | 54
-rw-r--r--  kernel/trace/trace_event_perf.c | 208
-rw-r--r--  kernel/trace/trace_events.c | 12
-rw-r--r--  kernel/trace/trace_events_filter.c | 175
-rw-r--r--  kernel/trace/trace_export.c | 64
-rw-r--r--  kernel/trace/trace_kprobe.c | 8
-rw-r--r--  kernel/trace/trace_output.c | 14
-rw-r--r--  kernel/trace/trace_syscalls.c | 22
-rw-r--r--  kernel/tracepoint.c | 20
-rw-r--r--  kernel/watchdog.c | 51
-rw-r--r--  kernel/workqueue.c | 29
92 files changed, 3887 insertions, 2441 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 5068e2a4e75f..2251882daf53 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -124,8 +124,8 @@ config INLINE_SPIN_LOCK_IRQSAVE
 	def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
 		 ARCH_INLINE_SPIN_LOCK_IRQSAVE
 
-config INLINE_SPIN_UNLOCK
-	def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK)
+config UNINLINE_SPIN_UNLOCK
+	bool
 
 config INLINE_SPIN_UNLOCK_BH
 	def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 24e7cb0ba26a..3f9c97419f02 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -36,6 +36,7 @@ config PREEMPT_VOLUNTARY
 config PREEMPT
 	bool "Preemptible Kernel (Low-Latency Desktop)"
 	select PREEMPT_COUNT
+	select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
 	help
 	  This option reduces the latency of the kernel by making
 	  all kernel code (that is not executing in a critical section)
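Taken together, the two Kconfig hunks above invert the sense of the old INLINE_SPIN_UNLOCK option: spin_unlock() is inlined by default, and a single out-of-line copy is used only when UNINLINE_SPIN_UNLOCK gets selected (here, by PREEMPT on architectures that do not provide ARCH_INLINE_SPIN_UNLOCK). The sketch below shows the kind of header logic such a symbol typically drives; the macro and function names follow include/linux/spinlock_api_smp.h from memory and should be treated as assumptions, not as text from this patch.

/*
 * Sketch only: how a symbol like UNINLINE_SPIN_UNLOCK usually selects
 * between an inline unlock at every call site and one shared
 * out-of-line implementation. Names are assumptions, not quotes from
 * the patch above.
 */
#ifndef CONFIG_UNINLINE_SPIN_UNLOCK
/* default: callers get the inline fast path */
#define _raw_spin_unlock(lock) __raw_spin_unlock(lock)
#else
/* e.g. PREEMPT without ARCH_INLINE_SPIN_UNLOCK: one out-of-line copy */
void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock);
#endif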
diff --git a/kernel/Makefile b/kernel/Makefile
index 2d9de86b7e76..cb41b9547c9f 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -27,7 +27,6 @@ obj-y += power/
 
 obj-$(CONFIG_FREEZER) += freezer.o
 obj-$(CONFIG_PROFILING) += profile.o
-obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
diff --git a/kernel/audit.c b/kernel/audit.c
index bb0eb5bb9a0a..1c7f2c61416b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1418,7 +1418,7 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
 
 /* This is a helper-function to print the escaped d_path */
 void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
-		      struct path *path)
+		      const struct path *path)
 {
 	char *p, *pathname;
 
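The audit.c change above only constifies the path argument; with const struct path * in the prototype, the helper documents that it will not modify the caller's path, so read-only references can be logged directly. A hypothetical caller, purely for illustration (the function name and prefix string are made up):

/*
 * Hypothetical caller of the constified helper: a file's f_path can be
 * handed straight to audit_log_d_path(). Illustrative only.
 */
static void example_log_open(struct audit_buffer *ab, struct file *file)
{
	audit_log_d_path(ab, " path=", &file->f_path);
}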
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a5d3b5325f77..f4ea4b6f3cf1 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -818,7 +818,7 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
818 818
819 for_each_subsys(cgrp->root, ss) 819 for_each_subsys(cgrp->root, ss)
820 if (ss->pre_destroy) { 820 if (ss->pre_destroy) {
821 ret = ss->pre_destroy(ss, cgrp); 821 ret = ss->pre_destroy(cgrp);
822 if (ret) 822 if (ret)
823 break; 823 break;
824 } 824 }
@@ -846,7 +846,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
846 * Release the subsystem state objects. 846 * Release the subsystem state objects.
847 */ 847 */
848 for_each_subsys(cgrp->root, ss) 848 for_each_subsys(cgrp->root, ss)
849 ss->destroy(ss, cgrp); 849 ss->destroy(cgrp);
850 850
851 cgrp->root->number_of_cgroups--; 851 cgrp->root->number_of_cgroups--;
852 mutex_unlock(&cgroup_mutex); 852 mutex_unlock(&cgroup_mutex);
@@ -1015,7 +1015,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1015 list_move(&ss->sibling, &root->subsys_list); 1015 list_move(&ss->sibling, &root->subsys_list);
1016 ss->root = root; 1016 ss->root = root;
1017 if (ss->bind) 1017 if (ss->bind)
1018 ss->bind(ss, cgrp); 1018 ss->bind(cgrp);
1019 mutex_unlock(&ss->hierarchy_mutex); 1019 mutex_unlock(&ss->hierarchy_mutex);
1020 /* refcount was already taken, and we're keeping it */ 1020 /* refcount was already taken, and we're keeping it */
1021 } else if (bit & removed_bits) { 1021 } else if (bit & removed_bits) {
@@ -1025,7 +1025,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1025 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 1025 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
1026 mutex_lock(&ss->hierarchy_mutex); 1026 mutex_lock(&ss->hierarchy_mutex);
1027 if (ss->bind) 1027 if (ss->bind)
1028 ss->bind(ss, dummytop); 1028 ss->bind(dummytop);
1029 dummytop->subsys[i]->cgroup = dummytop; 1029 dummytop->subsys[i]->cgroup = dummytop;
1030 cgrp->subsys[i] = NULL; 1030 cgrp->subsys[i] = NULL;
1031 subsys[i]->root = &rootnode; 1031 subsys[i]->root = &rootnode;
@@ -1472,7 +1472,6 @@ static int cgroup_get_rootdir(struct super_block *sb)
1472 1472
1473 struct inode *inode = 1473 struct inode *inode =
1474 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); 1474 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
1475 struct dentry *dentry;
1476 1475
1477 if (!inode) 1476 if (!inode)
1478 return -ENOMEM; 1477 return -ENOMEM;
@@ -1481,12 +1480,9 @@ static int cgroup_get_rootdir(struct super_block *sb)
1481 inode->i_op = &cgroup_dir_inode_operations; 1480 inode->i_op = &cgroup_dir_inode_operations;
1482 /* directories start off with i_nlink == 2 (for "." entry) */ 1481 /* directories start off with i_nlink == 2 (for "." entry) */
1483 inc_nlink(inode); 1482 inc_nlink(inode);
1484 dentry = d_alloc_root(inode); 1483 sb->s_root = d_make_root(inode);
1485 if (!dentry) { 1484 if (!sb->s_root)
1486 iput(inode);
1487 return -ENOMEM; 1485 return -ENOMEM;
1488 }
1489 sb->s_root = dentry;
1490 /* for everything else we want ->d_op set */ 1486 /* for everything else we want ->d_op set */
1491 sb->s_d_op = &cgroup_dops; 1487 sb->s_d_op = &cgroup_dops;
1492 return 0; 1488 return 0;
@@ -1763,6 +1759,7 @@ EXPORT_SYMBOL_GPL(cgroup_path);
1763struct task_and_cgroup { 1759struct task_and_cgroup {
1764 struct task_struct *task; 1760 struct task_struct *task;
1765 struct cgroup *cgrp; 1761 struct cgroup *cgrp;
1762 struct css_set *cg;
1766}; 1763};
1767 1764
1768struct cgroup_taskset { 1765struct cgroup_taskset {
@@ -1843,11 +1840,10 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1843 * will already exist. If not set, this function might sleep, and can fail with 1840 * will already exist. If not set, this function might sleep, and can fail with
1844 * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked. 1841 * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
1845 */ 1842 */
1846static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, 1843static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1847 struct task_struct *tsk, bool guarantee) 1844 struct task_struct *tsk, struct css_set *newcg)
1848{ 1845{
1849 struct css_set *oldcg; 1846 struct css_set *oldcg;
1850 struct css_set *newcg;
1851 1847
1852 /* 1848 /*
1853 * We are synchronized through threadgroup_lock() against PF_EXITING 1849 * We are synchronized through threadgroup_lock() against PF_EXITING
@@ -1857,23 +1853,6 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1857 WARN_ON_ONCE(tsk->flags & PF_EXITING); 1853 WARN_ON_ONCE(tsk->flags & PF_EXITING);
1858 oldcg = tsk->cgroups; 1854 oldcg = tsk->cgroups;
1859 1855
1860 /* locate or allocate a new css_set for this task. */
1861 if (guarantee) {
1862 /* we know the css_set we want already exists. */
1863 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
1864 read_lock(&css_set_lock);
1865 newcg = find_existing_css_set(oldcg, cgrp, template);
1866 BUG_ON(!newcg);
1867 get_css_set(newcg);
1868 read_unlock(&css_set_lock);
1869 } else {
1870 might_sleep();
1871 /* find_css_set will give us newcg already referenced. */
1872 newcg = find_css_set(oldcg, cgrp);
1873 if (!newcg)
1874 return -ENOMEM;
1875 }
1876
1877 task_lock(tsk); 1856 task_lock(tsk);
1878 rcu_assign_pointer(tsk->cgroups, newcg); 1857 rcu_assign_pointer(tsk->cgroups, newcg);
1879 task_unlock(tsk); 1858 task_unlock(tsk);
@@ -1892,7 +1871,6 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1892 put_css_set(oldcg); 1871 put_css_set(oldcg);
1893 1872
1894 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1873 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1895 return 0;
1896} 1874}
1897 1875
1898/** 1876/**
@@ -1910,6 +1888,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1910 struct cgroup *oldcgrp; 1888 struct cgroup *oldcgrp;
1911 struct cgroupfs_root *root = cgrp->root; 1889 struct cgroupfs_root *root = cgrp->root;
1912 struct cgroup_taskset tset = { }; 1890 struct cgroup_taskset tset = { };
1891 struct css_set *newcg;
1913 1892
1914 /* @tsk either already exited or can't exit until the end */ 1893 /* @tsk either already exited or can't exit until the end */
1915 if (tsk->flags & PF_EXITING) 1894 if (tsk->flags & PF_EXITING)
@@ -1925,7 +1904,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1925 1904
1926 for_each_subsys(root, ss) { 1905 for_each_subsys(root, ss) {
1927 if (ss->can_attach) { 1906 if (ss->can_attach) {
1928 retval = ss->can_attach(ss, cgrp, &tset); 1907 retval = ss->can_attach(cgrp, &tset);
1929 if (retval) { 1908 if (retval) {
1930 /* 1909 /*
1931 * Remember on which subsystem the can_attach() 1910 * Remember on which subsystem the can_attach()
@@ -1939,13 +1918,17 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1939 } 1918 }
1940 } 1919 }
1941 1920
1942 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); 1921 newcg = find_css_set(tsk->cgroups, cgrp);
1943 if (retval) 1922 if (!newcg) {
1923 retval = -ENOMEM;
1944 goto out; 1924 goto out;
1925 }
1926
1927 cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg);
1945 1928
1946 for_each_subsys(root, ss) { 1929 for_each_subsys(root, ss) {
1947 if (ss->attach) 1930 if (ss->attach)
1948 ss->attach(ss, cgrp, &tset); 1931 ss->attach(cgrp, &tset);
1949 } 1932 }
1950 1933
1951 synchronize_rcu(); 1934 synchronize_rcu();
@@ -1967,7 +1950,7 @@ out:
1967 */ 1950 */
1968 break; 1951 break;
1969 if (ss->cancel_attach) 1952 if (ss->cancel_attach)
1970 ss->cancel_attach(ss, cgrp, &tset); 1953 ss->cancel_attach(cgrp, &tset);
1971 } 1954 }
1972 } 1955 }
1973 return retval; 1956 return retval;
@@ -1997,66 +1980,6 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
1997} 1980}
1998EXPORT_SYMBOL_GPL(cgroup_attach_task_all); 1981EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
1999 1982
2000/*
2001 * cgroup_attach_proc works in two stages, the first of which prefetches all
2002 * new css_sets needed (to make sure we have enough memory before committing
2003 * to the move) and stores them in a list of entries of the following type.
2004 * TODO: possible optimization: use css_set->rcu_head for chaining instead
2005 */
2006struct cg_list_entry {
2007 struct css_set *cg;
2008 struct list_head links;
2009};
2010
2011static bool css_set_check_fetched(struct cgroup *cgrp,
2012 struct task_struct *tsk, struct css_set *cg,
2013 struct list_head *newcg_list)
2014{
2015 struct css_set *newcg;
2016 struct cg_list_entry *cg_entry;
2017 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
2018
2019 read_lock(&css_set_lock);
2020 newcg = find_existing_css_set(cg, cgrp, template);
2021 read_unlock(&css_set_lock);
2022
2023 /* doesn't exist at all? */
2024 if (!newcg)
2025 return false;
2026 /* see if it's already in the list */
2027 list_for_each_entry(cg_entry, newcg_list, links)
2028 if (cg_entry->cg == newcg)
2029 return true;
2030
2031 /* not found */
2032 return false;
2033}
2034
2035/*
2036 * Find the new css_set and store it in the list in preparation for moving the
2037 * given task to the given cgroup. Returns 0 or -ENOMEM.
2038 */
2039static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
2040 struct list_head *newcg_list)
2041{
2042 struct css_set *newcg;
2043 struct cg_list_entry *cg_entry;
2044
2045 /* ensure a new css_set will exist for this thread */
2046 newcg = find_css_set(cg, cgrp);
2047 if (!newcg)
2048 return -ENOMEM;
2049 /* add it to the list */
2050 cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
2051 if (!cg_entry) {
2052 put_css_set(newcg);
2053 return -ENOMEM;
2054 }
2055 cg_entry->cg = newcg;
2056 list_add(&cg_entry->links, newcg_list);
2057 return 0;
2058}
2059
2060/** 1983/**
2061 * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup 1984 * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
2062 * @cgrp: the cgroup to attach to 1985 * @cgrp: the cgroup to attach to
@@ -2070,20 +1993,12 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2070 int retval, i, group_size; 1993 int retval, i, group_size;
2071 struct cgroup_subsys *ss, *failed_ss = NULL; 1994 struct cgroup_subsys *ss, *failed_ss = NULL;
2072 /* guaranteed to be initialized later, but the compiler needs this */ 1995 /* guaranteed to be initialized later, but the compiler needs this */
2073 struct css_set *oldcg;
2074 struct cgroupfs_root *root = cgrp->root; 1996 struct cgroupfs_root *root = cgrp->root;
2075 /* threadgroup list cursor and array */ 1997 /* threadgroup list cursor and array */
2076 struct task_struct *tsk; 1998 struct task_struct *tsk;
2077 struct task_and_cgroup *tc; 1999 struct task_and_cgroup *tc;
2078 struct flex_array *group; 2000 struct flex_array *group;
2079 struct cgroup_taskset tset = { }; 2001 struct cgroup_taskset tset = { };
2080 /*
2081 * we need to make sure we have css_sets for all the tasks we're
2082 * going to move -before- we actually start moving them, so that in
2083 * case we get an ENOMEM we can bail out before making any changes.
2084 */
2085 struct list_head newcg_list;
2086 struct cg_list_entry *cg_entry, *temp_nobe;
2087 2002
2088 /* 2003 /*
2089 * step 0: in order to do expensive, possibly blocking operations for 2004 * step 0: in order to do expensive, possibly blocking operations for
@@ -2102,23 +2017,14 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2102 if (retval) 2017 if (retval)
2103 goto out_free_group_list; 2018 goto out_free_group_list;
2104 2019
2105 /* prevent changes to the threadgroup list while we take a snapshot. */
2106 read_lock(&tasklist_lock);
2107 if (!thread_group_leader(leader)) {
2108 /*
2109 * a race with de_thread from another thread's exec() may strip
2110 * us of our leadership, making while_each_thread unsafe to use
2111 * on this task. if this happens, there is no choice but to
2112 * throw this task away and try again (from cgroup_procs_write);
2113 * this is "double-double-toil-and-trouble-check locking".
2114 */
2115 read_unlock(&tasklist_lock);
2116 retval = -EAGAIN;
2117 goto out_free_group_list;
2118 }
2119
2120 tsk = leader; 2020 tsk = leader;
2121 i = 0; 2021 i = 0;
2022 /*
2023 * Prevent freeing of tasks while we take a snapshot. Tasks that are
2024 * already PF_EXITING could be freed from underneath us unless we
2025 * take an rcu_read_lock.
2026 */
2027 rcu_read_lock();
2122 do { 2028 do {
2123 struct task_and_cgroup ent; 2029 struct task_and_cgroup ent;
2124 2030
@@ -2128,24 +2034,24 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2128 2034
2129 /* as per above, nr_threads may decrease, but not increase. */ 2035 /* as per above, nr_threads may decrease, but not increase. */
2130 BUG_ON(i >= group_size); 2036 BUG_ON(i >= group_size);
2131 /*
2132 * saying GFP_ATOMIC has no effect here because we did prealloc
2133 * earlier, but it's good form to communicate our expectations.
2134 */
2135 ent.task = tsk; 2037 ent.task = tsk;
2136 ent.cgrp = task_cgroup_from_root(tsk, root); 2038 ent.cgrp = task_cgroup_from_root(tsk, root);
2137 /* nothing to do if this task is already in the cgroup */ 2039 /* nothing to do if this task is already in the cgroup */
2138 if (ent.cgrp == cgrp) 2040 if (ent.cgrp == cgrp)
2139 continue; 2041 continue;
2042 /*
2043 * saying GFP_ATOMIC has no effect here because we did prealloc
2044 * earlier, but it's good form to communicate our expectations.
2045 */
2140 retval = flex_array_put(group, i, &ent, GFP_ATOMIC); 2046 retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
2141 BUG_ON(retval != 0); 2047 BUG_ON(retval != 0);
2142 i++; 2048 i++;
2143 } while_each_thread(leader, tsk); 2049 } while_each_thread(leader, tsk);
2050 rcu_read_unlock();
2144 /* remember the number of threads in the array for later. */ 2051 /* remember the number of threads in the array for later. */
2145 group_size = i; 2052 group_size = i;
2146 tset.tc_array = group; 2053 tset.tc_array = group;
2147 tset.tc_array_len = group_size; 2054 tset.tc_array_len = group_size;
2148 read_unlock(&tasklist_lock);
2149 2055
2150 /* methods shouldn't be called if no task is actually migrating */ 2056 /* methods shouldn't be called if no task is actually migrating */
2151 retval = 0; 2057 retval = 0;
@@ -2157,7 +2063,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2157 */ 2063 */
2158 for_each_subsys(root, ss) { 2064 for_each_subsys(root, ss) {
2159 if (ss->can_attach) { 2065 if (ss->can_attach) {
2160 retval = ss->can_attach(ss, cgrp, &tset); 2066 retval = ss->can_attach(cgrp, &tset);
2161 if (retval) { 2067 if (retval) {
2162 failed_ss = ss; 2068 failed_ss = ss;
2163 goto out_cancel_attach; 2069 goto out_cancel_attach;
@@ -2169,17 +2075,12 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2169 * step 2: make sure css_sets exist for all threads to be migrated. 2075 * step 2: make sure css_sets exist for all threads to be migrated.
2170 * we use find_css_set, which allocates a new one if necessary. 2076 * we use find_css_set, which allocates a new one if necessary.
2171 */ 2077 */
2172 INIT_LIST_HEAD(&newcg_list);
2173 for (i = 0; i < group_size; i++) { 2078 for (i = 0; i < group_size; i++) {
2174 tc = flex_array_get(group, i); 2079 tc = flex_array_get(group, i);
2175 oldcg = tc->task->cgroups; 2080 tc->cg = find_css_set(tc->task->cgroups, cgrp);
2176 2081 if (!tc->cg) {
2177 /* if we don't already have it in the list get a new one */ 2082 retval = -ENOMEM;
2178 if (!css_set_check_fetched(cgrp, tc->task, oldcg, 2083 goto out_put_css_set_refs;
2179 &newcg_list)) {
2180 retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
2181 if (retval)
2182 goto out_list_teardown;
2183 } 2084 }
2184 } 2085 }
2185 2086
@@ -2190,8 +2091,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2190 */ 2091 */
2191 for (i = 0; i < group_size; i++) { 2092 for (i = 0; i < group_size; i++) {
2192 tc = flex_array_get(group, i); 2093 tc = flex_array_get(group, i);
2193 retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true); 2094 cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg);
2194 BUG_ON(retval);
2195 } 2095 }
2196 /* nothing is sensitive to fork() after this point. */ 2096 /* nothing is sensitive to fork() after this point. */
2197 2097
@@ -2200,7 +2100,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2200 */ 2100 */
2201 for_each_subsys(root, ss) { 2101 for_each_subsys(root, ss) {
2202 if (ss->attach) 2102 if (ss->attach)
2203 ss->attach(ss, cgrp, &tset); 2103 ss->attach(cgrp, &tset);
2204 } 2104 }
2205 2105
2206 /* 2106 /*
@@ -2209,21 +2109,22 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2209 synchronize_rcu(); 2109 synchronize_rcu();
2210 cgroup_wakeup_rmdir_waiter(cgrp); 2110 cgroup_wakeup_rmdir_waiter(cgrp);
2211 retval = 0; 2111 retval = 0;
2212out_list_teardown: 2112out_put_css_set_refs:
2213 /* clean up the list of prefetched css_sets. */ 2113 if (retval) {
2214 list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) { 2114 for (i = 0; i < group_size; i++) {
2215 list_del(&cg_entry->links); 2115 tc = flex_array_get(group, i);
2216 put_css_set(cg_entry->cg); 2116 if (!tc->cg)
2217 kfree(cg_entry); 2117 break;
2118 put_css_set(tc->cg);
2119 }
2218 } 2120 }
2219out_cancel_attach: 2121out_cancel_attach:
2220 /* same deal as in cgroup_attach_task */
2221 if (retval) { 2122 if (retval) {
2222 for_each_subsys(root, ss) { 2123 for_each_subsys(root, ss) {
2223 if (ss == failed_ss) 2124 if (ss == failed_ss)
2224 break; 2125 break;
2225 if (ss->cancel_attach) 2126 if (ss->cancel_attach)
2226 ss->cancel_attach(ss, cgrp, &tset); 2127 ss->cancel_attach(cgrp, &tset);
2227 } 2128 }
2228 } 2129 }
2229out_free_group_list: 2130out_free_group_list:
@@ -2245,22 +2146,14 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2245 if (!cgroup_lock_live_group(cgrp)) 2146 if (!cgroup_lock_live_group(cgrp))
2246 return -ENODEV; 2147 return -ENODEV;
2247 2148
2149retry_find_task:
2150 rcu_read_lock();
2248 if (pid) { 2151 if (pid) {
2249 rcu_read_lock();
2250 tsk = find_task_by_vpid(pid); 2152 tsk = find_task_by_vpid(pid);
2251 if (!tsk) { 2153 if (!tsk) {
2252 rcu_read_unlock(); 2154 rcu_read_unlock();
2253 cgroup_unlock(); 2155 ret= -ESRCH;
2254 return -ESRCH; 2156 goto out_unlock_cgroup;
2255 }
2256 if (threadgroup) {
2257 /*
2258 * RCU protects this access, since tsk was found in the
2259 * tid map. a race with de_thread may cause group_leader
2260 * to stop being the leader, but cgroup_attach_proc will
2261 * detect it later.
2262 */
2263 tsk = tsk->group_leader;
2264 } 2157 }
2265 /* 2158 /*
2266 * even if we're attaching all tasks in the thread group, we 2159 * even if we're attaching all tasks in the thread group, we
@@ -2271,29 +2164,38 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2271 cred->euid != tcred->uid && 2164 cred->euid != tcred->uid &&
2272 cred->euid != tcred->suid) { 2165 cred->euid != tcred->suid) {
2273 rcu_read_unlock(); 2166 rcu_read_unlock();
2274 cgroup_unlock(); 2167 ret = -EACCES;
2275 return -EACCES; 2168 goto out_unlock_cgroup;
2276 } 2169 }
2277 get_task_struct(tsk); 2170 } else
2278 rcu_read_unlock(); 2171 tsk = current;
2279 } else {
2280 if (threadgroup)
2281 tsk = current->group_leader;
2282 else
2283 tsk = current;
2284 get_task_struct(tsk);
2285 }
2286
2287 threadgroup_lock(tsk);
2288 2172
2289 if (threadgroup) 2173 if (threadgroup)
2174 tsk = tsk->group_leader;
2175 get_task_struct(tsk);
2176 rcu_read_unlock();
2177
2178 threadgroup_lock(tsk);
2179 if (threadgroup) {
2180 if (!thread_group_leader(tsk)) {
2181 /*
2182 * a race with de_thread from another thread's exec()
2183 * may strip us of our leadership, if this happens,
2184 * there is no choice but to throw this task away and
2185 * try again; this is
2186 * "double-double-toil-and-trouble-check locking".
2187 */
2188 threadgroup_unlock(tsk);
2189 put_task_struct(tsk);
2190 goto retry_find_task;
2191 }
2290 ret = cgroup_attach_proc(cgrp, tsk); 2192 ret = cgroup_attach_proc(cgrp, tsk);
2291 else 2193 } else
2292 ret = cgroup_attach_task(cgrp, tsk); 2194 ret = cgroup_attach_task(cgrp, tsk);
2293
2294 threadgroup_unlock(tsk); 2195 threadgroup_unlock(tsk);
2295 2196
2296 put_task_struct(tsk); 2197 put_task_struct(tsk);
2198out_unlock_cgroup:
2297 cgroup_unlock(); 2199 cgroup_unlock();
2298 return ret; 2200 return ret;
2299} 2201}
@@ -2305,16 +2207,7 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
2305 2207
2306static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) 2208static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
2307{ 2209{
2308 int ret; 2210 return attach_task_by_pid(cgrp, tgid, true);
2309 do {
2310 /*
2311 * attach_proc fails with -EAGAIN if threadgroup leadership
2312 * changes in the middle of the operation, in which case we need
2313 * to find the task_struct for the new leader and start over.
2314 */
2315 ret = attach_task_by_pid(cgrp, tgid, true);
2316 } while (ret == -EAGAIN);
2317 return ret;
2318} 2211}
2319 2212
2320/** 2213/**
@@ -2804,15 +2697,20 @@ static void cgroup_advance_iter(struct cgroup *cgrp,
2804 * using their cgroups capability, we don't maintain the lists running 2697 * using their cgroups capability, we don't maintain the lists running
2805 * through each css_set to its tasks until we see the list actually 2698 * through each css_set to its tasks until we see the list actually
2806 * used - in other words after the first call to cgroup_iter_start(). 2699 * used - in other words after the first call to cgroup_iter_start().
2807 *
2808 * The tasklist_lock is not held here, as do_each_thread() and
2809 * while_each_thread() are protected by RCU.
2810 */ 2700 */
2811static void cgroup_enable_task_cg_lists(void) 2701static void cgroup_enable_task_cg_lists(void)
2812{ 2702{
2813 struct task_struct *p, *g; 2703 struct task_struct *p, *g;
2814 write_lock(&css_set_lock); 2704 write_lock(&css_set_lock);
2815 use_task_css_set_links = 1; 2705 use_task_css_set_links = 1;
2706 /*
2707 * We need tasklist_lock because RCU is not safe against
2708 * while_each_thread(). Besides, a forking task that has passed
2709 * cgroup_post_fork() without seeing use_task_css_set_links = 1
2710 * is not guaranteed to have its child immediately visible in the
2711 * tasklist if we walk through it with RCU.
2712 */
2713 read_lock(&tasklist_lock);
2816 do_each_thread(g, p) { 2714 do_each_thread(g, p) {
2817 task_lock(p); 2715 task_lock(p);
2818 /* 2716 /*
@@ -2824,6 +2722,7 @@ static void cgroup_enable_task_cg_lists(void)
2824 list_add(&p->cg_list, &p->cgroups->tasks); 2722 list_add(&p->cg_list, &p->cgroups->tasks);
2825 task_unlock(p); 2723 task_unlock(p);
2826 } while_each_thread(g, p); 2724 } while_each_thread(g, p);
2725 read_unlock(&tasklist_lock);
2827 write_unlock(&css_set_lock); 2726 write_unlock(&css_set_lock);
2828} 2727}
2829 2728
@@ -3043,6 +2942,38 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3043 * 2942 *
3044 */ 2943 */
3045 2944
2945/* which pidlist file are we talking about? */
2946enum cgroup_filetype {
2947 CGROUP_FILE_PROCS,
2948 CGROUP_FILE_TASKS,
2949};
2950
2951/*
2952 * A pidlist is a list of pids that virtually represents the contents of one
2953 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
2954 * a pair (one each for procs, tasks) for each pid namespace that's relevant
2955 * to the cgroup.
2956 */
2957struct cgroup_pidlist {
2958 /*
2959 * used to find which pidlist is wanted. doesn't change as long as
2960 * this particular list stays in the list.
2961 */
2962 struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
2963 /* array of xids */
2964 pid_t *list;
2965 /* how many elements the above list has */
2966 int length;
2967 /* how many files are using the current array */
2968 int use_count;
2969 /* each of these stored in a list by its cgroup */
2970 struct list_head links;
2971 /* pointer to the cgroup we belong to, for list removal purposes */
2972 struct cgroup *owner;
2973 /* protects the other fields */
2974 struct rw_semaphore mutex;
2975};
2976
3046/* 2977/*
3047 * The following two functions "fix" the issue where there are more pids 2978 * The following two functions "fix" the issue where there are more pids
3048 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. 2979 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
@@ -3827,7 +3758,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3827 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 3758 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3828 3759
3829 for_each_subsys(root, ss) { 3760 for_each_subsys(root, ss) {
3830 struct cgroup_subsys_state *css = ss->create(ss, cgrp); 3761 struct cgroup_subsys_state *css = ss->create(cgrp);
3831 3762
3832 if (IS_ERR(css)) { 3763 if (IS_ERR(css)) {
3833 err = PTR_ERR(css); 3764 err = PTR_ERR(css);
@@ -3841,7 +3772,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3841 } 3772 }
3842 /* At error, ->destroy() callback has to free assigned ID. */ 3773 /* At error, ->destroy() callback has to free assigned ID. */
3843 if (clone_children(parent) && ss->post_clone) 3774 if (clone_children(parent) && ss->post_clone)
3844 ss->post_clone(ss, cgrp); 3775 ss->post_clone(cgrp);
3845 } 3776 }
3846 3777
3847 cgroup_lock_hierarchy(root); 3778 cgroup_lock_hierarchy(root);
@@ -3875,7 +3806,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3875 3806
3876 for_each_subsys(root, ss) { 3807 for_each_subsys(root, ss) {
3877 if (cgrp->subsys[ss->subsys_id]) 3808 if (cgrp->subsys[ss->subsys_id])
3878 ss->destroy(ss, cgrp); 3809 ss->destroy(cgrp);
3879 } 3810 }
3880 3811
3881 mutex_unlock(&cgroup_mutex); 3812 mutex_unlock(&cgroup_mutex);
@@ -4099,7 +4030,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4099 /* Create the top cgroup state for this subsystem */ 4030 /* Create the top cgroup state for this subsystem */
4100 list_add(&ss->sibling, &rootnode.subsys_list); 4031 list_add(&ss->sibling, &rootnode.subsys_list);
4101 ss->root = &rootnode; 4032 ss->root = &rootnode;
4102 css = ss->create(ss, dummytop); 4033 css = ss->create(dummytop);
4103 /* We don't handle early failures gracefully */ 4034 /* We don't handle early failures gracefully */
4104 BUG_ON(IS_ERR(css)); 4035 BUG_ON(IS_ERR(css));
4105 init_cgroup_css(css, ss, dummytop); 4036 init_cgroup_css(css, ss, dummytop);
@@ -4188,7 +4119,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4188 * no ss->create seems to need anything important in the ss struct, so 4119 * no ss->create seems to need anything important in the ss struct, so
4189 * this can happen first (i.e. before the rootnode attachment). 4120 * this can happen first (i.e. before the rootnode attachment).
4190 */ 4121 */
4191 css = ss->create(ss, dummytop); 4122 css = ss->create(dummytop);
4192 if (IS_ERR(css)) { 4123 if (IS_ERR(css)) {
4193 /* failure case - need to deassign the subsys[] slot. */ 4124 /* failure case - need to deassign the subsys[] slot. */
4194 subsys[i] = NULL; 4125 subsys[i] = NULL;
@@ -4206,7 +4137,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4206 int ret = cgroup_init_idr(ss, css); 4137 int ret = cgroup_init_idr(ss, css);
4207 if (ret) { 4138 if (ret) {
4208 dummytop->subsys[ss->subsys_id] = NULL; 4139 dummytop->subsys[ss->subsys_id] = NULL;
4209 ss->destroy(ss, dummytop); 4140 ss->destroy(dummytop);
4210 subsys[i] = NULL; 4141 subsys[i] = NULL;
4211 mutex_unlock(&cgroup_mutex); 4142 mutex_unlock(&cgroup_mutex);
4212 return ret; 4143 return ret;
@@ -4304,7 +4235,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4304 * pointer to find their state. note that this also takes care of 4235 * pointer to find their state. note that this also takes care of
4305 * freeing the css_id. 4236 * freeing the css_id.
4306 */ 4237 */
4307 ss->destroy(ss, dummytop); 4238 ss->destroy(dummytop);
4308 dummytop->subsys[ss->subsys_id] = NULL; 4239 dummytop->subsys[ss->subsys_id] = NULL;
4309 4240
4310 mutex_unlock(&cgroup_mutex); 4241 mutex_unlock(&cgroup_mutex);
@@ -4580,7 +4511,7 @@ void cgroup_fork_callbacks(struct task_struct *child)
4580 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { 4511 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4581 struct cgroup_subsys *ss = subsys[i]; 4512 struct cgroup_subsys *ss = subsys[i];
4582 if (ss->fork) 4513 if (ss->fork)
4583 ss->fork(ss, child); 4514 ss->fork(child);
4584 } 4515 }
4585 } 4516 }
4586} 4517}
@@ -4596,6 +4527,17 @@ void cgroup_fork_callbacks(struct task_struct *child)
4596 */ 4527 */
4597void cgroup_post_fork(struct task_struct *child) 4528void cgroup_post_fork(struct task_struct *child)
4598{ 4529{
4530 /*
4531 * use_task_css_set_links is set to 1 before we walk the tasklist
4532 * under the tasklist_lock and we read it here after we added the child
4533 * to the tasklist under the tasklist_lock as well. If the child wasn't
4534 * yet in the tasklist when we walked through it from
4535 * cgroup_enable_task_cg_lists(), then use_task_css_set_links value
4536 * should be visible now due to the paired locking and barriers implied
4537 * by LOCK/UNLOCK: it is written before the tasklist_lock unlock
4538 * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock
4539 * lock on fork.
4540 */
4599 if (use_task_css_set_links) { 4541 if (use_task_css_set_links) {
4600 write_lock(&css_set_lock); 4542 write_lock(&css_set_lock);
4601 if (list_empty(&child->cg_list)) { 4543 if (list_empty(&child->cg_list)) {
@@ -4682,7 +4624,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4682 struct cgroup *old_cgrp = 4624 struct cgroup *old_cgrp =
4683 rcu_dereference_raw(cg->subsys[i])->cgroup; 4625 rcu_dereference_raw(cg->subsys[i])->cgroup;
4684 struct cgroup *cgrp = task_cgroup(tsk, i); 4626 struct cgroup *cgrp = task_cgroup(tsk, i);
4685 ss->exit(ss, cgrp, old_cgrp, tsk); 4627 ss->exit(cgrp, old_cgrp, tsk);
4686 } 4628 }
4687 } 4629 }
4688 } 4630 }
@@ -4939,9 +4881,9 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
4939 4881
4940 rcu_assign_pointer(id->css, NULL); 4882 rcu_assign_pointer(id->css, NULL);
4941 rcu_assign_pointer(css->id, NULL); 4883 rcu_assign_pointer(css->id, NULL);
4942 write_lock(&ss->id_lock); 4884 spin_lock(&ss->id_lock);
4943 idr_remove(&ss->idr, id->id); 4885 idr_remove(&ss->idr, id->id);
4944 write_unlock(&ss->id_lock); 4886 spin_unlock(&ss->id_lock);
4945 kfree_rcu(id, rcu_head); 4887 kfree_rcu(id, rcu_head);
4946} 4888}
4947EXPORT_SYMBOL_GPL(free_css_id); 4889EXPORT_SYMBOL_GPL(free_css_id);
@@ -4967,10 +4909,10 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
4967 error = -ENOMEM; 4909 error = -ENOMEM;
4968 goto err_out; 4910 goto err_out;
4969 } 4911 }
4970 write_lock(&ss->id_lock); 4912 spin_lock(&ss->id_lock);
4971 /* Don't use 0. allocates an ID of 1-65535 */ 4913 /* Don't use 0. allocates an ID of 1-65535 */
4972 error = idr_get_new_above(&ss->idr, newid, 1, &myid); 4914 error = idr_get_new_above(&ss->idr, newid, 1, &myid);
4973 write_unlock(&ss->id_lock); 4915 spin_unlock(&ss->id_lock);
4974 4916
4975 /* Returns error when there are no free spaces for new ID.*/ 4917 /* Returns error when there are no free spaces for new ID.*/
4976 if (error) { 4918 if (error) {
@@ -4985,9 +4927,9 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
4985 return newid; 4927 return newid;
4986remove_idr: 4928remove_idr:
4987 error = -ENOSPC; 4929 error = -ENOSPC;
4988 write_lock(&ss->id_lock); 4930 spin_lock(&ss->id_lock);
4989 idr_remove(&ss->idr, myid); 4931 idr_remove(&ss->idr, myid);
4990 write_unlock(&ss->id_lock); 4932 spin_unlock(&ss->id_lock);
4991err_out: 4933err_out:
4992 kfree(newid); 4934 kfree(newid);
4993 return ERR_PTR(error); 4935 return ERR_PTR(error);
@@ -4999,7 +4941,7 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
4999{ 4941{
5000 struct css_id *newid; 4942 struct css_id *newid;
5001 4943
5002 rwlock_init(&ss->id_lock); 4944 spin_lock_init(&ss->id_lock);
5003 idr_init(&ss->idr); 4945 idr_init(&ss->idr);
5004 4946
5005 newid = get_new_cssid(ss, 0); 4947 newid = get_new_cssid(ss, 0);
@@ -5087,6 +5029,8 @@ css_get_next(struct cgroup_subsys *ss, int id,
5087 return NULL; 5029 return NULL;
5088 5030
5089 BUG_ON(!ss->use_id); 5031 BUG_ON(!ss->use_id);
5032 WARN_ON_ONCE(!rcu_read_lock_held());
5033
5090 /* fill start point for scan */ 5034 /* fill start point for scan */
5091 tmpid = id; 5035 tmpid = id;
5092 while (1) { 5036 while (1) {
@@ -5094,10 +5038,7 @@ css_get_next(struct cgroup_subsys *ss, int id,
5094 * scan next entry from bitmap(tree), tmpid is updated after 5038 * scan next entry from bitmap(tree), tmpid is updated after
5095 * idr_get_next(). 5039 * idr_get_next().
5096 */ 5040 */
5097 read_lock(&ss->id_lock);
5098 tmp = idr_get_next(&ss->idr, &tmpid); 5041 tmp = idr_get_next(&ss->idr, &tmpid);
5099 read_unlock(&ss->id_lock);
5100
5101 if (!tmp) 5042 if (!tmp)
5102 break; 5043 break;
5103 if (tmp->depth >= depth && tmp->stack[depth] == rootid) { 5044 if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
@@ -5137,8 +5078,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5137} 5078}
5138 5079
5139#ifdef CONFIG_CGROUP_DEBUG 5080#ifdef CONFIG_CGROUP_DEBUG
5140static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, 5081static struct cgroup_subsys_state *debug_create(struct cgroup *cont)
5141 struct cgroup *cont)
5142{ 5082{
5143 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 5083 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5144 5084
@@ -5148,7 +5088,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
5148 return css; 5088 return css;
5149} 5089}
5150 5090
5151static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) 5091static void debug_destroy(struct cgroup *cont)
5152{ 5092{
5153 kfree(cont->subsys[debug_subsys_id]); 5093 kfree(cont->subsys[debug_subsys_id]);
5154} 5094}
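Most of the cgroup.c churn above (and the cgroup_freezer.c and cpuset.c hunks that follow) is mechanical: every subsystem callback drops its struct cgroup_subsys * argument. For reference, a minimal subsystem written against the new signatures looks roughly like the sketch below, modeled on the debug_create()/debug_destroy() pair at the end of this hunk; the "example" name is made up, and example_subsys_id is assumed to come from the usual cgroup_subsys.h enumeration.

/*
 * Minimal cgroup subsystem sketch using the post-patch callback
 * signatures (no struct cgroup_subsys * parameter). Names are
 * illustrative; example_subsys_id is assumed to be generated from
 * include/linux/cgroup_subsys.h as for real subsystems.
 */
static struct cgroup_subsys_state *example_create(struct cgroup *cgrp)
{
	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);

	if (!css)
		return ERR_PTR(-ENOMEM);
	return css;
}

static void example_destroy(struct cgroup *cgrp)
{
	/* the core installed css into cgrp->subsys[] after ->create() */
	kfree(cgrp->subsys[example_subsys_id]);
}

struct cgroup_subsys example_subsys = {
	.name		= "example",
	.create		= example_create,
	.destroy	= example_destroy,
	.subsys_id	= example_subsys_id,
};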
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index fc0646b78a64..f86e93920b62 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -128,8 +128,7 @@ struct cgroup_subsys freezer_subsys;
128 * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator()) 128 * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator())
129 * sighand->siglock 129 * sighand->siglock
130 */ 130 */
131static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, 131static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup)
132 struct cgroup *cgroup)
133{ 132{
134 struct freezer *freezer; 133 struct freezer *freezer;
135 134
@@ -142,8 +141,7 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
142 return &freezer->css; 141 return &freezer->css;
143} 142}
144 143
145static void freezer_destroy(struct cgroup_subsys *ss, 144static void freezer_destroy(struct cgroup *cgroup)
146 struct cgroup *cgroup)
147{ 145{
148 struct freezer *freezer = cgroup_freezer(cgroup); 146 struct freezer *freezer = cgroup_freezer(cgroup);
149 147
@@ -164,8 +162,7 @@ static bool is_task_frozen_enough(struct task_struct *task)
164 * a write to that file racing against an attach, and hence the 162 * a write to that file racing against an attach, and hence the
165 * can_attach() result will remain valid until the attach completes. 163 * can_attach() result will remain valid until the attach completes.
166 */ 164 */
167static int freezer_can_attach(struct cgroup_subsys *ss, 165static int freezer_can_attach(struct cgroup *new_cgroup,
168 struct cgroup *new_cgroup,
169 struct cgroup_taskset *tset) 166 struct cgroup_taskset *tset)
170{ 167{
171 struct freezer *freezer; 168 struct freezer *freezer;
@@ -185,7 +182,7 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
185 return 0; 182 return 0;
186} 183}
187 184
188static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) 185static void freezer_fork(struct task_struct *task)
189{ 186{
190 struct freezer *freezer; 187 struct freezer *freezer;
191 188
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4ef4d7ecb9fb..b96ad75b7e64 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -964,7 +964,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
964{ 964{
965 bool need_loop; 965 bool need_loop;
966 966
967repeat:
968 /* 967 /*
969 * Allow tasks that have access to memory reserves because they have 968 * Allow tasks that have access to memory reserves because they have
970 * been OOM killed to get memory anywhere. 969 * been OOM killed to get memory anywhere.
@@ -983,45 +982,19 @@ repeat:
983 */ 982 */
984 need_loop = task_has_mempolicy(tsk) || 983 need_loop = task_has_mempolicy(tsk) ||
985 !nodes_intersects(*newmems, tsk->mems_allowed); 984 !nodes_intersects(*newmems, tsk->mems_allowed);
986 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
987 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
988 985
989 /* 986 if (need_loop)
990 * ensure checking ->mems_allowed_change_disable after setting all new 987 write_seqcount_begin(&tsk->mems_allowed_seq);
991 * allowed nodes.
992 *
993 * the read-side task can see an nodemask with new allowed nodes and
994 * old allowed nodes. and if it allocates page when cpuset clears newly
995 * disallowed ones continuous, it can see the new allowed bits.
996 *
997 * And if setting all new allowed nodes is after the checking, setting
998 * all new allowed nodes and clearing newly disallowed ones will be done
999 * continuous, and the read-side task may find no node to alloc page.
1000 */
1001 smp_mb();
1002 988
1003 /* 989 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
1004 * Allocation of memory is very fast, we needn't sleep when waiting 990 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
1005 * for the read-side.
1006 */
1007 while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
1008 task_unlock(tsk);
1009 if (!task_curr(tsk))
1010 yield();
1011 goto repeat;
1012 }
1013
1014 /*
1015 * ensure checking ->mems_allowed_change_disable before clearing all new
1016 * disallowed nodes.
1017 *
1018 * if clearing newly disallowed bits before the checking, the read-side
1019 * task may find no node to alloc page.
1020 */
1021 smp_mb();
1022 991
1023 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); 992 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
1024 tsk->mems_allowed = *newmems; 993 tsk->mems_allowed = *newmems;
994
995 if (need_loop)
996 write_seqcount_end(&tsk->mems_allowed_seq);
997
1025 task_unlock(tsk); 998 task_unlock(tsk);
1026} 999}
1027 1000
@@ -1399,8 +1372,7 @@ static nodemask_t cpuset_attach_nodemask_from;
1399static nodemask_t cpuset_attach_nodemask_to; 1372static nodemask_t cpuset_attach_nodemask_to;
1400 1373
1401/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ 1374/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1402static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 1375static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1403 struct cgroup_taskset *tset)
1404{ 1376{
1405 struct cpuset *cs = cgroup_cs(cgrp); 1377 struct cpuset *cs = cgroup_cs(cgrp);
1406 struct task_struct *task; 1378 struct task_struct *task;
@@ -1436,8 +1408,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1436 return 0; 1408 return 0;
1437} 1409}
1438 1410
1439static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 1411static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1440 struct cgroup_taskset *tset)
1441{ 1412{
1442 struct mm_struct *mm; 1413 struct mm_struct *mm;
1443 struct task_struct *task; 1414 struct task_struct *task;
@@ -1833,8 +1804,7 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1833 * (and likewise for mems) to the new cgroup. Called with cgroup_mutex 1804 * (and likewise for mems) to the new cgroup. Called with cgroup_mutex
1834 * held. 1805 * held.
1835 */ 1806 */
1836static void cpuset_post_clone(struct cgroup_subsys *ss, 1807static void cpuset_post_clone(struct cgroup *cgroup)
1837 struct cgroup *cgroup)
1838{ 1808{
1839 struct cgroup *parent, *child; 1809 struct cgroup *parent, *child;
1840 struct cpuset *cs, *parent_cs; 1810 struct cpuset *cs, *parent_cs;
@@ -1857,13 +1827,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
1857 1827
1858/* 1828/*
1859 * cpuset_create - create a cpuset 1829 * cpuset_create - create a cpuset
1860 * ss: cpuset cgroup subsystem
1861 * cont: control group that the new cpuset will be part of 1830 * cont: control group that the new cpuset will be part of
1862 */ 1831 */
1863 1832
1864static struct cgroup_subsys_state *cpuset_create( 1833static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont)
1865 struct cgroup_subsys *ss,
1866 struct cgroup *cont)
1867{ 1834{
1868 struct cpuset *cs; 1835 struct cpuset *cs;
1869 struct cpuset *parent; 1836 struct cpuset *parent;
@@ -1902,7 +1869,7 @@ static struct cgroup_subsys_state *cpuset_create(
1902 * will call async_rebuild_sched_domains(). 1869 * will call async_rebuild_sched_domains().
1903 */ 1870 */
1904 1871
1905static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) 1872static void cpuset_destroy(struct cgroup *cont)
1906{ 1873{
1907 struct cpuset *cs = cgroup_cs(cont); 1874 struct cpuset *cs = cgroup_cs(cont);
1908 1875
diff --git a/kernel/cred.c b/kernel/cred.c
index 5791612a4045..97b36eeca4c9 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -16,6 +16,7 @@
16#include <linux/keyctl.h> 16#include <linux/keyctl.h>
17#include <linux/init_task.h> 17#include <linux/init_task.h>
18#include <linux/security.h> 18#include <linux/security.h>
19#include <linux/binfmts.h>
19#include <linux/cn_proc.h> 20#include <linux/cn_proc.h>
20 21
21#if 0 22#if 0
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0d7c08784efb..1dc53bae56e1 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -41,6 +41,7 @@
41#include <linux/delay.h> 41#include <linux/delay.h>
42#include <linux/sched.h> 42#include <linux/sched.h>
43#include <linux/sysrq.h> 43#include <linux/sysrq.h>
44#include <linux/reboot.h>
44#include <linux/init.h> 45#include <linux/init.h>
45#include <linux/kgdb.h> 46#include <linux/kgdb.h>
46#include <linux/kdb.h> 47#include <linux/kdb.h>
@@ -52,7 +53,6 @@
52#include <asm/cacheflush.h> 53#include <asm/cacheflush.h>
53#include <asm/byteorder.h> 54#include <asm/byteorder.h>
54#include <linux/atomic.h> 55#include <linux/atomic.h>
55#include <asm/system.h>
56 56
57#include "debug_core.h" 57#include "debug_core.h"
58 58
@@ -75,6 +75,8 @@ static int exception_level;
75struct kgdb_io *dbg_io_ops; 75struct kgdb_io *dbg_io_ops;
76static DEFINE_SPINLOCK(kgdb_registration_lock); 76static DEFINE_SPINLOCK(kgdb_registration_lock);
77 77
78/* Action for the reboot notifiter, a global allow kdb to change it */
79static int kgdbreboot;
78/* kgdb console driver is loaded */ 80/* kgdb console driver is loaded */
79static int kgdb_con_registered; 81static int kgdb_con_registered;
80/* determine if kgdb console output should be used */ 82/* determine if kgdb console output should be used */
@@ -96,6 +98,7 @@ static int __init opt_kgdb_con(char *str)
96early_param("kgdbcon", opt_kgdb_con); 98early_param("kgdbcon", opt_kgdb_con);
97 99
98module_param(kgdb_use_con, int, 0644); 100module_param(kgdb_use_con, int, 0644);
101module_param(kgdbreboot, int, 0644);
99 102
100/* 103/*
101 * Holds information about breakpoints in a kernel. These breakpoints are 104 * Holds information about breakpoints in a kernel. These breakpoints are
@@ -784,6 +787,33 @@ void __init dbg_late_init(void)
784 kdb_init(KDB_INIT_FULL); 787 kdb_init(KDB_INIT_FULL);
785} 788}
786 789
790static int
791dbg_notify_reboot(struct notifier_block *this, unsigned long code, void *x)
792{
793 /*
794 * Take the following action on reboot notify depending on value:
795 * 1 == Enter debugger
796 * 0 == [the default] detatch debug client
797 * -1 == Do nothing... and use this until the board resets
798 */
799 switch (kgdbreboot) {
800 case 1:
801 kgdb_breakpoint();
802 case -1:
803 goto done;
804 }
805 if (!dbg_kdb_mode)
806 gdbstub_exit(code);
807done:
808 return NOTIFY_DONE;
809}
810
811static struct notifier_block dbg_reboot_notifier = {
812 .notifier_call = dbg_notify_reboot,
813 .next = NULL,
814 .priority = INT_MAX,
815};
816
787static void kgdb_register_callbacks(void) 817static void kgdb_register_callbacks(void)
788{ 818{
789 if (!kgdb_io_module_registered) { 819 if (!kgdb_io_module_registered) {
@@ -791,6 +821,7 @@ static void kgdb_register_callbacks(void)
791 kgdb_arch_init(); 821 kgdb_arch_init();
792 if (!dbg_is_early) 822 if (!dbg_is_early)
793 kgdb_arch_late(); 823 kgdb_arch_late();
824 register_reboot_notifier(&dbg_reboot_notifier);
794 atomic_notifier_chain_register(&panic_notifier_list, 825 atomic_notifier_chain_register(&panic_notifier_list,
795 &kgdb_panic_event_nb); 826 &kgdb_panic_event_nb);
796#ifdef CONFIG_MAGIC_SYSRQ 827#ifdef CONFIG_MAGIC_SYSRQ
@@ -812,6 +843,7 @@ static void kgdb_unregister_callbacks(void)
812 */ 843 */
813 if (kgdb_io_module_registered) { 844 if (kgdb_io_module_registered) {
814 kgdb_io_module_registered = 0; 845 kgdb_io_module_registered = 0;
846 unregister_reboot_notifier(&dbg_reboot_notifier);
815 atomic_notifier_chain_unregister(&panic_notifier_list, 847 atomic_notifier_chain_unregister(&panic_notifier_list,
816 &kgdb_panic_event_nb); 848 &kgdb_panic_event_nb);
817 kgdb_arch_exit(); 849 kgdb_arch_exit();
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index c22d8c28ad84..ce615e064482 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -1111,6 +1111,13 @@ void gdbstub_exit(int status)
1111 unsigned char checksum, ch, buffer[3]; 1111 unsigned char checksum, ch, buffer[3];
1112 int loop; 1112 int loop;
1113 1113
1114 if (!kgdb_connected)
1115 return;
1116 kgdb_connected = 0;
1117
1118 if (!dbg_io_ops || dbg_kdb_mode)
1119 return;
1120
1114 buffer[0] = 'W'; 1121 buffer[0] = 'W';
1115 buffer[1] = hex_asc_hi(status); 1122 buffer[1] = hex_asc_hi(status);
1116 buffer[2] = hex_asc_lo(status); 1123 buffer[2] = hex_asc_lo(status);
@@ -1129,5 +1136,6 @@ void gdbstub_exit(int status)
1129 dbg_io_ops->write_char(hex_asc_lo(checksum)); 1136 dbg_io_ops->write_char(hex_asc_lo(checksum));
1130 1137
1131 /* make sure the output is flushed, lest the bootloader clobber it */ 1138 /* make sure the output is flushed, lest the bootloader clobber it */
1132 dbg_io_ops->flush(); 1139 if (dbg_io_ops->flush)
1140 dbg_io_ops->flush();
1133} 1141}
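For context on the checksum loop at the end of gdbstub_exit(): the exit status is sent as a GDB remote-protocol packet of the form $W<xx>#<cs>, where <xx> is the status byte in hex and <cs> is the two-hex-digit modulo-256 sum of the payload. A standalone illustration of that framing (plain userspace C, not kernel code):

/*
 * Standalone illustration of the "$W<status>#<checksum>" framing that
 * gdbstub_exit() emits character by character: payload is 'W' plus the
 * status byte in hex, checksum is the byte sum of the payload mod 256.
 */
#include <stdio.h>

static void format_exit_packet(unsigned char status, char out[8])
{
	static const char hex[] = "0123456789abcdef";
	unsigned char payload[3] = { 'W', hex[status >> 4], hex[status & 0x0f] };
	unsigned char csum = (unsigned char)(payload[0] + payload[1] + payload[2]);

	snprintf(out, 8, "$%c%c%c#%c%c", payload[0], payload[1], payload[2],
		 hex[csum >> 4], hex[csum & 0x0f]);
}

int main(void)
{
	char pkt[8];

	format_exit_packet(0, pkt);	/* -> "$W00#b7" */
	puts(pkt);
	return 0;
}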
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 20059ef4459a..8418c2f8ec5d 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -153,6 +153,13 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)
153 } else { 153 } else {
154 kdb_printf("%s: failed to set breakpoint at 0x%lx\n", 154 kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
155 __func__, bp->bp_addr); 155 __func__, bp->bp_addr);
156#ifdef CONFIG_DEBUG_RODATA
157 if (!bp->bp_type) {
158 kdb_printf("Software breakpoints are unavailable.\n"
159 " Change the kernel CONFIG_DEBUG_RODATA=n\n"
160 " OR use hw breaks: help bph\n");
161 }
162#endif
156 return 1; 163 return 1;
157 } 164 }
158 return 0; 165 return 0;
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 7179eac7b41c..07c9bbb94a0b 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -15,7 +15,6 @@
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/kdb.h> 16#include <linux/kdb.h>
17#include <linux/nmi.h> 17#include <linux/nmi.h>
18#include <asm/system.h>
19#include "kdb_private.h" 18#include "kdb_private.h"
20 19
21 20
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 4802eb5840e1..9b5f17da1c56 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -689,7 +689,7 @@ kdb_printit:
689 if (!dbg_kdb_mode && kgdb_connected) { 689 if (!dbg_kdb_mode && kgdb_connected) {
690 gdbstub_msg_write(kdb_buffer, retlen); 690 gdbstub_msg_write(kdb_buffer, retlen);
691 } else { 691 } else {
692 if (!dbg_io_ops->is_console) { 692 if (dbg_io_ops && !dbg_io_ops->is_console) {
693 len = strlen(kdb_buffer); 693 len = strlen(kdb_buffer);
694 cp = kdb_buffer; 694 cp = kdb_buffer;
695 while (len--) { 695 while (len--) {
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
index 4bca634975c0..118527aa60ea 100644
--- a/kernel/debug/kdb/kdb_keyboard.c
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -25,6 +25,7 @@
25#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */ 25#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */
26 26
27static int kbd_exists; 27static int kbd_exists;
28static int kbd_last_ret;
28 29
29/* 30/*
30 * Check if the keyboard controller has a keypress for us. 31 * Check if the keyboard controller has a keypress for us.
@@ -90,8 +91,11 @@ int kdb_get_kbd_char(void)
90 return -1; 91 return -1;
91 } 92 }
92 93
93 if ((scancode & 0x80) != 0) 94 if ((scancode & 0x80) != 0) {
95 if (scancode == 0x9c)
96 kbd_last_ret = 0;
94 return -1; 97 return -1;
98 }
95 99
96 scancode &= 0x7f; 100 scancode &= 0x7f;
97 101
@@ -178,35 +182,82 @@ int kdb_get_kbd_char(void)
178 return -1; /* ignore unprintables */ 182 return -1; /* ignore unprintables */
179 } 183 }
180 184
181 if ((scancode & 0x7f) == 0x1c) { 185 if (scancode == 0x1c) {
182 /* 186 kbd_last_ret = 1;
183 * enter key. All done. Absorb the release scancode. 187 return 13;
184 */ 188 }
189
190 return keychar & 0xff;
191}
192EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
193
194/*
195 * Best effort cleanup of ENTER break codes on leaving KDB. Called on
196 * exiting KDB, when we know we processed an ENTER or KP ENTER scan
197 * code.
198 */
199void kdb_kbd_cleanup_state(void)
200{
201 int scancode, scanstatus;
202
203 /*
204 * Nothing to clean up, since either
205 * ENTER was never pressed, or has already
206 * gotten cleaned up.
207 */
208 if (!kbd_last_ret)
209 return;
210
211 kbd_last_ret = 0;
212 /*
213 * Enter key. Need to absorb the break code here, lest it gets
214 * leaked out if we exit KDB as the result of processing 'g'.
215 *
216 * This has several interesting implications:
217 * + Need to handle KP ENTER, which has break code 0xe0 0x9c.
218 * + Need to handle repeat ENTER and repeat KP ENTER. Repeats
219 * only get a break code at the end of the repeated
220 * sequence. This means we can't propagate the repeated key
221 * press, and must swallow it away.
222 * + Need to handle possible PS/2 mouse input.
223 * + Need to handle mashed keys.
224 */
225
226 while (1) {
185 while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0) 227 while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
186 ; 228 cpu_relax();
187 229
188 /* 230 /*
189 * Fetch the scancode 231 * Fetch the scancode.
190 */ 232 */
191 scancode = inb(KBD_DATA_REG); 233 scancode = inb(KBD_DATA_REG);
192 scanstatus = inb(KBD_STATUS_REG); 234 scanstatus = inb(KBD_STATUS_REG);
193 235
194 while (scanstatus & KBD_STAT_MOUSE_OBF) { 236 /*
195 scancode = inb(KBD_DATA_REG); 237 * Skip mouse input.
196 scanstatus = inb(KBD_STATUS_REG); 238 */
197 } 239 if (scanstatus & KBD_STAT_MOUSE_OBF)
240 continue;
198 241
199 if (scancode != 0x9c) { 242 /*
200 /* 243 * If we see 0xe0, this is either a break code for KP
201 * Wasn't an enter-release, why not? 244 * ENTER, or a repeat make for KP ENTER. Either way,
202 */ 245 * since the second byte is equivalent to an ENTER,
203 kdb_printf("kdb: expected enter got 0x%x status 0x%x\n", 246 * skip the 0xe0 and try again.
204 scancode, scanstatus); 247 *
205 } 248 * If we see 0x1c, this must be a repeat ENTER or KP
249 * ENTER (and we swallowed 0xe0 before). Try again.
250 *
251 * We can also see make and break codes for other keys
252 * mashed before or after pressing ENTER. Thus, if we
253 * see anything other than 0x9c, we have to try again.
254 *
255 * Note, if you held some key as ENTER was depressed,
256 * that break code would get leaked out.
257 */
258 if (scancode != 0x9c)
259 continue;
206 260
207 return 13; 261 return;
208 } 262 }
209
210 return keychar & 0xff;
211} 263}
212EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
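For reference, the cleanup above revolves around three PS/2 scan codes: ENTER make 0x1c, ENTER break 0x9c, and the 0xe0 prefix that KP ENTER adds. A minimal user-space sketch of the same decoding logic follows; the scancode stream is made up purely for illustration and nothing here reads real hardware.

#include <stdio.h>

/* Hypothetical scancode stream: ENTER make, then KP ENTER break (0xe0 0x9c). */
static const unsigned char stream[] = { 0x1c, 0xe0, 0x9c };

int main(void)
{
	unsigned int i;

	for (i = 0; i < sizeof(stream); i++) {
		unsigned char sc = stream[i];

		if (sc == 0xe0)
			continue;                /* KP ENTER prefix: treat next byte as ENTER */
		if (sc == 0x1c)
			printf("ENTER make\n");  /* key pressed */
		else if (sc == 0x9c)
			printf("ENTER break\n"); /* key released: nothing left to absorb */
	}
	return 0;
}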
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index e2ae7349437f..67b847dfa2bb 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1400,6 +1400,9 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
1400 if (KDB_STATE(DOING_SS)) 1400 if (KDB_STATE(DOING_SS))
1401 KDB_STATE_CLEAR(SSBPT); 1401 KDB_STATE_CLEAR(SSBPT);
1402 1402
1403 /* Clean up any keyboard devices before leaving */
1404 kdb_kbd_cleanup_state();
1405
1403 return result; 1406 return result;
1404} 1407}
1405 1408
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index e381d105b40b..47c4e56e513b 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -246,6 +246,13 @@ extern void debug_kusage(void);
246 246
247extern void kdb_set_current_task(struct task_struct *); 247extern void kdb_set_current_task(struct task_struct *);
248extern struct task_struct *kdb_current_task; 248extern struct task_struct *kdb_current_task;
249
250#ifdef CONFIG_KDB_KEYBOARD
251extern void kdb_kbd_cleanup_state(void);
252#else /* ! CONFIG_KDB_KEYBOARD */
253#define kdb_kbd_cleanup_state()
254#endif /* ! CONFIG_KDB_KEYBOARD */
255
249#ifdef CONFIG_MODULES 256#ifdef CONFIG_MODULES
250extern struct list_head *kdb_modules; 257extern struct list_head *kdb_modules;
251#endif /* CONFIG_MODULES */ 258#endif /* CONFIG_MODULES */
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 7d6fb40d2188..d35cc2d3a4cc 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -384,9 +384,9 @@ static int kdb_getphys(void *res, unsigned long addr, size_t size)
384 if (!pfn_valid(pfn)) 384 if (!pfn_valid(pfn))
385 return 1; 385 return 1;
386 page = pfn_to_page(pfn); 386 page = pfn_to_page(pfn);
387 vaddr = kmap_atomic(page, KM_KDB); 387 vaddr = kmap_atomic(page);
388 memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size); 388 memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size);
389 kunmap_atomic(vaddr, KM_KDB); 389 kunmap_atomic(vaddr);
390 390
391 return 0; 391 return 0;
392} 392}
diff --git a/kernel/dma.c b/kernel/dma.c
index 68a2306522c8..6c6262f86c17 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -18,7 +18,6 @@
18#include <linux/proc_fs.h> 18#include <linux/proc_fs.h>
19#include <linux/init.h> 19#include <linux/init.h>
20#include <asm/dma.h> 20#include <asm/dma.h>
21#include <asm/system.h>
22 21
23 22
24 23
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1b5c081d8b9f..4b50357914fb 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -118,6 +118,13 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
118 PERF_FLAG_FD_OUTPUT |\ 118 PERF_FLAG_FD_OUTPUT |\
119 PERF_FLAG_PID_CGROUP) 119 PERF_FLAG_PID_CGROUP)
120 120
121/*
122 * branch priv levels that need permission checks
123 */
124#define PERF_SAMPLE_BRANCH_PERM_PLM \
125 (PERF_SAMPLE_BRANCH_KERNEL |\
126 PERF_SAMPLE_BRANCH_HV)
127
121enum event_type_t { 128enum event_type_t {
122 EVENT_FLEXIBLE = 0x1, 129 EVENT_FLEXIBLE = 0x1,
123 EVENT_PINNED = 0x2, 130 EVENT_PINNED = 0x2,
@@ -128,8 +135,9 @@ enum event_type_t {
128 * perf_sched_events : >0 events exist 135 * perf_sched_events : >0 events exist
129 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu 136 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
130 */ 137 */
131struct jump_label_key_deferred perf_sched_events __read_mostly; 138struct static_key_deferred perf_sched_events __read_mostly;
132static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); 139static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
140static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
133 141
134static atomic_t nr_mmap_events __read_mostly; 142static atomic_t nr_mmap_events __read_mostly;
135static atomic_t nr_comm_events __read_mostly; 143static atomic_t nr_comm_events __read_mostly;
@@ -881,6 +889,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
881 if (is_cgroup_event(event)) 889 if (is_cgroup_event(event))
882 ctx->nr_cgroups++; 890 ctx->nr_cgroups++;
883 891
892 if (has_branch_stack(event))
893 ctx->nr_branch_stack++;
894
884 list_add_rcu(&event->event_entry, &ctx->event_list); 895 list_add_rcu(&event->event_entry, &ctx->event_list);
885 if (!ctx->nr_events) 896 if (!ctx->nr_events)
886 perf_pmu_rotate_start(ctx->pmu); 897 perf_pmu_rotate_start(ctx->pmu);
@@ -1020,6 +1031,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1020 cpuctx->cgrp = NULL; 1031 cpuctx->cgrp = NULL;
1021 } 1032 }
1022 1033
1034 if (has_branch_stack(event))
1035 ctx->nr_branch_stack--;
1036
1023 ctx->nr_events--; 1037 ctx->nr_events--;
1024 if (event->attr.inherit_stat) 1038 if (event->attr.inherit_stat)
1025 ctx->nr_stat--; 1039 ctx->nr_stat--;
@@ -2195,6 +2209,66 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2195} 2209}
2196 2210
2197/* 2211/*
 2212 * When sampling the branch stack in system-wide mode, it may be necessary
2213 * to flush the stack on context switch. This happens when the branch
2214 * stack does not tag its entries with the pid of the current task.
2215 * Otherwise it becomes impossible to associate a branch entry with a
2216 * task. This ambiguity is more likely to appear when the branch stack
2217 * supports priv level filtering and the user sets it to monitor only
2218 * at the user level (which could be a useful measurement in system-wide
2219 * mode). In that case, the risk is high of having a branch stack with
2220 * branch from multiple tasks. Flushing may mean dropping the existing
2221 * entries or stashing them somewhere in the PMU specific code layer.
2222 *
2223 * This function provides the context switch callback to the lower code
2224 * layer. It is invoked ONLY when there is at least one system-wide context
2225 * with at least one active event using taken branch sampling.
2226 */
2227static void perf_branch_stack_sched_in(struct task_struct *prev,
2228 struct task_struct *task)
2229{
2230 struct perf_cpu_context *cpuctx;
2231 struct pmu *pmu;
2232 unsigned long flags;
2233
2234 /* no need to flush branch stack if not changing task */
2235 if (prev == task)
2236 return;
2237
2238 local_irq_save(flags);
2239
2240 rcu_read_lock();
2241
2242 list_for_each_entry_rcu(pmu, &pmus, entry) {
2243 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2244
2245 /*
2246 * check if the context has at least one
2247 * event using PERF_SAMPLE_BRANCH_STACK
2248 */
2249 if (cpuctx->ctx.nr_branch_stack > 0
2250 && pmu->flush_branch_stack) {
2251
2252 pmu = cpuctx->ctx.pmu;
2253
2254 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2255
2256 perf_pmu_disable(pmu);
2257
2258 pmu->flush_branch_stack();
2259
2260 perf_pmu_enable(pmu);
2261
2262 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2263 }
2264 }
2265
2266 rcu_read_unlock();
2267
2268 local_irq_restore(flags);
2269}
2270
2271/*
2198 * Called from scheduler to add the events of the current task 2272 * Called from scheduler to add the events of the current task
2199 * with interrupts disabled. 2273 * with interrupts disabled.
2200 * 2274 *
@@ -2225,6 +2299,10 @@ void __perf_event_task_sched_in(struct task_struct *prev,
2225 */ 2299 */
2226 if (atomic_read(&__get_cpu_var(perf_cgroup_events))) 2300 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2227 perf_cgroup_sched_in(prev, task); 2301 perf_cgroup_sched_in(prev, task);
2302
2303 /* check for system-wide branch_stack events */
2304 if (atomic_read(&__get_cpu_var(perf_branch_stack_events)))
2305 perf_branch_stack_sched_in(prev, task);
2228} 2306}
2229 2307
2230static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 2308static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
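For context, a PMU that cannot tag branch entries by task opts into the flush above by providing the new callback. The fragment below is a hypothetical driver-side sketch, not code from this patch; only the two fields shown are meaningful and the remaining pmu callbacks are omitted.

#include <linux/perf_event.h>

static void example_flush_branch_stack(void)
{
	/* Drain or discard the hardware branch buffer (e.g. LBR) here, so
	 * entries recorded for the previous task cannot leak into samples
	 * attributed to the next one. */
}

static struct pmu example_pmu = {
	.task_ctx_nr		= perf_hw_context,
	.flush_branch_stack	= example_flush_branch_stack,
	/* .event_init, .add, .del, .start, .stop, .read omitted */
};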
@@ -2778,7 +2856,7 @@ static void free_event(struct perf_event *event)
2778 2856
2779 if (!event->parent) { 2857 if (!event->parent) {
2780 if (event->attach_state & PERF_ATTACH_TASK) 2858 if (event->attach_state & PERF_ATTACH_TASK)
2781 jump_label_dec_deferred(&perf_sched_events); 2859 static_key_slow_dec_deferred(&perf_sched_events);
2782 if (event->attr.mmap || event->attr.mmap_data) 2860 if (event->attr.mmap || event->attr.mmap_data)
2783 atomic_dec(&nr_mmap_events); 2861 atomic_dec(&nr_mmap_events);
2784 if (event->attr.comm) 2862 if (event->attr.comm)
@@ -2789,7 +2867,15 @@ static void free_event(struct perf_event *event)
2789 put_callchain_buffers(); 2867 put_callchain_buffers();
2790 if (is_cgroup_event(event)) { 2868 if (is_cgroup_event(event)) {
2791 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); 2869 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
2792 jump_label_dec_deferred(&perf_sched_events); 2870 static_key_slow_dec_deferred(&perf_sched_events);
2871 }
2872
2873 if (has_branch_stack(event)) {
2874 static_key_slow_dec_deferred(&perf_sched_events);
2875 /* is system-wide event */
2876 if (!(event->attach_state & PERF_ATTACH_TASK))
2877 atomic_dec(&per_cpu(perf_branch_stack_events,
2878 event->cpu));
2793 } 2879 }
2794 } 2880 }
2795 2881
@@ -3238,10 +3324,6 @@ int perf_event_task_disable(void)
3238 return 0; 3324 return 0;
3239} 3325}
3240 3326
3241#ifndef PERF_EVENT_INDEX_OFFSET
3242# define PERF_EVENT_INDEX_OFFSET 0
3243#endif
3244
3245static int perf_event_index(struct perf_event *event) 3327static int perf_event_index(struct perf_event *event)
3246{ 3328{
3247 if (event->hw.state & PERF_HES_STOPPED) 3329 if (event->hw.state & PERF_HES_STOPPED)
@@ -3250,21 +3332,26 @@ static int perf_event_index(struct perf_event *event)
3250 if (event->state != PERF_EVENT_STATE_ACTIVE) 3332 if (event->state != PERF_EVENT_STATE_ACTIVE)
3251 return 0; 3333 return 0;
3252 3334
3253 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; 3335 return event->pmu->event_idx(event);
3254} 3336}
3255 3337
3256static void calc_timer_values(struct perf_event *event, 3338static void calc_timer_values(struct perf_event *event,
3339 u64 *now,
3257 u64 *enabled, 3340 u64 *enabled,
3258 u64 *running) 3341 u64 *running)
3259{ 3342{
3260 u64 now, ctx_time; 3343 u64 ctx_time;
3261 3344
3262 now = perf_clock(); 3345 *now = perf_clock();
3263 ctx_time = event->shadow_ctx_time + now; 3346 ctx_time = event->shadow_ctx_time + *now;
3264 *enabled = ctx_time - event->tstamp_enabled; 3347 *enabled = ctx_time - event->tstamp_enabled;
3265 *running = ctx_time - event->tstamp_running; 3348 *running = ctx_time - event->tstamp_running;
3266} 3349}
3267 3350
3351void __weak perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now)
3352{
3353}
3354
3268/* 3355/*
3269 * Callers need to ensure there can be no nesting of this function, otherwise 3356 * Callers need to ensure there can be no nesting of this function, otherwise
3270 * the seqlock logic goes bad. We can not serialize this because the arch 3357 * the seqlock logic goes bad. We can not serialize this because the arch
@@ -3274,7 +3361,7 @@ void perf_event_update_userpage(struct perf_event *event)
3274{ 3361{
3275 struct perf_event_mmap_page *userpg; 3362 struct perf_event_mmap_page *userpg;
3276 struct ring_buffer *rb; 3363 struct ring_buffer *rb;
3277 u64 enabled, running; 3364 u64 enabled, running, now;
3278 3365
3279 rcu_read_lock(); 3366 rcu_read_lock();
3280 /* 3367 /*
@@ -3286,7 +3373,7 @@ void perf_event_update_userpage(struct perf_event *event)
3286 * because of locking issue as we can be called in 3373 * because of locking issue as we can be called in
3287 * NMI context 3374 * NMI context
3288 */ 3375 */
3289 calc_timer_values(event, &enabled, &running); 3376 calc_timer_values(event, &now, &enabled, &running);
3290 rb = rcu_dereference(event->rb); 3377 rb = rcu_dereference(event->rb);
3291 if (!rb) 3378 if (!rb)
3292 goto unlock; 3379 goto unlock;
@@ -3302,7 +3389,7 @@ void perf_event_update_userpage(struct perf_event *event)
3302 barrier(); 3389 barrier();
3303 userpg->index = perf_event_index(event); 3390 userpg->index = perf_event_index(event);
3304 userpg->offset = perf_event_count(event); 3391 userpg->offset = perf_event_count(event);
3305 if (event->state == PERF_EVENT_STATE_ACTIVE) 3392 if (userpg->index)
3306 userpg->offset -= local64_read(&event->hw.prev_count); 3393 userpg->offset -= local64_read(&event->hw.prev_count);
3307 3394
3308 userpg->time_enabled = enabled + 3395 userpg->time_enabled = enabled +
@@ -3311,6 +3398,8 @@ void perf_event_update_userpage(struct perf_event *event)
3311 userpg->time_running = running + 3398 userpg->time_running = running +
3312 atomic64_read(&event->child_total_time_running); 3399 atomic64_read(&event->child_total_time_running);
3313 3400
3401 perf_update_user_clock(userpg, now);
3402
3314 barrier(); 3403 barrier();
3315 ++userpg->lock; 3404 ++userpg->lock;
3316 preempt_enable(); 3405 preempt_enable();
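The ++userpg->lock/barrier() pairs above implement a seqcount for user space. A hedged sketch of the reader side is shown below; the field names come from struct perf_event_mmap_page, and the GCC builtin barrier is a simplification of what a real consumer would use.

#include <linux/perf_event.h>

static long long read_event_offset(volatile struct perf_event_mmap_page *pg)
{
	unsigned int seq;
	long long offset;

	do {
		seq = pg->lock;
		__sync_synchronize();	/* pairs with the barrier()s above */
		offset = pg->offset;	/* consistent with pg->index and pg->time_* */
		__sync_synchronize();
	} while (pg->lock != seq);

	return offset;	/* if pg->index != 0, add the raw hardware counter value */
}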
@@ -3568,6 +3657,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3568 event->mmap_user = get_current_user(); 3657 event->mmap_user = get_current_user();
3569 vma->vm_mm->pinned_vm += event->mmap_locked; 3658 vma->vm_mm->pinned_vm += event->mmap_locked;
3570 3659
3660 perf_event_update_userpage(event);
3661
3571unlock: 3662unlock:
3572 if (!ret) 3663 if (!ret)
3573 atomic_inc(&event->mmap_count); 3664 atomic_inc(&event->mmap_count);
@@ -3799,7 +3890,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3799static void perf_output_read(struct perf_output_handle *handle, 3890static void perf_output_read(struct perf_output_handle *handle,
3800 struct perf_event *event) 3891 struct perf_event *event)
3801{ 3892{
3802 u64 enabled = 0, running = 0; 3893 u64 enabled = 0, running = 0, now;
3803 u64 read_format = event->attr.read_format; 3894 u64 read_format = event->attr.read_format;
3804 3895
3805 /* 3896 /*
@@ -3812,7 +3903,7 @@ static void perf_output_read(struct perf_output_handle *handle,
3812 * NMI context 3903 * NMI context
3813 */ 3904 */
3814 if (read_format & PERF_FORMAT_TOTAL_TIMES) 3905 if (read_format & PERF_FORMAT_TOTAL_TIMES)
3815 calc_timer_values(event, &enabled, &running); 3906 calc_timer_values(event, &now, &enabled, &running);
3816 3907
3817 if (event->attr.read_format & PERF_FORMAT_GROUP) 3908 if (event->attr.read_format & PERF_FORMAT_GROUP)
3818 perf_output_read_group(handle, event, enabled, running); 3909 perf_output_read_group(handle, event, enabled, running);
@@ -3902,6 +3993,24 @@ void perf_output_sample(struct perf_output_handle *handle,
3902 } 3993 }
3903 } 3994 }
3904 } 3995 }
3996
3997 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
3998 if (data->br_stack) {
3999 size_t size;
4000
4001 size = data->br_stack->nr
4002 * sizeof(struct perf_branch_entry);
4003
4004 perf_output_put(handle, data->br_stack->nr);
4005 perf_output_copy(handle, data->br_stack->entries, size);
4006 } else {
4007 /*
4008 * we always store at least the value of nr
4009 */
4010 u64 nr = 0;
4011 perf_output_put(handle, nr);
4012 }
4013 }
3905} 4014}
3906 4015
3907void perf_prepare_sample(struct perf_event_header *header, 4016void perf_prepare_sample(struct perf_event_header *header,
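The branch data emitted above is laid out as a u64 count followed by that many fixed-size records. A hedged consumer-side view of that layout, using struct perf_branch_entry as defined by this series:

#include <linux/perf_event.h>

/* What the PERF_SAMPLE_BRANCH_STACK portion of a sample looks like to a reader. */
struct branch_stack_sample {
	__u64 nr;				/* always written, may be 0 */
	struct perf_branch_entry entries[];	/* nr records: {from, to, flags} */
};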
@@ -3944,6 +4053,15 @@ void perf_prepare_sample(struct perf_event_header *header,
3944 WARN_ON_ONCE(size & (sizeof(u64)-1)); 4053 WARN_ON_ONCE(size & (sizeof(u64)-1));
3945 header->size += size; 4054 header->size += size;
3946 } 4055 }
4056
4057 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
4058 int size = sizeof(u64); /* nr */
4059 if (data->br_stack) {
4060 size += data->br_stack->nr
4061 * sizeof(struct perf_branch_entry);
4062 }
4063 header->size += size;
4064 }
3947} 4065}
3948 4066
3949static void perf_event_output(struct perf_event *event, 4067static void perf_event_output(struct perf_event *event,
@@ -4986,7 +5104,7 @@ fail:
4986 return err; 5104 return err;
4987} 5105}
4988 5106
4989struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; 5107struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
4990 5108
4991static void sw_perf_event_destroy(struct perf_event *event) 5109static void sw_perf_event_destroy(struct perf_event *event)
4992{ 5110{
@@ -4994,7 +5112,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
4994 5112
4995 WARN_ON(event->parent); 5113 WARN_ON(event->parent);
4996 5114
4997 jump_label_dec(&perf_swevent_enabled[event_id]); 5115 static_key_slow_dec(&perf_swevent_enabled[event_id]);
4998 swevent_hlist_put(event); 5116 swevent_hlist_put(event);
4999} 5117}
5000 5118
@@ -5005,6 +5123,12 @@ static int perf_swevent_init(struct perf_event *event)
5005 if (event->attr.type != PERF_TYPE_SOFTWARE) 5123 if (event->attr.type != PERF_TYPE_SOFTWARE)
5006 return -ENOENT; 5124 return -ENOENT;
5007 5125
5126 /*
5127 * no branch sampling for software events
5128 */
5129 if (has_branch_stack(event))
5130 return -EOPNOTSUPP;
5131
5008 switch (event_id) { 5132 switch (event_id) {
5009 case PERF_COUNT_SW_CPU_CLOCK: 5133 case PERF_COUNT_SW_CPU_CLOCK:
5010 case PERF_COUNT_SW_TASK_CLOCK: 5134 case PERF_COUNT_SW_TASK_CLOCK:
@@ -5024,13 +5148,18 @@ static int perf_swevent_init(struct perf_event *event)
5024 if (err) 5148 if (err)
5025 return err; 5149 return err;
5026 5150
5027 jump_label_inc(&perf_swevent_enabled[event_id]); 5151 static_key_slow_inc(&perf_swevent_enabled[event_id]);
5028 event->destroy = sw_perf_event_destroy; 5152 event->destroy = sw_perf_event_destroy;
5029 } 5153 }
5030 5154
5031 return 0; 5155 return 0;
5032} 5156}
5033 5157
5158static int perf_swevent_event_idx(struct perf_event *event)
5159{
5160 return 0;
5161}
5162
5034static struct pmu perf_swevent = { 5163static struct pmu perf_swevent = {
5035 .task_ctx_nr = perf_sw_context, 5164 .task_ctx_nr = perf_sw_context,
5036 5165
@@ -5040,6 +5169,8 @@ static struct pmu perf_swevent = {
5040 .start = perf_swevent_start, 5169 .start = perf_swevent_start,
5041 .stop = perf_swevent_stop, 5170 .stop = perf_swevent_stop,
5042 .read = perf_swevent_read, 5171 .read = perf_swevent_read,
5172
5173 .event_idx = perf_swevent_event_idx,
5043}; 5174};
5044 5175
5045#ifdef CONFIG_EVENT_TRACING 5176#ifdef CONFIG_EVENT_TRACING
@@ -5108,6 +5239,12 @@ static int perf_tp_event_init(struct perf_event *event)
5108 if (event->attr.type != PERF_TYPE_TRACEPOINT) 5239 if (event->attr.type != PERF_TYPE_TRACEPOINT)
5109 return -ENOENT; 5240 return -ENOENT;
5110 5241
5242 /*
5243 * no branch sampling for tracepoint events
5244 */
5245 if (has_branch_stack(event))
5246 return -EOPNOTSUPP;
5247
5111 err = perf_trace_init(event); 5248 err = perf_trace_init(event);
5112 if (err) 5249 if (err)
5113 return err; 5250 return err;
@@ -5126,6 +5263,8 @@ static struct pmu perf_tracepoint = {
5126 .start = perf_swevent_start, 5263 .start = perf_swevent_start,
5127 .stop = perf_swevent_stop, 5264 .stop = perf_swevent_stop,
5128 .read = perf_swevent_read, 5265 .read = perf_swevent_read,
5266
5267 .event_idx = perf_swevent_event_idx,
5129}; 5268};
5130 5269
5131static inline void perf_tp_register(void) 5270static inline void perf_tp_register(void)
@@ -5331,6 +5470,12 @@ static int cpu_clock_event_init(struct perf_event *event)
5331 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) 5470 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
5332 return -ENOENT; 5471 return -ENOENT;
5333 5472
5473 /*
5474 * no branch sampling for software events
5475 */
5476 if (has_branch_stack(event))
5477 return -EOPNOTSUPP;
5478
5334 perf_swevent_init_hrtimer(event); 5479 perf_swevent_init_hrtimer(event);
5335 5480
5336 return 0; 5481 return 0;
@@ -5345,6 +5490,8 @@ static struct pmu perf_cpu_clock = {
5345 .start = cpu_clock_event_start, 5490 .start = cpu_clock_event_start,
5346 .stop = cpu_clock_event_stop, 5491 .stop = cpu_clock_event_stop,
5347 .read = cpu_clock_event_read, 5492 .read = cpu_clock_event_read,
5493
5494 .event_idx = perf_swevent_event_idx,
5348}; 5495};
5349 5496
5350/* 5497/*
@@ -5403,6 +5550,12 @@ static int task_clock_event_init(struct perf_event *event)
5403 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) 5550 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5404 return -ENOENT; 5551 return -ENOENT;
5405 5552
5553 /*
5554 * no branch sampling for software events
5555 */
5556 if (has_branch_stack(event))
5557 return -EOPNOTSUPP;
5558
5406 perf_swevent_init_hrtimer(event); 5559 perf_swevent_init_hrtimer(event);
5407 5560
5408 return 0; 5561 return 0;
@@ -5417,6 +5570,8 @@ static struct pmu perf_task_clock = {
5417 .start = task_clock_event_start, 5570 .start = task_clock_event_start,
5418 .stop = task_clock_event_stop, 5571 .stop = task_clock_event_stop,
5419 .read = task_clock_event_read, 5572 .read = task_clock_event_read,
5573
5574 .event_idx = perf_swevent_event_idx,
5420}; 5575};
5421 5576
5422static void perf_pmu_nop_void(struct pmu *pmu) 5577static void perf_pmu_nop_void(struct pmu *pmu)
@@ -5444,6 +5599,11 @@ static void perf_pmu_cancel_txn(struct pmu *pmu)
5444 perf_pmu_enable(pmu); 5599 perf_pmu_enable(pmu);
5445} 5600}
5446 5601
5602static int perf_event_idx_default(struct perf_event *event)
5603{
5604 return event->hw.idx + 1;
5605}
5606
5447/* 5607/*
5448 * Ensures all contexts with the same task_ctx_nr have the same 5608 * Ensures all contexts with the same task_ctx_nr have the same
5449 * pmu_cpu_context too. 5609 * pmu_cpu_context too.
@@ -5530,6 +5690,7 @@ static int pmu_dev_alloc(struct pmu *pmu)
5530 if (!pmu->dev) 5690 if (!pmu->dev)
5531 goto out; 5691 goto out;
5532 5692
5693 pmu->dev->groups = pmu->attr_groups;
5533 device_initialize(pmu->dev); 5694 device_initialize(pmu->dev);
5534 ret = dev_set_name(pmu->dev, "%s", pmu->name); 5695 ret = dev_set_name(pmu->dev, "%s", pmu->name);
5535 if (ret) 5696 if (ret)
@@ -5633,6 +5794,9 @@ got_cpu_context:
5633 pmu->pmu_disable = perf_pmu_nop_void; 5794 pmu->pmu_disable = perf_pmu_nop_void;
5634 } 5795 }
5635 5796
5797 if (!pmu->event_idx)
5798 pmu->event_idx = perf_event_idx_default;
5799
5636 list_add_rcu(&pmu->entry, &pmus); 5800 list_add_rcu(&pmu->entry, &pmus);
5637 ret = 0; 5801 ret = 0;
5638unlock: 5802unlock:
@@ -5825,7 +5989,7 @@ done:
5825 5989
5826 if (!event->parent) { 5990 if (!event->parent) {
5827 if (event->attach_state & PERF_ATTACH_TASK) 5991 if (event->attach_state & PERF_ATTACH_TASK)
5828 jump_label_inc(&perf_sched_events.key); 5992 static_key_slow_inc(&perf_sched_events.key);
5829 if (event->attr.mmap || event->attr.mmap_data) 5993 if (event->attr.mmap || event->attr.mmap_data)
5830 atomic_inc(&nr_mmap_events); 5994 atomic_inc(&nr_mmap_events);
5831 if (event->attr.comm) 5995 if (event->attr.comm)
@@ -5839,6 +6003,12 @@ done:
5839 return ERR_PTR(err); 6003 return ERR_PTR(err);
5840 } 6004 }
5841 } 6005 }
6006 if (has_branch_stack(event)) {
6007 static_key_slow_inc(&perf_sched_events.key);
6008 if (!(event->attach_state & PERF_ATTACH_TASK))
6009 atomic_inc(&per_cpu(perf_branch_stack_events,
6010 event->cpu));
6011 }
5842 } 6012 }
5843 6013
5844 return event; 6014 return event;
@@ -5908,6 +6078,40 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
5908 if (attr->read_format & ~(PERF_FORMAT_MAX-1)) 6078 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
5909 return -EINVAL; 6079 return -EINVAL;
5910 6080
6081 if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
6082 u64 mask = attr->branch_sample_type;
6083
6084 /* only using defined bits */
6085 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
6086 return -EINVAL;
6087
6088 /* at least one branch bit must be set */
6089 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
6090 return -EINVAL;
6091
6092 /* kernel level capture: check permissions */
6093 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
6094 && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
6095 return -EACCES;
6096
6097 /* propagate priv level, when not set for branch */
6098 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
6099
6100 /* exclude_kernel checked on syscall entry */
6101 if (!attr->exclude_kernel)
6102 mask |= PERF_SAMPLE_BRANCH_KERNEL;
6103
6104 if (!attr->exclude_user)
6105 mask |= PERF_SAMPLE_BRANCH_USER;
6106
6107 if (!attr->exclude_hv)
6108 mask |= PERF_SAMPLE_BRANCH_HV;
6109 /*
6110 * adjust user setting (for HW filter setup)
6111 */
6112 attr->branch_sample_type = mask;
6113 }
6114 }
5911out: 6115out:
5912 return ret; 6116 return ret;
5913 6117
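From user space, the validation above applies to perf_event_open() callers that request taken-branch sampling. A hedged sketch follows; the event choice and sample period are arbitrary, and the PERF_SAMPLE_BRANCH_* constants are the ones introduced by this series.

#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int open_branch_sampling_event(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK;
	/* No priv-level bits set: the code above copies them from
	 * exclude_user/exclude_kernel/exclude_hv. */
	attr.branch_sample_type = PERF_SAMPLE_BRANCH_ANY;

	return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
}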
@@ -6063,7 +6267,7 @@ SYSCALL_DEFINE5(perf_event_open,
6063 * - that may need work on context switch 6267 * - that may need work on context switch
6064 */ 6268 */
6065 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); 6269 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6066 jump_label_inc(&perf_sched_events.key); 6270 static_key_slow_inc(&perf_sched_events.key);
6067 } 6271 }
6068 6272
6069 /* 6273 /*
@@ -6943,8 +7147,7 @@ unlock:
6943device_initcall(perf_event_sysfs_init); 7147device_initcall(perf_event_sysfs_init);
6944 7148
6945#ifdef CONFIG_CGROUP_PERF 7149#ifdef CONFIG_CGROUP_PERF
6946static struct cgroup_subsys_state *perf_cgroup_create( 7150static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont)
6947 struct cgroup_subsys *ss, struct cgroup *cont)
6948{ 7151{
6949 struct perf_cgroup *jc; 7152 struct perf_cgroup *jc;
6950 7153
@@ -6961,8 +7164,7 @@ static struct cgroup_subsys_state *perf_cgroup_create(
6961 return &jc->css; 7164 return &jc->css;
6962} 7165}
6963 7166
6964static void perf_cgroup_destroy(struct cgroup_subsys *ss, 7167static void perf_cgroup_destroy(struct cgroup *cont)
6965 struct cgroup *cont)
6966{ 7168{
6967 struct perf_cgroup *jc; 7169 struct perf_cgroup *jc;
6968 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), 7170 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
@@ -6978,8 +7180,7 @@ static int __perf_cgroup_move(void *info)
6978 return 0; 7180 return 0;
6979} 7181}
6980 7182
6981static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 7183static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
6982 struct cgroup_taskset *tset)
6983{ 7184{
6984 struct task_struct *task; 7185 struct task_struct *task;
6985 7186
@@ -6987,8 +7188,8 @@ static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
6987 task_function_call(task, __perf_cgroup_move, task); 7188 task_function_call(task, __perf_cgroup_move, task);
6988} 7189}
6989 7190
6990static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, 7191static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
6991 struct cgroup *old_cgrp, struct task_struct *task) 7192 struct task_struct *task)
6992{ 7193{
6993 /* 7194 /*
6994 * cgroup_exit() is called in the copy_process() failure path. 7195 * cgroup_exit() is called in the copy_process() failure path.
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index ee706ce44aa0..bb38c4d3ee12 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -581,6 +581,12 @@ static int hw_breakpoint_event_init(struct perf_event *bp)
581 if (bp->attr.type != PERF_TYPE_BREAKPOINT) 581 if (bp->attr.type != PERF_TYPE_BREAKPOINT)
582 return -ENOENT; 582 return -ENOENT;
583 583
584 /*
585 * no branch sampling for breakpoint events
586 */
587 if (has_branch_stack(bp))
588 return -EOPNOTSUPP;
589
584 err = register_perf_hw_breakpoint(bp); 590 err = register_perf_hw_breakpoint(bp);
585 if (err) 591 if (err)
586 return err; 592 return err;
@@ -613,6 +619,11 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags)
613 bp->hw.state = PERF_HES_STOPPED; 619 bp->hw.state = PERF_HES_STOPPED;
614} 620}
615 621
622static int hw_breakpoint_event_idx(struct perf_event *bp)
623{
624 return 0;
625}
626
616static struct pmu perf_breakpoint = { 627static struct pmu perf_breakpoint = {
617 .task_ctx_nr = perf_sw_context, /* could eventually get its own */ 628 .task_ctx_nr = perf_sw_context, /* could eventually get its own */
618 629
@@ -622,6 +633,8 @@ static struct pmu perf_breakpoint = {
622 .start = hw_breakpoint_start, 633 .start = hw_breakpoint_start,
623 .stop = hw_breakpoint_stop, 634 .stop = hw_breakpoint_stop,
624 .read = hw_breakpoint_pmu_read, 635 .read = hw_breakpoint_pmu_read,
636
637 .event_idx = hw_breakpoint_event_idx,
625}; 638};
626 639
627int __init init_hw_breakpoint(void) 640int __init init_hw_breakpoint(void)
diff --git a/kernel/exit.c b/kernel/exit.c
index 4b4042f9bc6a..3db1909faed9 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -52,6 +52,7 @@
52#include <linux/hw_breakpoint.h> 52#include <linux/hw_breakpoint.h>
53#include <linux/oom.h> 53#include <linux/oom.h>
54#include <linux/writeback.h> 54#include <linux/writeback.h>
55#include <linux/shm.h>
55 56
56#include <asm/uaccess.h> 57#include <asm/uaccess.h>
57#include <asm/unistd.h> 58#include <asm/unistd.h>
@@ -424,7 +425,7 @@ void daemonize(const char *name, ...)
424 */ 425 */
425 exit_mm(current); 426 exit_mm(current);
426 /* 427 /*
427 * We don't want to have TIF_FREEZE set if the system-wide hibernation 428 * We don't want to get frozen, in case system-wide hibernation
428 * or suspend transition begins right now. 429 * or suspend transition begins right now.
429 */ 430 */
430 current->flags |= (PF_NOFREEZE | PF_KTHREAD); 431 current->flags |= (PF_NOFREEZE | PF_KTHREAD);
@@ -686,11 +687,11 @@ static void exit_mm(struct task_struct * tsk)
686} 687}
687 688
688/* 689/*
689 * When we die, we re-parent all our children. 690 * When we die, we re-parent all our children, and try to:
690 * Try to give them to another thread in our thread 691 * 1. give them to another thread in our thread group, if such a member exists
691 * group, and if no such member exists, give it to 692 * 2. give it to the first ancestor process which prctl'd itself as a
692 * the child reaper process (ie "init") in our pid 693 * child_subreaper for its children (like a service manager)
693 * space. 694 * 3. give it to the init process (PID 1) in our pid namespace
694 */ 695 */
695static struct task_struct *find_new_reaper(struct task_struct *father) 696static struct task_struct *find_new_reaper(struct task_struct *father)
696 __releases(&tasklist_lock) 697 __releases(&tasklist_lock)
@@ -710,8 +711,11 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
710 711
711 if (unlikely(pid_ns->child_reaper == father)) { 712 if (unlikely(pid_ns->child_reaper == father)) {
712 write_unlock_irq(&tasklist_lock); 713 write_unlock_irq(&tasklist_lock);
713 if (unlikely(pid_ns == &init_pid_ns)) 714 if (unlikely(pid_ns == &init_pid_ns)) {
714 panic("Attempted to kill init!"); 715 panic("Attempted to kill init! exitcode=0x%08x\n",
716 father->signal->group_exit_code ?:
717 father->exit_code);
718 }
715 719
716 zap_pid_ns_processes(pid_ns); 720 zap_pid_ns_processes(pid_ns);
717 write_lock_irq(&tasklist_lock); 721 write_lock_irq(&tasklist_lock);
@@ -721,6 +725,29 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
721 * forget_original_parent() must move them somewhere. 725 * forget_original_parent() must move them somewhere.
722 */ 726 */
723 pid_ns->child_reaper = init_pid_ns.child_reaper; 727 pid_ns->child_reaper = init_pid_ns.child_reaper;
728 } else if (father->signal->has_child_subreaper) {
729 struct task_struct *reaper;
730
731 /*
732 * Find the first ancestor marked as child_subreaper.
733 * Note that the code below checks same_thread_group(reaper,
734 * pid_ns->child_reaper). This is what we need to DTRT in a
735 * PID namespace. However we still need the check above, see
736 * http://marc.info/?l=linux-kernel&m=131385460420380
737 */
738 for (reaper = father->real_parent;
739 reaper != &init_task;
740 reaper = reaper->real_parent) {
741 if (same_thread_group(reaper, pid_ns->child_reaper))
742 break;
743 if (!reaper->signal->is_child_subreaper)
744 continue;
745 thread = reaper;
746 do {
747 if (!(thread->flags & PF_EXITING))
748 return reaper;
749 } while_each_thread(reaper, thread);
750 }
724 } 751 }
725 752
726 return pid_ns->child_reaper; 753 return pid_ns->child_reaper;
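Case 2 above is driven from user space via the new prctl. A hedged sketch of a service manager marking itself as a sub-reaper; the fallback #define covers C libraries that predate this series.

#include <sys/prctl.h>

#ifndef PR_SET_CHILD_SUBREAPER
#define PR_SET_CHILD_SUBREAPER 36	/* value introduced by this series */
#endif

static int become_subreaper(void)
{
	/* Orphaned descendants are now re-parented to this process
	 * (case 2 above) instead of to init, so it can wait() on them. */
	return prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0);
}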
@@ -818,25 +845,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
818 if (group_dead) 845 if (group_dead)
819 kill_orphaned_pgrp(tsk->group_leader, NULL); 846 kill_orphaned_pgrp(tsk->group_leader, NULL);
820 847
821 /* Let father know we died
822 *
823 * Thread signals are configurable, but you aren't going to use
824 * that to send signals to arbitrary processes.
825 * That stops right now.
826 *
827 * If the parent exec id doesn't match the exec id we saved
828 * when we started then we know the parent has changed security
829 * domain.
830 *
831 * If our self_exec id doesn't match our parent_exec_id then
832 * we have changed execution domain as these two values started
833 * the same after a fork.
834 */
835 if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD &&
836 (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
837 tsk->self_exec_id != tsk->parent_exec_id))
838 tsk->exit_signal = SIGCHLD;
839
840 if (unlikely(tsk->ptrace)) { 848 if (unlikely(tsk->ptrace)) {
841 int sig = thread_group_leader(tsk) && 849 int sig = thread_group_leader(tsk) &&
842 thread_group_empty(tsk) && 850 thread_group_empty(tsk) &&
@@ -935,8 +943,6 @@ void do_exit(long code)
935 schedule(); 943 schedule();
936 } 944 }
937 945
938 exit_irq_thread();
939
940 exit_signals(tsk); /* sets PF_EXITING */ 946 exit_signals(tsk); /* sets PF_EXITING */
941 /* 947 /*
942 * tsk->flags are checked in the futex code to protect against 948 * tsk->flags are checked in the futex code to protect against
@@ -945,6 +951,8 @@ void do_exit(long code)
945 smp_mb(); 951 smp_mb();
946 raw_spin_unlock_wait(&tsk->pi_lock); 952 raw_spin_unlock_wait(&tsk->pi_lock);
947 953
954 exit_irq_thread();
955
948 if (unlikely(in_atomic())) 956 if (unlikely(in_atomic()))
949 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", 957 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
950 current->comm, task_pid_nr(current), 958 current->comm, task_pid_nr(current),
@@ -953,7 +961,7 @@ void do_exit(long code)
953 acct_update_integrals(tsk); 961 acct_update_integrals(tsk);
954 /* sync mm's RSS info before statistics gathering */ 962 /* sync mm's RSS info before statistics gathering */
955 if (tsk->mm) 963 if (tsk->mm)
956 sync_mm_rss(tsk, tsk->mm); 964 sync_mm_rss(tsk->mm);
957 group_dead = atomic_dec_and_test(&tsk->signal->live); 965 group_dead = atomic_dec_and_test(&tsk->signal->live);
958 if (group_dead) { 966 if (group_dead) {
959 hrtimer_cancel(&tsk->signal->real_timer); 967 hrtimer_cancel(&tsk->signal->real_timer);
diff --git a/kernel/fork.c b/kernel/fork.c
index 26a7a6707fa7..b9372a0bff18 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -193,6 +193,7 @@ void __put_task_struct(struct task_struct *tsk)
193 WARN_ON(atomic_read(&tsk->usage)); 193 WARN_ON(atomic_read(&tsk->usage));
194 WARN_ON(tsk == current); 194 WARN_ON(tsk == current);
195 195
196 security_task_free(tsk);
196 exit_creds(tsk); 197 exit_creds(tsk);
197 delayacct_tsk_free(tsk); 198 delayacct_tsk_free(tsk);
198 put_signal_struct(tsk->signal); 199 put_signal_struct(tsk->signal);
@@ -355,7 +356,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
355 charge = 0; 356 charge = 0;
356 if (mpnt->vm_flags & VM_ACCOUNT) { 357 if (mpnt->vm_flags & VM_ACCOUNT) {
357 unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; 358 unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
358 if (security_vm_enough_memory(len)) 359 if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
359 goto fail_nomem; 360 goto fail_nomem;
360 charge = len; 361 charge = len;
361 } 362 }
@@ -511,6 +512,23 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
511 return NULL; 512 return NULL;
512} 513}
513 514
515static void check_mm(struct mm_struct *mm)
516{
517 int i;
518
519 for (i = 0; i < NR_MM_COUNTERS; i++) {
520 long x = atomic_long_read(&mm->rss_stat.count[i]);
521
522 if (unlikely(x))
523 printk(KERN_ALERT "BUG: Bad rss-counter state "
524 "mm:%p idx:%d val:%ld\n", mm, i, x);
525 }
526
527#ifdef CONFIG_TRANSPARENT_HUGEPAGE
528 VM_BUG_ON(mm->pmd_huge_pte);
529#endif
530}
531
514/* 532/*
515 * Allocate and initialize an mm_struct. 533 * Allocate and initialize an mm_struct.
516 */ 534 */
@@ -538,9 +556,7 @@ void __mmdrop(struct mm_struct *mm)
538 mm_free_pgd(mm); 556 mm_free_pgd(mm);
539 destroy_context(mm); 557 destroy_context(mm);
540 mmu_notifier_mm_destroy(mm); 558 mmu_notifier_mm_destroy(mm);
541#ifdef CONFIG_TRANSPARENT_HUGEPAGE 559 check_mm(mm);
542 VM_BUG_ON(mm->pmd_huge_pte);
543#endif
544 free_mm(mm); 560 free_mm(mm);
545} 561}
546EXPORT_SYMBOL_GPL(__mmdrop); 562EXPORT_SYMBOL_GPL(__mmdrop);
@@ -1035,6 +1051,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1035 sig->oom_score_adj = current->signal->oom_score_adj; 1051 sig->oom_score_adj = current->signal->oom_score_adj;
1036 sig->oom_score_adj_min = current->signal->oom_score_adj_min; 1052 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
1037 1053
1054 sig->has_child_subreaper = current->signal->has_child_subreaper ||
1055 current->signal->is_child_subreaper;
1056
1038 mutex_init(&sig->cred_guard_mutex); 1057 mutex_init(&sig->cred_guard_mutex);
1039 1058
1040 return 0; 1059 return 0;
@@ -1222,6 +1241,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1222#ifdef CONFIG_CPUSETS 1241#ifdef CONFIG_CPUSETS
1223 p->cpuset_mem_spread_rotor = NUMA_NO_NODE; 1242 p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
1224 p->cpuset_slab_spread_rotor = NUMA_NO_NODE; 1243 p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
1244 seqcount_init(&p->mems_allowed_seq);
1225#endif 1245#endif
1226#ifdef CONFIG_TRACE_IRQFLAGS 1246#ifdef CONFIG_TRACE_IRQFLAGS
1227 p->irq_events = 0; 1247 p->irq_events = 0;
@@ -1340,7 +1360,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1340 clear_all_latency_tracing(p); 1360 clear_all_latency_tracing(p);
1341 1361
1342 /* ok, now we should be set up.. */ 1362 /* ok, now we should be set up.. */
1343 p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); 1363 if (clone_flags & CLONE_THREAD)
1364 p->exit_signal = -1;
1365 else if (clone_flags & CLONE_PARENT)
1366 p->exit_signal = current->group_leader->exit_signal;
1367 else
1368 p->exit_signal = (clone_flags & CSIGNAL);
1369
1344 p->pdeath_signal = 0; 1370 p->pdeath_signal = 0;
1345 p->exit_state = 0; 1371 p->exit_state = 0;
1346 1372
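The CLONE_PARENT branch added above means a sibling created this way reports its exit with whatever exit_signal the caller's group leader already uses. A hedged user-space sketch of such a clone call; stack handling is deliberately simplified.

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdlib.h>

static int child_fn(void *arg)
{
	return 0;
}

static int spawn_sibling(void)
{
	const size_t stack_size = 64 * 1024;
	char *stack = malloc(stack_size);

	if (!stack)
		return -1;
	/* With CLONE_PARENT, the CSIGNAL bits passed here are ignored;
	 * the child inherits the group leader's exit_signal, per the
	 * branch added above. */
	return clone(child_fn, stack + stack_size, CLONE_PARENT | SIGCHLD, NULL);
}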
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 9815b8d1eed5..11f82a4d4eae 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -99,9 +99,9 @@ static void fake_signal_wake_up(struct task_struct *p)
99 * freeze_task - send a freeze request to given task 99 * freeze_task - send a freeze request to given task
100 * @p: task to send the request to 100 * @p: task to send the request to
101 * 101 *
102 * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE 102 * If @p is freezing, the freeze request is sent either by sending a fake
103 * flag and either sending a fake signal to it or waking it up, depending 103 * signal (if it's not a kernel thread) or waking it up (if it's a kernel
104 * on whether it has %PF_FREEZER_NOSIG set. 104 * thread).
105 * 105 *
106 * RETURNS: 106 * RETURNS:
107 * %false, if @p is not freezing or already frozen; %true, otherwise 107 * %false, if @p is not freezing or already frozen; %true, otherwise
diff --git a/kernel/futex.c b/kernel/futex.c
index 1614be20173d..72efa1e4359a 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2628,7 +2628,7 @@ void exit_robust_list(struct task_struct *curr)
2628long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, 2628long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2629 u32 __user *uaddr2, u32 val2, u32 val3) 2629 u32 __user *uaddr2, u32 val2, u32 val3)
2630{ 2630{
2631 int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK; 2631 int cmd = op & FUTEX_CMD_MASK;
2632 unsigned int flags = 0; 2632 unsigned int flags = 0;
2633 2633
2634 if (!(op & FUTEX_PRIVATE_FLAG)) 2634 if (!(op & FUTEX_PRIVATE_FLAG))
@@ -2641,49 +2641,44 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2641 } 2641 }
2642 2642
2643 switch (cmd) { 2643 switch (cmd) {
2644 case FUTEX_LOCK_PI:
2645 case FUTEX_UNLOCK_PI:
2646 case FUTEX_TRYLOCK_PI:
2647 case FUTEX_WAIT_REQUEUE_PI:
2648 case FUTEX_CMP_REQUEUE_PI:
2649 if (!futex_cmpxchg_enabled)
2650 return -ENOSYS;
2651 }
2652
2653 switch (cmd) {
2644 case FUTEX_WAIT: 2654 case FUTEX_WAIT:
2645 val3 = FUTEX_BITSET_MATCH_ANY; 2655 val3 = FUTEX_BITSET_MATCH_ANY;
2646 case FUTEX_WAIT_BITSET: 2656 case FUTEX_WAIT_BITSET:
2647 ret = futex_wait(uaddr, flags, val, timeout, val3); 2657 return futex_wait(uaddr, flags, val, timeout, val3);
2648 break;
2649 case FUTEX_WAKE: 2658 case FUTEX_WAKE:
2650 val3 = FUTEX_BITSET_MATCH_ANY; 2659 val3 = FUTEX_BITSET_MATCH_ANY;
2651 case FUTEX_WAKE_BITSET: 2660 case FUTEX_WAKE_BITSET:
2652 ret = futex_wake(uaddr, flags, val, val3); 2661 return futex_wake(uaddr, flags, val, val3);
2653 break;
2654 case FUTEX_REQUEUE: 2662 case FUTEX_REQUEUE:
2655 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); 2663 return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
2656 break;
2657 case FUTEX_CMP_REQUEUE: 2664 case FUTEX_CMP_REQUEUE:
2658 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); 2665 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
2659 break;
2660 case FUTEX_WAKE_OP: 2666 case FUTEX_WAKE_OP:
2661 ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); 2667 return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
2662 break;
2663 case FUTEX_LOCK_PI: 2668 case FUTEX_LOCK_PI:
2664 if (futex_cmpxchg_enabled) 2669 return futex_lock_pi(uaddr, flags, val, timeout, 0);
2665 ret = futex_lock_pi(uaddr, flags, val, timeout, 0);
2666 break;
2667 case FUTEX_UNLOCK_PI: 2670 case FUTEX_UNLOCK_PI:
2668 if (futex_cmpxchg_enabled) 2671 return futex_unlock_pi(uaddr, flags);
2669 ret = futex_unlock_pi(uaddr, flags);
2670 break;
2671 case FUTEX_TRYLOCK_PI: 2672 case FUTEX_TRYLOCK_PI:
2672 if (futex_cmpxchg_enabled) 2673 return futex_lock_pi(uaddr, flags, 0, timeout, 1);
2673 ret = futex_lock_pi(uaddr, flags, 0, timeout, 1);
2674 break;
2675 case FUTEX_WAIT_REQUEUE_PI: 2674 case FUTEX_WAIT_REQUEUE_PI:
2676 val3 = FUTEX_BITSET_MATCH_ANY; 2675 val3 = FUTEX_BITSET_MATCH_ANY;
2677 ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, 2676 return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
2678 uaddr2); 2677 uaddr2);
2679 break;
2680 case FUTEX_CMP_REQUEUE_PI: 2678 case FUTEX_CMP_REQUEUE_PI:
2681 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); 2679 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
2682 break;
2683 default:
2684 ret = -ENOSYS;
2685 } 2680 }
2686 return ret; 2681 return -ENOSYS;
2687} 2682}
2688 2683
2689 2684
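As the fall-through above shows, FUTEX_WAIT is handled as FUTEX_WAIT_BITSET with val3 = FUTEX_BITSET_MATCH_ANY. A hedged user-space sketch of the equivalent bitset call; with a NULL timeout the two behave the same (the timeout semantics differ otherwise).

#include <linux/futex.h>
#include <stddef.h>
#include <sys/syscall.h>
#include <unistd.h>

static int futex_wait_simple(int *uaddr, int expected)
{
	/* Equivalent to FUTEX_WAIT on *uaddr while it still holds
	 * the expected value, matching any wake bitset. */
	return syscall(SYS_futex, uaddr, FUTEX_WAIT_BITSET, expected,
		       NULL, NULL, FUTEX_BITSET_MATCH_ANY);
}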
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 5a38bf4de641..d8e323d12496 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -56,6 +56,16 @@ config GENERIC_IRQ_CHIP
56config IRQ_DOMAIN 56config IRQ_DOMAIN
57 bool 57 bool
58 58
59config IRQ_DOMAIN_DEBUG
60 bool "Expose hardware/virtual IRQ mapping via debugfs"
61 depends on IRQ_DOMAIN && DEBUG_FS
62 help
63 This option will show the mapping relationship between hardware irq
64 numbers and Linux irq numbers. The mapping is exposed via debugfs
65 in the file "virq_mapping".
66
67 If you don't know what this means you don't need it.
68
59# Support forced irq threading 69# Support forced irq threading
60config IRQ_FORCED_THREADING 70config IRQ_FORCED_THREADING
61 bool 71 bool
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index fb7db75ee0c8..6080f6bc8c33 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -16,6 +16,8 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
18 18
19#include <trace/events/irq.h>
20
19#include "internals.h" 21#include "internals.h"
20 22
21/** 23/**
@@ -61,8 +63,7 @@ int irq_set_irq_type(unsigned int irq, unsigned int type)
61 return -EINVAL; 63 return -EINVAL;
62 64
63 type &= IRQ_TYPE_SENSE_MASK; 65 type &= IRQ_TYPE_SENSE_MASK;
64 if (type != IRQ_TYPE_NONE) 66 ret = __irq_set_trigger(desc, irq, type);
65 ret = __irq_set_trigger(desc, irq, type);
66 irq_put_desc_busunlock(desc, flags); 67 irq_put_desc_busunlock(desc, flags);
67 return ret; 68 return ret;
68} 69}
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 470d08c82bbe..6ff84e6a954c 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -60,7 +60,7 @@ static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
60 * device interrupt, so no irq storm is lurking. If the 60 * device interrupt, so no irq storm is lurking. If the
61 * RUNTHREAD bit is already set, nothing to do. 61 * RUNTHREAD bit is already set, nothing to do.
62 */ 62 */
63 if (test_bit(IRQTF_DIED, &action->thread_flags) || 63 if ((action->thread->flags & PF_EXITING) ||
64 test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags)) 64 test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags))
65 return; 65 return;
66 66
@@ -110,6 +110,18 @@ static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
110 * threads_oneshot untouched and runs the thread another time. 110 * threads_oneshot untouched and runs the thread another time.
111 */ 111 */
112 desc->threads_oneshot |= action->thread_mask; 112 desc->threads_oneshot |= action->thread_mask;
113
114 /*
115 * We increment the threads_active counter in case we wake up
116 * the irq thread. The irq thread decrements the counter when
117 * it returns from the handler or in the exit path and wakes
118 * up waiters which are stuck in synchronize_irq() when the
119 * active count becomes zero. synchronize_irq() is serialized
120 * against this code (hard irq handler) via IRQS_INPROGRESS
121 * like the finalize_oneshot() code. See comment above.
122 */
123 atomic_inc(&desc->threads_active);
124
113 wake_up_process(action->thread); 125 wake_up_process(action->thread);
114} 126}
115 127
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 40378ff877e7..8e5c56b3b7d9 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -20,14 +20,12 @@ extern bool noirqdebug;
20/* 20/*
21 * Bits used by threaded handlers: 21 * Bits used by threaded handlers:
22 * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run 22 * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run
23 * IRQTF_DIED - handler thread died
24 * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed 23 * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
25 * IRQTF_AFFINITY - irq thread is requested to adjust affinity 24 * IRQTF_AFFINITY - irq thread is requested to adjust affinity
26 * IRQTF_FORCED_THREAD - irq action is force threaded 25 * IRQTF_FORCED_THREAD - irq action is force threaded
27 */ 26 */
28enum { 27enum {
29 IRQTF_RUNTHREAD, 28 IRQTF_RUNTHREAD,
30 IRQTF_DIED,
31 IRQTF_WARNED, 29 IRQTF_WARNED,
32 IRQTF_AFFINITY, 30 IRQTF_AFFINITY,
33 IRQTF_FORCED_THREAD, 31 IRQTF_FORCED_THREAD,
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 1f9e26526b69..3601f3fbf67c 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1,189 +1,793 @@
1#include <linux/debugfs.h>
2#include <linux/hardirq.h>
3#include <linux/interrupt.h>
1#include <linux/irq.h> 4#include <linux/irq.h>
5#include <linux/irqdesc.h>
2#include <linux/irqdomain.h> 6#include <linux/irqdomain.h>
3#include <linux/module.h> 7#include <linux/module.h>
4#include <linux/mutex.h> 8#include <linux/mutex.h>
5#include <linux/of.h> 9#include <linux/of.h>
6#include <linux/of_address.h> 10#include <linux/of_address.h>
11#include <linux/seq_file.h>
7#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/smp.h>
14#include <linux/fs.h>
15
16#define IRQ_DOMAIN_MAP_LEGACY 0 /* driver allocated fixed range of irqs.
17 * ie. legacy 8259, gets irqs 1..15 */
18#define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */
19#define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */
20#define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */
8 21
9static LIST_HEAD(irq_domain_list); 22static LIST_HEAD(irq_domain_list);
10static DEFINE_MUTEX(irq_domain_mutex); 23static DEFINE_MUTEX(irq_domain_mutex);
11 24
25static DEFINE_MUTEX(revmap_trees_mutex);
26static unsigned int irq_virq_count = NR_IRQS;
27static struct irq_domain *irq_default_domain;
28
12/** 29/**
13 * irq_domain_add() - Register an irq_domain 30 * irq_domain_alloc() - Allocate a new irq_domain data structure
14 * @domain: ptr to initialized irq_domain structure 31 * @of_node: optional device-tree node of the interrupt controller
32 * @revmap_type: type of reverse mapping to use
33 * @ops: map/unmap domain callbacks
34 * @host_data: Controller private data pointer
15 * 35 *
16 * Registers an irq_domain structure. The irq_domain must at a minimum be 36 * Allocates and initialize and irq_domain structure. Caller is expected to
17 * initialized with an ops structure pointer, and either a ->to_irq hook or 37 * register allocated irq_domain with irq_domain_register(). Returns pointer
18 * a valid irq_base value. Everything else is optional. 38 * to IRQ domain, or NULL on failure.
19 */ 39 */
20void irq_domain_add(struct irq_domain *domain) 40static struct irq_domain *irq_domain_alloc(struct device_node *of_node,
41 unsigned int revmap_type,
42 const struct irq_domain_ops *ops,
43 void *host_data)
21{ 44{
22 struct irq_data *d; 45 struct irq_domain *domain;
23 int hwirq, irq;
24 46
25 /* 47 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
26 * This assumes that the irq_domain owner has already allocated 48 if (WARN_ON(!domain))
27 * the irq_descs. This block will be removed when support for dynamic 49 return NULL;
28 * allocation of irq_descs is added to irq_domain. 50
29 */ 51 /* Fill structure */
30 irq_domain_for_each_irq(domain, hwirq, irq) { 52 domain->revmap_type = revmap_type;
31 d = irq_get_irq_data(irq); 53 domain->ops = ops;
32 if (!d) { 54 domain->host_data = host_data;
33 WARN(1, "error: assigning domain to non existant irq_desc"); 55 domain->of_node = of_node_get(of_node);
34 return; 56
35 } 57 return domain;
36 if (d->domain) { 58}
37 /* things are broken; just report, don't clean up */ 59
38 WARN(1, "error: irq_desc already assigned to a domain"); 60static void irq_domain_add(struct irq_domain *domain)
39 return; 61{
62 mutex_lock(&irq_domain_mutex);
63 list_add(&domain->link, &irq_domain_list);
64 mutex_unlock(&irq_domain_mutex);
65 pr_debug("irq: Allocated domain of type %d @0x%p\n",
66 domain->revmap_type, domain);
67}
68
69static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,
70 irq_hw_number_t hwirq)
71{
72 irq_hw_number_t first_hwirq = domain->revmap_data.legacy.first_hwirq;
73 int size = domain->revmap_data.legacy.size;
74
75 if (WARN_ON(hwirq < first_hwirq || hwirq >= first_hwirq + size))
76 return 0;
77 return hwirq - first_hwirq + domain->revmap_data.legacy.first_irq;
78}
79
80/**
81 * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain.
82 * @of_node: pointer to interrupt controller's device tree node.
83 * @size: total number of irqs in legacy mapping
84 * @first_irq: first number of irq block assigned to the domain
85 * @first_hwirq: first hwirq number to use for the translation. Should normally
86 * be '0', but a positive integer can be used if the effective
87 * hwirqs numbering does not begin at zero.
88 * @ops: map/unmap domain callbacks
89 * @host_data: Controller private data pointer
90 *
91 * Note: the map() callback will be called before this function returns
92 * for all legacy interrupts except 0 (which is always the invalid irq for
93 * a legacy controller).
94 */
95struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
96 unsigned int size,
97 unsigned int first_irq,
98 irq_hw_number_t first_hwirq,
99 const struct irq_domain_ops *ops,
100 void *host_data)
101{
102 struct irq_domain *domain;
103 unsigned int i;
104
105 domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LEGACY, ops, host_data);
106 if (!domain)
107 return NULL;
108
109 domain->revmap_data.legacy.first_irq = first_irq;
110 domain->revmap_data.legacy.first_hwirq = first_hwirq;
111 domain->revmap_data.legacy.size = size;
112
113 mutex_lock(&irq_domain_mutex);
114 /* Verify that all the irqs are available */
115 for (i = 0; i < size; i++) {
116 int irq = first_irq + i;
117 struct irq_data *irq_data = irq_get_irq_data(irq);
118
119 if (WARN_ON(!irq_data || irq_data->domain)) {
120 mutex_unlock(&irq_domain_mutex);
121 of_node_put(domain->of_node);
122 kfree(domain);
123 return NULL;
40 } 124 }
41 d->domain = domain;
42 d->hwirq = hwirq;
43 } 125 }
44 126
45 mutex_lock(&irq_domain_mutex); 127 /* Claim all of the irqs before registering a legacy domain */
46 list_add(&domain->list, &irq_domain_list); 128 for (i = 0; i < size; i++) {
129 struct irq_data *irq_data = irq_get_irq_data(first_irq + i);
130 irq_data->hwirq = first_hwirq + i;
131 irq_data->domain = domain;
132 }
47 mutex_unlock(&irq_domain_mutex); 133 mutex_unlock(&irq_domain_mutex);
134
135 for (i = 0; i < size; i++) {
136 int irq = first_irq + i;
137 int hwirq = first_hwirq + i;
138
139 /* IRQ0 gets ignored */
140 if (!irq)
141 continue;
142
143 /* Legacy flags are left to default at this point,
144 * one can then use irq_create_mapping() to
145 * explicitly change them
146 */
147 ops->map(domain, irq, hwirq);
148
149 /* Clear norequest flags */
150 irq_clear_status_flags(irq, IRQ_NOREQUEST);
151 }
152
153 irq_domain_add(domain);
154 return domain;
155}
156
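A hedged driver-side sketch of how a legacy controller might use the function above; the chip, device node and irq numbers are hypothetical, not taken from this patch.

#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

static struct irq_chip example_chip;		/* hypothetical controller chip */

static int example_map(struct irq_domain *d, unsigned int virq,
		       irq_hw_number_t hwirq)
{
	irq_set_chip_and_handler(virq, &example_chip, handle_level_irq);
	return 0;
}

static const struct irq_domain_ops example_legacy_ops = {
	.map = example_map,
};

static struct irq_domain *example_probe(struct device_node *np)
{
	/* 16 pre-allocated Linux irqs starting at 16, backing hwirqs 0..15 */
	return irq_domain_add_legacy(np, 16, 16, 0, &example_legacy_ops, NULL);
}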
157/**
 158 * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain.
159 * @of_node: pointer to interrupt controller's device tree node.
160 * @ops: map/unmap domain callbacks
161 * @host_data: Controller private data pointer
162 */
163struct irq_domain *irq_domain_add_linear(struct device_node *of_node,
164 unsigned int size,
165 const struct irq_domain_ops *ops,
166 void *host_data)
167{
168 struct irq_domain *domain;
169 unsigned int *revmap;
170
171 revmap = kzalloc(sizeof(*revmap) * size, GFP_KERNEL);
172 if (WARN_ON(!revmap))
173 return NULL;
174
175 domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, ops, host_data);
176 if (!domain) {
177 kfree(revmap);
178 return NULL;
179 }
180 domain->revmap_data.linear.size = size;
181 domain->revmap_data.linear.revmap = revmap;
182 irq_domain_add(domain);
183 return domain;
184}
185
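Unlike the legacy variant, a linear domain normally defers mapping until an interrupt is actually used. A hedged sketch of that pattern; the names are hypothetical, and irq_create_mapping() is part of the same API (defined later in this file).

#include <linux/irqdomain.h>

static struct irq_domain *example_linear_domain;	/* set up at probe time */

static unsigned int example_hwirq_to_virq(irq_hw_number_t hwirq)
{
	/* Allocates a Linux irq and calls ->map() on first use, then
	 * records it in the linear revmap for fast reverse lookups. */
	return irq_create_mapping(example_linear_domain, hwirq);
}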
186struct irq_domain *irq_domain_add_nomap(struct device_node *of_node,
187 const struct irq_domain_ops *ops,
188 void *host_data)
189{
190 struct irq_domain *domain = irq_domain_alloc(of_node,
191 IRQ_DOMAIN_MAP_NOMAP, ops, host_data);
192 if (domain)
193 irq_domain_add(domain);
194 return domain;
195}
196
197/**
198 * irq_domain_add_tree()
199 * @of_node: pointer to interrupt controller's device tree node.
200 * @ops: map/unmap domain callbacks
201 *
202 * Note: The radix tree will be allocated later during boot automatically
203 * (the reverse mapping will use the slow path until that happens).
204 */
205struct irq_domain *irq_domain_add_tree(struct device_node *of_node,
206 const struct irq_domain_ops *ops,
207 void *host_data)
208{
209 struct irq_domain *domain = irq_domain_alloc(of_node,
210 IRQ_DOMAIN_MAP_TREE, ops, host_data);
211 if (domain) {
212 INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL);
213 irq_domain_add(domain);
214 }
215 return domain;
48} 216}
49 217
50/** 218/**
51 * irq_domain_del() - Unregister an irq_domain 219 * irq_find_host() - Locates a domain for a given device node
52 * @domain: ptr to registered irq_domain. 220 * @node: device-tree node of the interrupt controller
53 */ 221 */
54void irq_domain_del(struct irq_domain *domain) 222struct irq_domain *irq_find_host(struct device_node *node)
55{ 223{
56 struct irq_data *d; 224 struct irq_domain *h, *found = NULL;
57 int hwirq, irq; 225 int rc;
58 226
227 /* We might want to match the legacy controller last since
228 * it might potentially be set to match all interrupts in
 229 * the absence of a device node. This hasn't been a problem
 230 * so far, though...
231 */
59 mutex_lock(&irq_domain_mutex); 232 mutex_lock(&irq_domain_mutex);
60 list_del(&domain->list); 233 list_for_each_entry(h, &irq_domain_list, link) {
234 if (h->ops->match)
235 rc = h->ops->match(h, node);
236 else
237 rc = (h->of_node != NULL) && (h->of_node == node);
238
239 if (rc) {
240 found = h;
241 break;
242 }
243 }
61 mutex_unlock(&irq_domain_mutex); 244 mutex_unlock(&irq_domain_mutex);
245 return found;
246}
247EXPORT_SYMBOL_GPL(irq_find_host);
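
As a small usage sketch (not from the patch): code that only holds a device-tree node can recover the owning domain with irq_find_host() and map through it. The helper name is invented:

static unsigned int map_through_node(struct device_node *pic_np,
				     irq_hw_number_t hwirq)
{
	struct irq_domain *d = irq_find_host(pic_np);

	return d ? irq_create_mapping(d, hwirq) : 0;
}
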
248
249/**
250 * irq_set_default_host() - Set a "default" irq domain
251 * @domain: default domain pointer
252 *
253 * For convenience, it's possible to set a "default" domain that will be used
254 * whenever NULL is passed to irq_create_mapping(). It makes life easier for
255 * platforms that want to manipulate a few hard coded interrupt numbers that
256 * aren't properly represented in the device-tree.
257 */
258void irq_set_default_host(struct irq_domain *domain)
259{
260 pr_debug("irq: Default domain set to @0x%p\n", domain);
261
262 irq_default_domain = domain;
263}
264
265/**
266 * irq_set_virq_count() - Set the maximum number of linux irqs
267 * @count: number of linux irqs, capped with NR_IRQS
268 *
 269 * This is mainly for use by platforms like iSeries that want to program
270 * the virtual irq number in the controller to avoid the reverse mapping
271 */
272void irq_set_virq_count(unsigned int count)
273{
274 pr_debug("irq: Trying to set virq count to %d\n", count);
62 275
63 /* Clear the irq_domain assignments */ 276 BUG_ON(count < NUM_ISA_INTERRUPTS);
64 irq_domain_for_each_irq(domain, hwirq, irq) { 277 if (count < NR_IRQS)
65 d = irq_get_irq_data(irq); 278 irq_virq_count = count;
66 d->domain = NULL; 279}
280
281static int irq_setup_virq(struct irq_domain *domain, unsigned int virq,
282 irq_hw_number_t hwirq)
283{
284 struct irq_data *irq_data = irq_get_irq_data(virq);
285
286 irq_data->hwirq = hwirq;
287 irq_data->domain = domain;
288 if (domain->ops->map(domain, virq, hwirq)) {
289 pr_debug("irq: -> mapping failed, freeing\n");
290 irq_data->domain = NULL;
291 irq_data->hwirq = 0;
292 return -1;
67 } 293 }
294
295 irq_clear_status_flags(virq, IRQ_NOREQUEST);
296
297 return 0;
68} 298}
69 299
70#if defined(CONFIG_OF_IRQ)
71/** 300/**
72 * irq_create_of_mapping() - Map a linux irq number from a DT interrupt spec 301 * irq_create_direct_mapping() - Allocate an irq for direct mapping
302 * @domain: domain to allocate the irq for or NULL for default domain
73 * 303 *
74 * Used by the device tree interrupt mapping code to translate a device tree 304 * This routine is used for irq controllers which can choose the hardware
75 * interrupt specifier to a valid linux irq number. Returns either a valid 305 * interrupt numbers they generate. In such a case it's simplest to use
76 * linux IRQ number or 0. 306 * the linux irq as the hardware interrupt number.
307 */
308unsigned int irq_create_direct_mapping(struct irq_domain *domain)
309{
310 unsigned int virq;
311
312 if (domain == NULL)
313 domain = irq_default_domain;
314
315 BUG_ON(domain == NULL);
316 WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP);
317
318 virq = irq_alloc_desc_from(1, 0);
319 if (!virq) {
320 pr_debug("irq: create_direct virq allocation failed\n");
321 return 0;
322 }
323 if (virq >= irq_virq_count) {
324 pr_err("ERROR: no free irqs available below %i maximum\n",
325 irq_virq_count);
326 irq_free_desc(virq);
327 return 0;
328 }
329
330 pr_debug("irq: create_direct obtained virq %d\n", virq);
331
332 if (irq_setup_virq(domain, virq, virq)) {
333 irq_free_desc(virq);
334 return 0;
335 }
336
337 return virq;
338}
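
A sketch of the nomap case these two functions serve, i.e. a controller whose vector numbers are software-programmable; the "psr" name is invented, and the domain is assumed to have been created earlier with irq_domain_add_nomap():

#include <linux/irqdomain.h>

static struct irq_domain *psr_domain;	/* set up elsewhere via irq_domain_add_nomap(np, &irq_domain_simple_ops, NULL) */

static unsigned int psr_alloc_irq(void)
{
	unsigned int virq = irq_create_direct_mapping(psr_domain);

	/* if (virq) the driver would program "virq" into the hardware as the vector to raise */
	return virq;
}
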
339
340/**
341 * irq_create_mapping() - Map a hardware interrupt into linux irq space
342 * @domain: domain owning this hardware interrupt or NULL for default domain
343 * @hwirq: hardware irq number in that domain space
77 * 344 *
78 * When the caller no longer need the irq number returned by this function it 345 * Only one mapping per hardware interrupt is permitted. Returns a linux
79 * should arrange to call irq_dispose_mapping(). 346 * irq number.
 347 * If the sense/trigger is to be specified, irq_set_irq_type() should be called
 348 * on the number returned from this call.
80 */ 349 */
350unsigned int irq_create_mapping(struct irq_domain *domain,
351 irq_hw_number_t hwirq)
352{
353 unsigned int virq, hint;
354
355 pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
356
 357 /* Look for default domain if necessary */
358 if (domain == NULL)
359 domain = irq_default_domain;
360 if (domain == NULL) {
361 printk(KERN_WARNING "irq_create_mapping called for"
362 " NULL domain, hwirq=%lx\n", hwirq);
363 WARN_ON(1);
364 return 0;
365 }
366 pr_debug("irq: -> using domain @%p\n", domain);
367
368 /* Check if mapping already exists */
369 virq = irq_find_mapping(domain, hwirq);
370 if (virq) {
371 pr_debug("irq: -> existing mapping on virq %d\n", virq);
372 return virq;
373 }
374
375 /* Get a virtual interrupt number */
376 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
377 return irq_domain_legacy_revmap(domain, hwirq);
378
379 /* Allocate a virtual interrupt number */
380 hint = hwirq % irq_virq_count;
381 if (hint == 0)
382 hint++;
383 virq = irq_alloc_desc_from(hint, 0);
384 if (!virq)
385 virq = irq_alloc_desc_from(1, 0);
386 if (!virq) {
387 pr_debug("irq: -> virq allocation failed\n");
388 return 0;
389 }
390
391 if (irq_setup_virq(domain, virq, hwirq)) {
392 if (domain->revmap_type != IRQ_DOMAIN_MAP_LEGACY)
393 irq_free_desc(virq);
394 return 0;
395 }
396
397 pr_debug("irq: irq %lu on domain %s mapped to virtual irq %u\n",
398 hwirq, domain->of_node ? domain->of_node->full_name : "null", virq);
399
400 return virq;
401}
402EXPORT_SYMBOL_GPL(irq_create_mapping);
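
To illustrate the call (a sketch, not taken from the patch): a GPIO driver's to_irq hook typically just creates, or reuses, the mapping for the pin's hwirq. struct foo_gpio and its fields are hypothetical:

#include <linux/gpio.h>
#include <linux/irqdomain.h>

struct foo_gpio {
	struct gpio_chip chip;
	struct irq_domain *domain;
};

static int foo_gpio_to_irq(struct gpio_chip *chip, unsigned offset)
{
	struct foo_gpio *g = container_of(chip, struct foo_gpio, chip);

	/* returns the existing virq, or allocates one on first use */
	return irq_create_mapping(g->domain, offset);
}
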
403
81unsigned int irq_create_of_mapping(struct device_node *controller, 404unsigned int irq_create_of_mapping(struct device_node *controller,
82 const u32 *intspec, unsigned int intsize) 405 const u32 *intspec, unsigned int intsize)
83{ 406{
84 struct irq_domain *domain; 407 struct irq_domain *domain;
85 unsigned long hwirq; 408 irq_hw_number_t hwirq;
86 unsigned int irq, type; 409 unsigned int type = IRQ_TYPE_NONE;
87 int rc = -EINVAL; 410 unsigned int virq;
88 411
89 /* Find a domain which can translate the irq spec */ 412 domain = controller ? irq_find_host(controller) : irq_default_domain;
90 mutex_lock(&irq_domain_mutex); 413 if (!domain) {
91 list_for_each_entry(domain, &irq_domain_list, list) { 414#ifdef CONFIG_MIPS
92 if (!domain->ops->dt_translate) 415 /*
93 continue; 416 * Workaround to avoid breaking interrupt controller drivers
94 rc = domain->ops->dt_translate(domain, controller, 417 * that don't yet register an irq_domain. This is temporary
95 intspec, intsize, &hwirq, &type); 418 * code. ~~~gcl, Feb 24, 2012
96 if (rc == 0) 419 *
97 break; 420 * Scheduled for removal in Linux v3.6. That should be enough
421 * time.
422 */
423 if (intsize > 0)
424 return intspec[0];
425#endif
426 printk(KERN_WARNING "irq: no irq domain found for %s !\n",
427 controller->full_name);
428 return 0;
98 } 429 }
99 mutex_unlock(&irq_domain_mutex);
100 430
101 if (rc != 0) 431 /* If domain has no translation, then we assume interrupt line */
102 return 0; 432 if (domain->ops->xlate == NULL)
433 hwirq = intspec[0];
434 else {
435 if (domain->ops->xlate(domain, controller, intspec, intsize,
436 &hwirq, &type))
437 return 0;
438 }
439
440 /* Create mapping */
441 virq = irq_create_mapping(domain, hwirq);
442 if (!virq)
443 return virq;
103 444
104 irq = irq_domain_to_irq(domain, hwirq); 445 /* Set type if specified and different than the current one */
105 if (type != IRQ_TYPE_NONE) 446 if (type != IRQ_TYPE_NONE &&
106 irq_set_irq_type(irq, type); 447 type != (irqd_get_trigger_type(irq_get_irq_data(virq))))
107 pr_debug("%s: mapped hwirq=%i to irq=%i, flags=%x\n", 448 irq_set_irq_type(virq, type);
108 controller->full_name, (int)hwirq, irq, type); 449 return virq;
109 return irq;
110} 450}
111EXPORT_SYMBOL_GPL(irq_create_of_mapping); 451EXPORT_SYMBOL_GPL(irq_create_of_mapping);
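
Most device drivers reach this function indirectly: irq_of_parse_and_map() in drivers/of/irq.c parses the "interrupts" property and hands the specifier to irq_create_of_mapping(). A hedged sketch of that calling side; foo_isr and priv are hypothetical:

#include <linux/interrupt.h>
#include <linux/of_irq.h>

static int foo_request_first_irq(struct device_node *np,
				 irq_handler_t foo_isr, void *priv)
{
	unsigned int virq = irq_of_parse_and_map(np, 0);	/* index 0: first "interrupts" entry */

	if (!virq)
		return -ENXIO;
	return request_irq(virq, foo_isr, 0, "foo", priv);
}
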
112 452
113/** 453/**
114 * irq_dispose_mapping() - Discard a mapping created by irq_create_of_mapping() 454 * irq_dispose_mapping() - Unmap an interrupt
115 * @irq: linux irq number to be discarded 455 * @virq: linux irq number of the interrupt to unmap
456 */
457void irq_dispose_mapping(unsigned int virq)
458{
459 struct irq_data *irq_data = irq_get_irq_data(virq);
460 struct irq_domain *domain;
461 irq_hw_number_t hwirq;
462
463 if (!virq || !irq_data)
464 return;
465
466 domain = irq_data->domain;
467 if (WARN_ON(domain == NULL))
468 return;
469
470 /* Never unmap legacy interrupts */
471 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
472 return;
473
474 irq_set_status_flags(virq, IRQ_NOREQUEST);
475
476 /* remove chip and handler */
477 irq_set_chip_and_handler(virq, NULL, NULL);
478
479 /* Make sure it's completed */
480 synchronize_irq(virq);
481
482 /* Tell the PIC about it */
483 if (domain->ops->unmap)
484 domain->ops->unmap(domain, virq);
485 smp_mb();
486
487 /* Clear reverse map */
488 hwirq = irq_data->hwirq;
489 switch(domain->revmap_type) {
490 case IRQ_DOMAIN_MAP_LINEAR:
491 if (hwirq < domain->revmap_data.linear.size)
492 domain->revmap_data.linear.revmap[hwirq] = 0;
493 break;
494 case IRQ_DOMAIN_MAP_TREE:
495 mutex_lock(&revmap_trees_mutex);
496 radix_tree_delete(&domain->revmap_data.tree, hwirq);
497 mutex_unlock(&revmap_trees_mutex);
498 break;
499 }
500
501 irq_free_desc(virq);
502}
503EXPORT_SYMBOL_GPL(irq_dispose_mapping);
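
And the matching teardown, as a two-line sketch for a driver remove path (virq and priv as obtained in the request sketch above):

	free_irq(virq, priv);		/* stop using the interrupt first */
	irq_dispose_mapping(virq);	/* then drop the mapping created earlier */
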
504
505/**
506 * irq_find_mapping() - Find a linux irq from an hw irq number.
507 * @domain: domain owning this hardware interrupt
508 * @hwirq: hardware irq number in that domain space
509 *
510 * This is a slow path, for use by generic code. It's expected that an
511 * irq controller implementation directly calls the appropriate low level
512 * mapping function.
513 */
514unsigned int irq_find_mapping(struct irq_domain *domain,
515 irq_hw_number_t hwirq)
516{
517 unsigned int i;
518 unsigned int hint = hwirq % irq_virq_count;
519
 520 /* Look for default domain if necessary */
521 if (domain == NULL)
522 domain = irq_default_domain;
523 if (domain == NULL)
524 return 0;
525
526 /* legacy -> bail early */
527 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
528 return irq_domain_legacy_revmap(domain, hwirq);
529
530 /* Slow path does a linear search of the map */
531 if (hint == 0)
532 hint = 1;
533 i = hint;
534 do {
535 struct irq_data *data = irq_get_irq_data(i);
536 if (data && (data->domain == domain) && (data->hwirq == hwirq))
537 return i;
538 i++;
539 if (i >= irq_virq_count)
540 i = 1;
541 } while(i != hint);
542 return 0;
543}
544EXPORT_SYMBOL_GPL(irq_find_mapping);
545
546/**
547 * irq_radix_revmap_lookup() - Find a linux irq from a hw irq number.
548 * @domain: domain owning this hardware interrupt
549 * @hwirq: hardware irq number in that domain space
116 * 550 *
117 * Calling this function indicates the caller no longer needs a reference to 551 * This is a fast path, for use by irq controller code that uses radix tree
118 * the linux irq number returned by a prior call to irq_create_of_mapping(). 552 * revmaps
119 */ 553 */
120void irq_dispose_mapping(unsigned int irq) 554unsigned int irq_radix_revmap_lookup(struct irq_domain *domain,
555 irq_hw_number_t hwirq)
121{ 556{
557 struct irq_data *irq_data;
558
559 if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_TREE))
560 return irq_find_mapping(domain, hwirq);
561
562 /*
563 * Freeing an irq can delete nodes along the path to
564 * do the lookup via call_rcu.
565 */
566 rcu_read_lock();
567 irq_data = radix_tree_lookup(&domain->revmap_data.tree, hwirq);
568 rcu_read_unlock();
569
122 /* 570 /*
123 * nothing yet; will be filled when support for dynamic allocation of 571 * If found in radix tree, then fine.
 124 * irq_descs is added to irq_domain 572 * Else fall back to linear lookup - this should not happen in practice
573 * as it means that we failed to insert the node in the radix tree.
125 */ 574 */
575 return irq_data ? irq_data->irq : irq_find_mapping(domain, hwirq);
126} 576}
127EXPORT_SYMBOL_GPL(irq_dispose_mapping);
128 577
129int irq_domain_simple_dt_translate(struct irq_domain *d, 578/**
130 struct device_node *controller, 579 * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping.
131 const u32 *intspec, unsigned int intsize, 580 * @domain: domain owning this hardware interrupt
132 unsigned long *out_hwirq, unsigned int *out_type) 581 * @virq: linux irq number
582 * @hwirq: hardware irq number in that domain space
583 *
584 * This is for use by irq controllers that use a radix tree reverse
585 * mapping for fast lookup.
586 */
587void irq_radix_revmap_insert(struct irq_domain *domain, unsigned int virq,
588 irq_hw_number_t hwirq)
133{ 589{
134 if (d->of_node != controller) 590 struct irq_data *irq_data = irq_get_irq_data(virq);
135 return -EINVAL; 591
136 if (intsize < 1) 592 if (WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_TREE))
137 return -EINVAL; 593 return;
138 if (d->nr_irq && ((intspec[0] < d->hwirq_base) || 594
139 (intspec[0] >= d->hwirq_base + d->nr_irq))) 595 if (virq) {
140 return -EINVAL; 596 mutex_lock(&revmap_trees_mutex);
597 radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data);
598 mutex_unlock(&revmap_trees_mutex);
599 }
600}
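
A sketch of where this insert usually happens: the ->map() callback of a tree-revmap controller populates the radix tree next to the normal chip setup. The bar_* names are invented:

#include <linux/irq.h>
#include <linux/irqdomain.h>

static struct irq_chip bar_irq_chip = {
	.name	= "bar",	/* hypothetical; mask/unmask hooks omitted */
};

static int bar_irq_map(struct irq_domain *d, unsigned int virq,
		       irq_hw_number_t hwirq)
{
	irq_set_chip_and_handler(virq, &bar_irq_chip, handle_edge_irq);
	irq_radix_revmap_insert(d, virq, hwirq);	/* enables irq_radix_revmap_lookup() */
	return 0;
}
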
601
602/**
603 * irq_linear_revmap() - Find a linux irq from a hw irq number.
604 * @domain: domain owning this hardware interrupt
605 * @hwirq: hardware irq number in that domain space
606 *
607 * This is a fast path, for use by irq controller code that uses linear
 608 * revmaps. It falls back to the slow path if the revmap doesn't exist
 609 * yet and will create the revmap entry with appropriate locking.
610 */
611unsigned int irq_linear_revmap(struct irq_domain *domain,
612 irq_hw_number_t hwirq)
613{
614 unsigned int *revmap;
615
616 if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR))
617 return irq_find_mapping(domain, hwirq);
618
619 /* Check revmap bounds */
620 if (unlikely(hwirq >= domain->revmap_data.linear.size))
621 return irq_find_mapping(domain, hwirq);
622
623 /* Check if revmap was allocated */
624 revmap = domain->revmap_data.linear.revmap;
625 if (unlikely(revmap == NULL))
626 return irq_find_mapping(domain, hwirq);
627
628 /* Fill up revmap with slow path if no mapping found */
629 if (unlikely(!revmap[hwirq]))
630 revmap[hwirq] = irq_find_mapping(domain, hwirq);
631
632 return revmap[hwirq];
633}
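
For context, this fast path is what a cascaded interrupt handler calls on every incoming interrupt; irq_find_mapping() would also work, at the cost of the linear search documented above. A sketch, with the pending-register read left as a placeholder:

#include <linux/irq.h>
#include <linux/irqdomain.h>

static void baz_irq_cascade(unsigned int irq, struct irq_desc *desc)
{
	struct irq_domain *d = irq_desc_get_handler_data(desc);
	unsigned long pending = 0;	/* hypothetical: read the controller's pending register here */
	unsigned int hwirq;

	for_each_set_bit(hwirq, &pending, 32)
		generic_handle_irq(irq_linear_revmap(d, hwirq));
}

Such a handler would be installed with irq_set_chained_handler() on the parent interrupt.
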
634
635#ifdef CONFIG_IRQ_DOMAIN_DEBUG
636static int virq_debug_show(struct seq_file *m, void *private)
637{
638 unsigned long flags;
639 struct irq_desc *desc;
640 const char *p;
641 static const char none[] = "none";
642 void *data;
643 int i;
644
645 seq_printf(m, "%-5s %-7s %-15s %-18s %s\n", "virq", "hwirq",
646 "chip name", "chip data", "domain name");
647
648 for (i = 1; i < nr_irqs; i++) {
649 desc = irq_to_desc(i);
650 if (!desc)
651 continue;
652
653 raw_spin_lock_irqsave(&desc->lock, flags);
654
655 if (desc->action && desc->action->handler) {
656 struct irq_chip *chip;
657
658 seq_printf(m, "%5d ", i);
659 seq_printf(m, "0x%05lx ", desc->irq_data.hwirq);
660
661 chip = irq_desc_get_chip(desc);
662 if (chip && chip->name)
663 p = chip->name;
664 else
665 p = none;
666 seq_printf(m, "%-15s ", p);
667
668 data = irq_desc_get_chip_data(desc);
669 seq_printf(m, "0x%16p ", data);
670
671 if (desc->irq_data.domain && desc->irq_data.domain->of_node)
672 p = desc->irq_data.domain->of_node->full_name;
673 else
674 p = none;
675 seq_printf(m, "%s\n", p);
676 }
677
678 raw_spin_unlock_irqrestore(&desc->lock, flags);
679 }
680
681 return 0;
682}
141 683
684static int virq_debug_open(struct inode *inode, struct file *file)
685{
686 return single_open(file, virq_debug_show, inode->i_private);
687}
688
689static const struct file_operations virq_debug_fops = {
690 .open = virq_debug_open,
691 .read = seq_read,
692 .llseek = seq_lseek,
693 .release = single_release,
694};
695
696static int __init irq_debugfs_init(void)
697{
698 if (debugfs_create_file("irq_domain_mapping", S_IRUGO, NULL,
699 NULL, &virq_debug_fops) == NULL)
700 return -ENOMEM;
701
702 return 0;
703}
704__initcall(irq_debugfs_init);
705#endif /* CONFIG_IRQ_DOMAIN_DEBUG */
706
707int irq_domain_simple_map(struct irq_domain *d, unsigned int irq,
708 irq_hw_number_t hwirq)
709{
710 return 0;
711}
712
713/**
714 * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings
715 *
716 * Device Tree IRQ specifier translation function which works with one cell
717 * bindings where the cell value maps directly to the hwirq number.
718 */
719int irq_domain_xlate_onecell(struct irq_domain *d, struct device_node *ctrlr,
720 const u32 *intspec, unsigned int intsize,
721 unsigned long *out_hwirq, unsigned int *out_type)
722{
723 if (WARN_ON(intsize < 1))
724 return -EINVAL;
142 *out_hwirq = intspec[0]; 725 *out_hwirq = intspec[0];
143 *out_type = IRQ_TYPE_NONE; 726 *out_type = IRQ_TYPE_NONE;
144 if (intsize > 1)
145 *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
146 return 0; 727 return 0;
147} 728}
729EXPORT_SYMBOL_GPL(irq_domain_xlate_onecell);
148 730
149/** 731/**
150 * irq_domain_create_simple() - Set up a 'simple' translation range 732 * irq_domain_xlate_twocell() - Generic xlate for direct two cell bindings
733 *
734 * Device Tree IRQ specifier translation function which works with two cell
735 * bindings where the cell values map directly to the hwirq number
736 * and linux irq flags.
151 */ 737 */
152void irq_domain_add_simple(struct device_node *controller, int irq_base) 738int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr,
739 const u32 *intspec, unsigned int intsize,
740 irq_hw_number_t *out_hwirq, unsigned int *out_type)
153{ 741{
154 struct irq_domain *domain; 742 if (WARN_ON(intsize < 2))
155 743 return -EINVAL;
156 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 744 *out_hwirq = intspec[0];
157 if (!domain) { 745 *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
158 WARN_ON(1); 746 return 0;
159 return; 747}
160 } 748EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell);
161 749
162 domain->irq_base = irq_base; 750/**
163 domain->of_node = of_node_get(controller); 751 * irq_domain_xlate_onetwocell() - Generic xlate for one or two cell bindings
164 domain->ops = &irq_domain_simple_ops; 752 *
165 irq_domain_add(domain); 753 * Device Tree IRQ specifier translation function which works with either one
754 * or two cell bindings where the cell values map directly to the hwirq number
755 * and linux irq flags.
756 *
757 * Note: don't use this function unless your interrupt controller explicitly
758 * supports both one and two cell bindings. For the majority of controllers
759 * the _onecell() or _twocell() variants above should be used.
760 */
761int irq_domain_xlate_onetwocell(struct irq_domain *d,
762 struct device_node *ctrlr,
763 const u32 *intspec, unsigned int intsize,
764 unsigned long *out_hwirq, unsigned int *out_type)
765{
766 if (WARN_ON(intsize < 1))
767 return -EINVAL;
768 *out_hwirq = intspec[0];
769 *out_type = (intsize > 1) ? intspec[1] : IRQ_TYPE_NONE;
770 return 0;
166} 771}
167EXPORT_SYMBOL_GPL(irq_domain_add_simple); 772EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell);
168 773
774const struct irq_domain_ops irq_domain_simple_ops = {
775 .map = irq_domain_simple_map,
776 .xlate = irq_domain_xlate_onetwocell,
777};
778EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
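
As a usage note for the block above (a sketch, assuming a controller that needs no per-irq setup at map time): irq_domain_simple_ops is sufficient on its own, and its onetwocell translator accepts either specifier form:

static struct irq_domain *simple_example_domain(struct device_node *np)
{
	/*
	 * Matches device-tree nodes using either
	 *	interrupts = <5>;	(hwirq only)
	 * or
	 *	interrupts = <5 4>;	(hwirq + trigger, 4 == IRQ_TYPE_LEVEL_HIGH)
	 */
	return irq_domain_add_linear(np, 64, &irq_domain_simple_ops, NULL);
}
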
779
780#ifdef CONFIG_OF_IRQ
169void irq_domain_generate_simple(const struct of_device_id *match, 781void irq_domain_generate_simple(const struct of_device_id *match,
170 u64 phys_base, unsigned int irq_start) 782 u64 phys_base, unsigned int irq_start)
171{ 783{
172 struct device_node *node; 784 struct device_node *node;
173 pr_info("looking for phys_base=%llx, irq_start=%i\n", 785 pr_debug("looking for phys_base=%llx, irq_start=%i\n",
174 (unsigned long long) phys_base, (int) irq_start); 786 (unsigned long long) phys_base, (int) irq_start);
175 node = of_find_matching_node_by_address(NULL, match, phys_base); 787 node = of_find_matching_node_by_address(NULL, match, phys_base);
176 if (node) 788 if (node)
177 irq_domain_add_simple(node, irq_start); 789 irq_domain_add_legacy(node, 32, irq_start, 0,
178 else 790 &irq_domain_simple_ops, NULL);
179 pr_info("no node found\n");
180} 791}
181EXPORT_SYMBOL_GPL(irq_domain_generate_simple); 792EXPORT_SYMBOL_GPL(irq_domain_generate_simple);
182#endif /* CONFIG_OF_IRQ */ 793#endif
183
184struct irq_domain_ops irq_domain_simple_ops = {
185#ifdef CONFIG_OF_IRQ
186 .dt_translate = irq_domain_simple_dt_translate,
187#endif /* CONFIG_OF_IRQ */
188};
189EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0f0d4704ddd8..b0ccd1ac2d6a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -759,6 +759,13 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc,
759 return ret; 759 return ret;
760} 760}
761 761
762static void wake_threads_waitq(struct irq_desc *desc)
763{
764 if (atomic_dec_and_test(&desc->threads_active) &&
765 waitqueue_active(&desc->wait_for_threads))
766 wake_up(&desc->wait_for_threads);
767}
768
762/* 769/*
763 * Interrupt handler thread 770 * Interrupt handler thread
764 */ 771 */
@@ -771,57 +778,41 @@ static int irq_thread(void *data)
771 struct irq_desc *desc = irq_to_desc(action->irq); 778 struct irq_desc *desc = irq_to_desc(action->irq);
772 irqreturn_t (*handler_fn)(struct irq_desc *desc, 779 irqreturn_t (*handler_fn)(struct irq_desc *desc,
773 struct irqaction *action); 780 struct irqaction *action);
774 int wake;
775 781
776 if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD, 782 if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD,
777 &action->thread_flags)) 783 &action->thread_flags))
778 handler_fn = irq_forced_thread_fn; 784 handler_fn = irq_forced_thread_fn;
779 else 785 else
780 handler_fn = irq_thread_fn; 786 handler_fn = irq_thread_fn;
781 787
782 sched_setscheduler(current, SCHED_FIFO, &param); 788 sched_setscheduler(current, SCHED_FIFO, &param);
783 current->irqaction = action; 789 current->irq_thread = 1;
784 790
785 while (!irq_wait_for_interrupt(action)) { 791 while (!irq_wait_for_interrupt(action)) {
792 irqreturn_t action_ret;
786 793
787 irq_thread_check_affinity(desc, action); 794 irq_thread_check_affinity(desc, action);
788 795
789 atomic_inc(&desc->threads_active); 796 action_ret = handler_fn(desc, action);
797 if (!noirqdebug)
798 note_interrupt(action->irq, desc, action_ret);
790 799
791 raw_spin_lock_irq(&desc->lock); 800 wake_threads_waitq(desc);
792 if (unlikely(irqd_irq_disabled(&desc->irq_data))) {
793 /*
794 * CHECKME: We might need a dedicated
795 * IRQ_THREAD_PENDING flag here, which
796 * retriggers the thread in check_irq_resend()
797 * but AFAICT IRQS_PENDING should be fine as it
798 * retriggers the interrupt itself --- tglx
799 */
800 desc->istate |= IRQS_PENDING;
801 raw_spin_unlock_irq(&desc->lock);
802 } else {
803 irqreturn_t action_ret;
804
805 raw_spin_unlock_irq(&desc->lock);
806 action_ret = handler_fn(desc, action);
807 if (!noirqdebug)
808 note_interrupt(action->irq, desc, action_ret);
809 }
810
811 wake = atomic_dec_and_test(&desc->threads_active);
812
813 if (wake && waitqueue_active(&desc->wait_for_threads))
814 wake_up(&desc->wait_for_threads);
815 } 801 }
816 802
817 /* Prevent a stale desc->threads_oneshot */
818 irq_finalize_oneshot(desc, action, true);
819
820 /* 803 /*
821 * Clear irqaction. Otherwise exit_irq_thread() would make 804 * This is the regular exit path. __free_irq() is stopping the
805 * thread via kthread_stop() after calling
806 * synchronize_irq(). So neither IRQTF_RUNTHREAD nor the
807 * oneshot mask bit can be set. We cannot verify that as we
808 * cannot touch the oneshot mask at this point anymore as
 809 * __setup_irq() might have given out current's thread_mask
810 * again.
811 *
812 * Clear irq_thread. Otherwise exit_irq_thread() would make
822 * fuzz about an active irq thread going into nirvana. 813 * fuzz about an active irq thread going into nirvana.
823 */ 814 */
824 current->irqaction = NULL; 815 current->irq_thread = 0;
825 return 0; 816 return 0;
826} 817}
827 818
@@ -832,27 +823,28 @@ void exit_irq_thread(void)
832{ 823{
833 struct task_struct *tsk = current; 824 struct task_struct *tsk = current;
834 struct irq_desc *desc; 825 struct irq_desc *desc;
826 struct irqaction *action;
835 827
836 if (!tsk->irqaction) 828 if (!tsk->irq_thread)
837 return; 829 return;
838 830
831 action = kthread_data(tsk);
832
839 printk(KERN_ERR 833 printk(KERN_ERR
840 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", 834 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
841 tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); 835 tsk->comm ? tsk->comm : "", tsk->pid, action->irq);
842 836
843 desc = irq_to_desc(tsk->irqaction->irq); 837 desc = irq_to_desc(action->irq);
844 838
845 /* 839 /*
846 * Prevent a stale desc->threads_oneshot. Must be called 840 * If IRQTF_RUNTHREAD is set, we need to decrement
847 * before setting the IRQTF_DIED flag. 841 * desc->threads_active and wake possible waiters.
848 */ 842 */
849 irq_finalize_oneshot(desc, tsk->irqaction, true); 843 if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags))
844 wake_threads_waitq(desc);
850 845
851 /* 846 /* Prevent a stale desc->threads_oneshot */
852 * Set the THREAD DIED flag to prevent further wakeups of the 847 irq_finalize_oneshot(desc, action, true);
853 * soon to be gone threaded handler.
854 */
855 set_bit(IRQTF_DIED, &tsk->irqaction->flags);
856} 848}
857 849
858static void irq_setup_forced_threading(struct irqaction *new) 850static void irq_setup_forced_threading(struct irqaction *new)
@@ -1135,8 +1127,7 @@ out_thread:
1135 struct task_struct *t = new->thread; 1127 struct task_struct *t = new->thread;
1136 1128
1137 new->thread = NULL; 1129 new->thread = NULL;
1138 if (likely(!test_bit(IRQTF_DIED, &new->thread_flags))) 1130 kthread_stop(t);
1139 kthread_stop(t);
1140 put_task_struct(t); 1131 put_task_struct(t);
1141 } 1132 }
1142out_mput: 1133out_mput:
@@ -1246,8 +1237,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1246#endif 1237#endif
1247 1238
1248 if (action->thread) { 1239 if (action->thread) {
1249 if (!test_bit(IRQTF_DIED, &action->thread_flags)) 1240 kthread_stop(action->thread);
1250 kthread_stop(action->thread);
1251 put_task_struct(action->thread); 1241 put_task_struct(action->thread);
1252 } 1242 }
1253 1243
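
For orientation (a sketch, not part of the patch): the thread being started and stopped above is the one created for handlers registered with request_threaded_irq(); the foo_* handlers are hypothetical:

#include <linux/interrupt.h>

static irqreturn_t foo_hardirq(int irq, void *dev)
{
	/* quick check/ack in hard-irq context, then defer to the irq thread */
	return IRQ_WAKE_THREAD;
}

static irqreturn_t foo_thread_fn(int irq, void *dev)
{
	/* runs from irq_thread() above and may sleep */
	return IRQ_HANDLED;
}

static int foo_setup_irq(unsigned int irq, void *dev)
{
	return request_threaded_irq(irq, foo_hardirq, foo_thread_fn,
				    IRQF_ONESHOT, "foo", dev);
}
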
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 01d3b70fc98a..43049192b5ec 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -12,7 +12,7 @@
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/sort.h> 13#include <linux/sort.h>
14#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/jump_label.h> 15#include <linux/static_key.h>
16 16
17#ifdef HAVE_JUMP_LABEL 17#ifdef HAVE_JUMP_LABEL
18 18
@@ -29,11 +29,6 @@ void jump_label_unlock(void)
29 mutex_unlock(&jump_label_mutex); 29 mutex_unlock(&jump_label_mutex);
30} 30}
31 31
32bool jump_label_enabled(struct jump_label_key *key)
33{
34 return !!atomic_read(&key->enabled);
35}
36
37static int jump_label_cmp(const void *a, const void *b) 32static int jump_label_cmp(const void *a, const void *b)
38{ 33{
39 const struct jump_entry *jea = a; 34 const struct jump_entry *jea = a;
@@ -58,56 +53,66 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
58 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); 53 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
59} 54}
60 55
61static void jump_label_update(struct jump_label_key *key, int enable); 56static void jump_label_update(struct static_key *key, int enable);
62 57
63void jump_label_inc(struct jump_label_key *key) 58void static_key_slow_inc(struct static_key *key)
64{ 59{
65 if (atomic_inc_not_zero(&key->enabled)) 60 if (atomic_inc_not_zero(&key->enabled))
66 return; 61 return;
67 62
68 jump_label_lock(); 63 jump_label_lock();
69 if (atomic_read(&key->enabled) == 0) 64 if (atomic_read(&key->enabled) == 0) {
70 jump_label_update(key, JUMP_LABEL_ENABLE); 65 if (!jump_label_get_branch_default(key))
66 jump_label_update(key, JUMP_LABEL_ENABLE);
67 else
68 jump_label_update(key, JUMP_LABEL_DISABLE);
69 }
71 atomic_inc(&key->enabled); 70 atomic_inc(&key->enabled);
72 jump_label_unlock(); 71 jump_label_unlock();
73} 72}
74EXPORT_SYMBOL_GPL(jump_label_inc); 73EXPORT_SYMBOL_GPL(static_key_slow_inc);
75 74
76static void __jump_label_dec(struct jump_label_key *key, 75static void __static_key_slow_dec(struct static_key *key,
77 unsigned long rate_limit, struct delayed_work *work) 76 unsigned long rate_limit, struct delayed_work *work)
78{ 77{
79 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) 78 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {
79 WARN(atomic_read(&key->enabled) < 0,
80 "jump label: negative count!\n");
80 return; 81 return;
82 }
81 83
82 if (rate_limit) { 84 if (rate_limit) {
83 atomic_inc(&key->enabled); 85 atomic_inc(&key->enabled);
84 schedule_delayed_work(work, rate_limit); 86 schedule_delayed_work(work, rate_limit);
85 } else 87 } else {
86 jump_label_update(key, JUMP_LABEL_DISABLE); 88 if (!jump_label_get_branch_default(key))
87 89 jump_label_update(key, JUMP_LABEL_DISABLE);
90 else
91 jump_label_update(key, JUMP_LABEL_ENABLE);
92 }
88 jump_label_unlock(); 93 jump_label_unlock();
89} 94}
90EXPORT_SYMBOL_GPL(jump_label_dec);
91 95
92static void jump_label_update_timeout(struct work_struct *work) 96static void jump_label_update_timeout(struct work_struct *work)
93{ 97{
94 struct jump_label_key_deferred *key = 98 struct static_key_deferred *key =
95 container_of(work, struct jump_label_key_deferred, work.work); 99 container_of(work, struct static_key_deferred, work.work);
96 __jump_label_dec(&key->key, 0, NULL); 100 __static_key_slow_dec(&key->key, 0, NULL);
97} 101}
98 102
99void jump_label_dec(struct jump_label_key *key) 103void static_key_slow_dec(struct static_key *key)
100{ 104{
101 __jump_label_dec(key, 0, NULL); 105 __static_key_slow_dec(key, 0, NULL);
102} 106}
107EXPORT_SYMBOL_GPL(static_key_slow_dec);
103 108
104void jump_label_dec_deferred(struct jump_label_key_deferred *key) 109void static_key_slow_dec_deferred(struct static_key_deferred *key)
105{ 110{
106 __jump_label_dec(&key->key, key->timeout, &key->work); 111 __static_key_slow_dec(&key->key, key->timeout, &key->work);
107} 112}
113EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
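
For reference, the renamed interface is used roughly as follows (a sketch; the key and the slow-path work are hypothetical):

#include <linux/kernel.h>
#include <linux/static_key.h>

static struct static_key foo_key = STATIC_KEY_INIT_FALSE;

static void foo_hot_path(void)
{
	/* compiles to a no-op that is patched to a jump when the key is enabled */
	if (static_key_false(&foo_key))
		pr_info("foo: rare path taken\n");	/* stand-in for real slow-path work */
}

static void foo_set_feature(bool on)
{
	if (on)
		static_key_slow_inc(&foo_key);
	else
		static_key_slow_dec(&foo_key);
}
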
108 114
109 115void jump_label_rate_limit(struct static_key_deferred *key,
110void jump_label_rate_limit(struct jump_label_key_deferred *key,
111 unsigned long rl) 116 unsigned long rl)
112{ 117{
113 key->timeout = rl; 118 key->timeout = rl;
@@ -150,7 +155,7 @@ void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry
150 arch_jump_label_transform(entry, type); 155 arch_jump_label_transform(entry, type);
151} 156}
152 157
153static void __jump_label_update(struct jump_label_key *key, 158static void __jump_label_update(struct static_key *key,
154 struct jump_entry *entry, 159 struct jump_entry *entry,
155 struct jump_entry *stop, int enable) 160 struct jump_entry *stop, int enable)
156{ 161{
@@ -167,27 +172,40 @@ static void __jump_label_update(struct jump_label_key *key,
167 } 172 }
168} 173}
169 174
175static enum jump_label_type jump_label_type(struct static_key *key)
176{
177 bool true_branch = jump_label_get_branch_default(key);
178 bool state = static_key_enabled(key);
179
180 if ((!true_branch && state) || (true_branch && !state))
181 return JUMP_LABEL_ENABLE;
182
183 return JUMP_LABEL_DISABLE;
184}
185
170void __init jump_label_init(void) 186void __init jump_label_init(void)
171{ 187{
172 struct jump_entry *iter_start = __start___jump_table; 188 struct jump_entry *iter_start = __start___jump_table;
173 struct jump_entry *iter_stop = __stop___jump_table; 189 struct jump_entry *iter_stop = __stop___jump_table;
174 struct jump_label_key *key = NULL; 190 struct static_key *key = NULL;
175 struct jump_entry *iter; 191 struct jump_entry *iter;
176 192
177 jump_label_lock(); 193 jump_label_lock();
178 jump_label_sort_entries(iter_start, iter_stop); 194 jump_label_sort_entries(iter_start, iter_stop);
179 195
180 for (iter = iter_start; iter < iter_stop; iter++) { 196 for (iter = iter_start; iter < iter_stop; iter++) {
181 struct jump_label_key *iterk; 197 struct static_key *iterk;
182 198
183 iterk = (struct jump_label_key *)(unsigned long)iter->key; 199 iterk = (struct static_key *)(unsigned long)iter->key;
184 arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? 200 arch_jump_label_transform_static(iter, jump_label_type(iterk));
185 JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE);
186 if (iterk == key) 201 if (iterk == key)
187 continue; 202 continue;
188 203
189 key = iterk; 204 key = iterk;
190 key->entries = iter; 205 /*
206 * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
207 */
208 *((unsigned long *)&key->entries) += (unsigned long)iter;
191#ifdef CONFIG_MODULES 209#ifdef CONFIG_MODULES
192 key->next = NULL; 210 key->next = NULL;
193#endif 211#endif
@@ -197,8 +215,8 @@ void __init jump_label_init(void)
197 215
198#ifdef CONFIG_MODULES 216#ifdef CONFIG_MODULES
199 217
200struct jump_label_mod { 218struct static_key_mod {
201 struct jump_label_mod *next; 219 struct static_key_mod *next;
202 struct jump_entry *entries; 220 struct jump_entry *entries;
203 struct module *mod; 221 struct module *mod;
204}; 222};
@@ -218,9 +236,9 @@ static int __jump_label_mod_text_reserved(void *start, void *end)
218 start, end); 236 start, end);
219} 237}
220 238
221static void __jump_label_mod_update(struct jump_label_key *key, int enable) 239static void __jump_label_mod_update(struct static_key *key, int enable)
222{ 240{
223 struct jump_label_mod *mod = key->next; 241 struct static_key_mod *mod = key->next;
224 242
225 while (mod) { 243 while (mod) {
226 struct module *m = mod->mod; 244 struct module *m = mod->mod;
@@ -251,11 +269,7 @@ void jump_label_apply_nops(struct module *mod)
251 return; 269 return;
252 270
253 for (iter = iter_start; iter < iter_stop; iter++) { 271 for (iter = iter_start; iter < iter_stop; iter++) {
254 struct jump_label_key *iterk; 272 arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE);
255
256 iterk = (struct jump_label_key *)(unsigned long)iter->key;
257 arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ?
258 JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE);
259 } 273 }
260} 274}
261 275
@@ -264,8 +278,8 @@ static int jump_label_add_module(struct module *mod)
264 struct jump_entry *iter_start = mod->jump_entries; 278 struct jump_entry *iter_start = mod->jump_entries;
265 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; 279 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
266 struct jump_entry *iter; 280 struct jump_entry *iter;
267 struct jump_label_key *key = NULL; 281 struct static_key *key = NULL;
268 struct jump_label_mod *jlm; 282 struct static_key_mod *jlm;
269 283
270 /* if the module doesn't have jump label entries, just return */ 284 /* if the module doesn't have jump label entries, just return */
271 if (iter_start == iter_stop) 285 if (iter_start == iter_stop)
@@ -274,28 +288,30 @@ static int jump_label_add_module(struct module *mod)
274 jump_label_sort_entries(iter_start, iter_stop); 288 jump_label_sort_entries(iter_start, iter_stop);
275 289
276 for (iter = iter_start; iter < iter_stop; iter++) { 290 for (iter = iter_start; iter < iter_stop; iter++) {
277 if (iter->key == (jump_label_t)(unsigned long)key) 291 struct static_key *iterk;
278 continue;
279 292
280 key = (struct jump_label_key *)(unsigned long)iter->key; 293 iterk = (struct static_key *)(unsigned long)iter->key;
294 if (iterk == key)
295 continue;
281 296
297 key = iterk;
282 if (__module_address(iter->key) == mod) { 298 if (__module_address(iter->key) == mod) {
283 atomic_set(&key->enabled, 0); 299 /*
284 key->entries = iter; 300 * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
301 */
302 *((unsigned long *)&key->entries) += (unsigned long)iter;
285 key->next = NULL; 303 key->next = NULL;
286 continue; 304 continue;
287 } 305 }
288 306 jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL);
289 jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL);
290 if (!jlm) 307 if (!jlm)
291 return -ENOMEM; 308 return -ENOMEM;
292
293 jlm->mod = mod; 309 jlm->mod = mod;
294 jlm->entries = iter; 310 jlm->entries = iter;
295 jlm->next = key->next; 311 jlm->next = key->next;
296 key->next = jlm; 312 key->next = jlm;
297 313
298 if (jump_label_enabled(key)) 314 if (jump_label_type(key) == JUMP_LABEL_ENABLE)
299 __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE); 315 __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE);
300 } 316 }
301 317
@@ -307,14 +323,14 @@ static void jump_label_del_module(struct module *mod)
307 struct jump_entry *iter_start = mod->jump_entries; 323 struct jump_entry *iter_start = mod->jump_entries;
308 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; 324 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
309 struct jump_entry *iter; 325 struct jump_entry *iter;
310 struct jump_label_key *key = NULL; 326 struct static_key *key = NULL;
311 struct jump_label_mod *jlm, **prev; 327 struct static_key_mod *jlm, **prev;
312 328
313 for (iter = iter_start; iter < iter_stop; iter++) { 329 for (iter = iter_start; iter < iter_stop; iter++) {
314 if (iter->key == (jump_label_t)(unsigned long)key) 330 if (iter->key == (jump_label_t)(unsigned long)key)
315 continue; 331 continue;
316 332
317 key = (struct jump_label_key *)(unsigned long)iter->key; 333 key = (struct static_key *)(unsigned long)iter->key;
318 334
319 if (__module_address(iter->key) == mod) 335 if (__module_address(iter->key) == mod)
320 continue; 336 continue;
@@ -416,12 +432,13 @@ int jump_label_text_reserved(void *start, void *end)
416 return ret; 432 return ret;
417} 433}
418 434
419static void jump_label_update(struct jump_label_key *key, int enable) 435static void jump_label_update(struct static_key *key, int enable)
420{ 436{
421 struct jump_entry *entry = key->entries, *stop = __stop___jump_table; 437 struct jump_entry *stop = __stop___jump_table;
438 struct jump_entry *entry = jump_label_get_entries(key);
422 439
423#ifdef CONFIG_MODULES 440#ifdef CONFIG_MODULES
424 struct module *mod = __module_address((jump_label_t)key); 441 struct module *mod = __module_address((unsigned long)key);
425 442
426 __jump_label_mod_update(key, enable); 443 __jump_label_mod_update(key, enable);
427 444
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 7b0886786701..4e2e472f6aeb 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -37,7 +37,6 @@
37#include <asm/page.h> 37#include <asm/page.h>
38#include <asm/uaccess.h> 38#include <asm/uaccess.h>
39#include <asm/io.h> 39#include <asm/io.h>
40#include <asm/system.h>
41#include <asm/sections.h> 40#include <asm/sections.h>
42 41
43/* Per cpu memory for storing cpu states in case of system crash. */ 42/* Per cpu memory for storing cpu states in case of system crash. */
@@ -1359,6 +1358,10 @@ static int __init parse_crashkernel_simple(char *cmdline,
1359 1358
1360 if (*cur == '@') 1359 if (*cur == '@')
1361 *crash_base = memparse(cur+1, &cur); 1360 *crash_base = memparse(cur+1, &cur);
1361 else if (*cur != ' ' && *cur != '\0') {
1362 pr_warning("crashkernel: unrecognized char\n");
1363 return -EINVAL;
1364 }
1362 1365
1363 return 0; 1366 return 0;
1364} 1367}
@@ -1462,7 +1465,9 @@ static int __init crash_save_vmcoreinfo_init(void)
1462 1465
1463 VMCOREINFO_SYMBOL(init_uts_ns); 1466 VMCOREINFO_SYMBOL(init_uts_ns);
1464 VMCOREINFO_SYMBOL(node_online_map); 1467 VMCOREINFO_SYMBOL(node_online_map);
1468#ifdef CONFIG_MMU
1465 VMCOREINFO_SYMBOL(swapper_pg_dir); 1469 VMCOREINFO_SYMBOL(swapper_pg_dir);
1470#endif
1466 VMCOREINFO_SYMBOL(_stext); 1471 VMCOREINFO_SYMBOL(_stext);
1467 VMCOREINFO_SYMBOL(vmlist); 1472 VMCOREINFO_SYMBOL(vmlist);
1468 1473
@@ -1546,13 +1551,13 @@ int kernel_kexec(void)
1546 if (error) 1551 if (error)
1547 goto Resume_console; 1552 goto Resume_console;
1548 /* At this point, dpm_suspend_start() has been called, 1553 /* At this point, dpm_suspend_start() has been called,
1549 * but *not* dpm_suspend_noirq(). We *must* call 1554 * but *not* dpm_suspend_end(). We *must* call
1550 * dpm_suspend_noirq() now. Otherwise, drivers for 1555 * dpm_suspend_end() now. Otherwise, drivers for
1551 * some devices (e.g. interrupt controllers) become 1556 * some devices (e.g. interrupt controllers) become
1552 * desynchronized with the actual state of the 1557 * desynchronized with the actual state of the
1553 * hardware at resume time, and evil weirdness ensues. 1558 * hardware at resume time, and evil weirdness ensues.
1554 */ 1559 */
1555 error = dpm_suspend_noirq(PMSG_FREEZE); 1560 error = dpm_suspend_end(PMSG_FREEZE);
1556 if (error) 1561 if (error)
1557 goto Resume_devices; 1562 goto Resume_devices;
1558 error = disable_nonboot_cpus(); 1563 error = disable_nonboot_cpus();
@@ -1579,7 +1584,7 @@ int kernel_kexec(void)
1579 local_irq_enable(); 1584 local_irq_enable();
1580 Enable_cpus: 1585 Enable_cpus:
1581 enable_nonboot_cpus(); 1586 enable_nonboot_cpus();
1582 dpm_resume_noirq(PMSG_RESTORE); 1587 dpm_resume_start(PMSG_RESTORE);
1583 Resume_devices: 1588 Resume_devices:
1584 dpm_resume_end(PMSG_RESTORE); 1589 dpm_resume_end(PMSG_RESTORE);
1585 Resume_console: 1590 Resume_console:
diff --git a/kernel/kmod.c b/kernel/kmod.c
index a0a88543934e..957a7aab8ebc 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -60,6 +60,43 @@ static DECLARE_RWSEM(umhelper_sem);
60*/ 60*/
61char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; 61char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
62 62
63static void free_modprobe_argv(struct subprocess_info *info)
64{
65 kfree(info->argv[3]); /* check call_modprobe() */
66 kfree(info->argv);
67}
68
69static int call_modprobe(char *module_name, int wait)
70{
71 static char *envp[] = {
72 "HOME=/",
73 "TERM=linux",
74 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
75 NULL
76 };
77
78 char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL);
79 if (!argv)
80 goto out;
81
82 module_name = kstrdup(module_name, GFP_KERNEL);
83 if (!module_name)
84 goto free_argv;
85
86 argv[0] = modprobe_path;
87 argv[1] = "-q";
88 argv[2] = "--";
89 argv[3] = module_name; /* check free_modprobe_argv() */
90 argv[4] = NULL;
91
92 return call_usermodehelper_fns(modprobe_path, argv, envp,
93 wait | UMH_KILLABLE, NULL, free_modprobe_argv, NULL);
94free_argv:
95 kfree(argv);
96out:
97 return -ENOMEM;
98}
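
From a caller's perspective nothing changes: request_module() still takes a printf-style module alias and may sleep. A hedged sketch of the calling side; "foo-proto-%d" is an invented alias:

#include <linux/kmod.h>
#include <linux/printk.h>

static void foo_load_proto(int proto_id)
{
	/* a zero return means modprobe ran successfully, not that the module is now present */
	if (request_module("foo-proto-%d", proto_id) != 0)
		pr_warn("foo: could not load protocol module %d\n", proto_id);
}
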
99
63/** 100/**
64 * __request_module - try to load a kernel module 101 * __request_module - try to load a kernel module
65 * @wait: wait (or not) for the operation to complete 102 * @wait: wait (or not) for the operation to complete
@@ -81,11 +118,6 @@ int __request_module(bool wait, const char *fmt, ...)
81 char module_name[MODULE_NAME_LEN]; 118 char module_name[MODULE_NAME_LEN];
82 unsigned int max_modprobes; 119 unsigned int max_modprobes;
83 int ret; 120 int ret;
84 char *argv[] = { modprobe_path, "-q", "--", module_name, NULL };
85 static char *envp[] = { "HOME=/",
86 "TERM=linux",
87 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
88 NULL };
89 static atomic_t kmod_concurrent = ATOMIC_INIT(0); 121 static atomic_t kmod_concurrent = ATOMIC_INIT(0);
90#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ 122#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
91 static int kmod_loop_msg; 123 static int kmod_loop_msg;
@@ -128,9 +160,7 @@ int __request_module(bool wait, const char *fmt, ...)
128 160
129 trace_module_request(module_name, wait, _RET_IP_); 161 trace_module_request(module_name, wait, _RET_IP_);
130 162
131 ret = call_usermodehelper_fns(modprobe_path, argv, envp, 163 ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
132 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC,
133 NULL, NULL, NULL);
134 164
135 atomic_dec(&kmod_concurrent); 165 atomic_dec(&kmod_concurrent);
136 return ret; 166 return ret;
@@ -188,7 +218,7 @@ static int ____call_usermodehelper(void *data)
188 /* Exec failed? */ 218 /* Exec failed? */
189fail: 219fail:
190 sub_info->retval = retval; 220 sub_info->retval = retval;
191 do_exit(0); 221 return 0;
192} 222}
193 223
194void call_usermodehelper_freeinfo(struct subprocess_info *info) 224void call_usermodehelper_freeinfo(struct subprocess_info *info)
@@ -199,6 +229,19 @@ void call_usermodehelper_freeinfo(struct subprocess_info *info)
199} 229}
200EXPORT_SYMBOL(call_usermodehelper_freeinfo); 230EXPORT_SYMBOL(call_usermodehelper_freeinfo);
201 231
232static void umh_complete(struct subprocess_info *sub_info)
233{
234 struct completion *comp = xchg(&sub_info->complete, NULL);
235 /*
 236 * See call_usermodehelper_exec(). If xchg() returns NULL,
 237 * we own sub_info; the UMH_KILLABLE caller has gone away.
238 */
239 if (comp)
240 complete(comp);
241 else
242 call_usermodehelper_freeinfo(sub_info);
243}
244
202/* Keventd can't block, but this (a child) can. */ 245/* Keventd can't block, but this (a child) can. */
203static int wait_for_helper(void *data) 246static int wait_for_helper(void *data)
204{ 247{
@@ -235,7 +278,7 @@ static int wait_for_helper(void *data)
235 sub_info->retval = ret; 278 sub_info->retval = ret;
236 } 279 }
237 280
238 complete(sub_info->complete); 281 umh_complete(sub_info);
239 return 0; 282 return 0;
240} 283}
241 284
@@ -244,7 +287,7 @@ static void __call_usermodehelper(struct work_struct *work)
244{ 287{
245 struct subprocess_info *sub_info = 288 struct subprocess_info *sub_info =
246 container_of(work, struct subprocess_info, work); 289 container_of(work, struct subprocess_info, work);
247 enum umh_wait wait = sub_info->wait; 290 int wait = sub_info->wait & ~UMH_KILLABLE;
248 pid_t pid; 291 pid_t pid;
249 292
250 /* CLONE_VFORK: wait until the usermode helper has execve'd 293 /* CLONE_VFORK: wait until the usermode helper has execve'd
@@ -269,7 +312,7 @@ static void __call_usermodehelper(struct work_struct *work)
269 case UMH_WAIT_EXEC: 312 case UMH_WAIT_EXEC:
270 if (pid < 0) 313 if (pid < 0)
271 sub_info->retval = pid; 314 sub_info->retval = pid;
272 complete(sub_info->complete); 315 umh_complete(sub_info);
273 } 316 }
274} 317}
275 318
@@ -435,8 +478,7 @@ EXPORT_SYMBOL(call_usermodehelper_setfns);
435 * asynchronously if wait is not set, and runs as a child of keventd. 478 * asynchronously if wait is not set, and runs as a child of keventd.
436 * (ie. it runs with full root capabilities). 479 * (ie. it runs with full root capabilities).
437 */ 480 */
438int call_usermodehelper_exec(struct subprocess_info *sub_info, 481int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
439 enum umh_wait wait)
440{ 482{
441 DECLARE_COMPLETION_ONSTACK(done); 483 DECLARE_COMPLETION_ONSTACK(done);
442 int retval = 0; 484 int retval = 0;
@@ -456,9 +498,21 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
456 queue_work(khelper_wq, &sub_info->work); 498 queue_work(khelper_wq, &sub_info->work);
457 if (wait == UMH_NO_WAIT) /* task has freed sub_info */ 499 if (wait == UMH_NO_WAIT) /* task has freed sub_info */
458 goto unlock; 500 goto unlock;
501
502 if (wait & UMH_KILLABLE) {
503 retval = wait_for_completion_killable(&done);
504 if (!retval)
505 goto wait_done;
506
507 /* umh_complete() will see NULL and free sub_info */
508 if (xchg(&sub_info->complete, NULL))
509 goto unlock;
510 /* fallthrough, umh_complete() was already called */
511 }
512
459 wait_for_completion(&done); 513 wait_for_completion(&done);
514wait_done:
460 retval = sub_info->retval; 515 retval = sub_info->retval;
461
462out: 516out:
463 call_usermodehelper_freeinfo(sub_info); 517 call_usermodehelper_freeinfo(sub_info);
464unlock: 518unlock:
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 8889f7dd7c46..ea9ee4518c35 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -4176,7 +4176,13 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4176 printk("-------------------------------\n"); 4176 printk("-------------------------------\n");
4177 printk("%s:%d %s!\n", file, line, s); 4177 printk("%s:%d %s!\n", file, line, s);
4178 printk("\nother info that might help us debug this:\n\n"); 4178 printk("\nother info that might help us debug this:\n\n");
4179 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); 4179 printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
4180 !rcu_lockdep_current_cpu_online()
4181 ? "RCU used illegally from offline CPU!\n"
4182 : rcu_is_cpu_idle()
4183 ? "RCU used illegally from idle CPU!\n"
4184 : "",
4185 rcu_scheduler_active, debug_locks);
4180 4186
4181 /* 4187 /*
4182 * If a CPU is in the RCU-free window in idle (ie: in the section 4188 * If a CPU is in the RCU-free window in idle (ie: in the section
diff --git a/kernel/module.c b/kernel/module.c
index 2c932760fd33..78ac6ec1e425 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -105,6 +105,7 @@ struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
105 105
106/* Block module loading/unloading? */ 106/* Block module loading/unloading? */
107int modules_disabled = 0; 107int modules_disabled = 0;
108core_param(nomodule, modules_disabled, bint, 0);
108 109
109/* Waiting for a module to finish initializing? */ 110/* Waiting for a module to finish initializing? */
110static DECLARE_WAIT_QUEUE_HEAD(module_wq); 111static DECLARE_WAIT_QUEUE_HEAD(module_wq);
@@ -903,6 +904,36 @@ static ssize_t show_refcnt(struct module_attribute *mattr,
903static struct module_attribute modinfo_refcnt = 904static struct module_attribute modinfo_refcnt =
904 __ATTR(refcnt, 0444, show_refcnt, NULL); 905 __ATTR(refcnt, 0444, show_refcnt, NULL);
905 906
907void __module_get(struct module *module)
908{
909 if (module) {
910 preempt_disable();
911 __this_cpu_inc(module->refptr->incs);
912 trace_module_get(module, _RET_IP_);
913 preempt_enable();
914 }
915}
916EXPORT_SYMBOL(__module_get);
917
918bool try_module_get(struct module *module)
919{
920 bool ret = true;
921
922 if (module) {
923 preempt_disable();
924
925 if (likely(module_is_live(module))) {
926 __this_cpu_inc(module->refptr->incs);
927 trace_module_get(module, _RET_IP_);
928 } else
929 ret = false;
930
931 preempt_enable();
932 }
933 return ret;
934}
935EXPORT_SYMBOL(try_module_get);
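
The uninlined helpers keep the usual pin-before-call pattern. A sketch of a caller; struct foo_ops and its members are hypothetical, though the owner-field convention is the common one:

#include <linux/module.h>

struct foo_ops {
	struct module *owner;
	int (*do_something)(void *arg);
};

static int foo_call_backend(struct foo_ops *ops, void *arg)
{
	int ret;

	/* fails once the owning module has started unloading */
	if (!try_module_get(ops->owner))
		return -ENODEV;

	ret = ops->do_something(arg);
	module_put(ops->owner);
	return ret;
}
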
936
906void module_put(struct module *module) 937void module_put(struct module *module)
907{ 938{
908 if (module) { 939 if (module) {
@@ -2380,8 +2411,7 @@ static int copy_and_check(struct load_info *info,
2380 return -ENOEXEC; 2411 return -ENOEXEC;
2381 2412
2382 /* Suck in entire file: we'll want most of it. */ 2413 /* Suck in entire file: we'll want most of it. */
2383 /* vmalloc barfs on "unusual" numbers. Check here */ 2414 if ((hdr = vmalloc(len)) == NULL)
2384 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
2385 return -ENOMEM; 2415 return -ENOMEM;
2386 2416
2387 if (copy_from_user(hdr, umod, len) != 0) { 2417 if (copy_from_user(hdr, umod, len) != 0) {
@@ -2922,7 +2952,8 @@ static struct module *load_module(void __user *umod,
2922 mutex_unlock(&module_mutex); 2952 mutex_unlock(&module_mutex);
2923 2953
2924 /* Module is ready to execute: parsing args may do that. */ 2954 /* Module is ready to execute: parsing args may do that. */
2925 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); 2955 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
2956 -32768, 32767, NULL);
2926 if (err < 0) 2957 if (err < 0)
2927 goto unlink; 2958 goto unlink;
2928 2959
diff --git a/kernel/padata.c b/kernel/padata.c
index b45259931512..6f10eb285ece 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -29,7 +29,6 @@
29#include <linux/sysfs.h> 29#include <linux/sysfs.h>
30#include <linux/rcupdate.h> 30#include <linux/rcupdate.h>
31 31
32#define MAX_SEQ_NR (INT_MAX - NR_CPUS)
33#define MAX_OBJ_NUM 1000 32#define MAX_OBJ_NUM 1000
34 33
35static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) 34static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
@@ -43,18 +42,19 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
43 return target_cpu; 42 return target_cpu;
44} 43}
45 44
46static int padata_cpu_hash(struct padata_priv *padata) 45static int padata_cpu_hash(struct parallel_data *pd)
47{ 46{
48 int cpu_index; 47 int cpu_index;
49 struct parallel_data *pd;
50
51 pd = padata->pd;
52 48
53 /* 49 /*
54 * Hash the sequence numbers to the cpus by taking 50 * Hash the sequence numbers to the cpus by taking
 55 * seq_nr modulo the number of cpus in use. 51 * seq_nr modulo the number of cpus in use.
56 */ 52 */
57 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask.pcpu); 53
54 spin_lock(&pd->seq_lock);
55 cpu_index = pd->seq_nr % cpumask_weight(pd->cpumask.pcpu);
56 pd->seq_nr++;
57 spin_unlock(&pd->seq_lock);
58 58
59 return padata_index_to_cpu(pd, cpu_index); 59 return padata_index_to_cpu(pd, cpu_index);
60} 60}
@@ -132,12 +132,7 @@ int padata_do_parallel(struct padata_instance *pinst,
132 padata->pd = pd; 132 padata->pd = pd;
133 padata->cb_cpu = cb_cpu; 133 padata->cb_cpu = cb_cpu;
134 134
135 if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr)) 135 target_cpu = padata_cpu_hash(pd);
136 atomic_set(&pd->seq_nr, -1);
137
138 padata->seq_nr = atomic_inc_return(&pd->seq_nr);
139
140 target_cpu = padata_cpu_hash(padata);
141 queue = per_cpu_ptr(pd->pqueue, target_cpu); 136 queue = per_cpu_ptr(pd->pqueue, target_cpu);
142 137
143 spin_lock(&queue->parallel.lock); 138 spin_lock(&queue->parallel.lock);
@@ -173,7 +168,7 @@ EXPORT_SYMBOL(padata_do_parallel);
173static struct padata_priv *padata_get_next(struct parallel_data *pd) 168static struct padata_priv *padata_get_next(struct parallel_data *pd)
174{ 169{
175 int cpu, num_cpus; 170 int cpu, num_cpus;
176 int next_nr, next_index; 171 unsigned int next_nr, next_index;
177 struct padata_parallel_queue *queue, *next_queue; 172 struct padata_parallel_queue *queue, *next_queue;
178 struct padata_priv *padata; 173 struct padata_priv *padata;
179 struct padata_list *reorder; 174 struct padata_list *reorder;
@@ -189,14 +184,6 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
189 cpu = padata_index_to_cpu(pd, next_index); 184 cpu = padata_index_to_cpu(pd, next_index);
190 next_queue = per_cpu_ptr(pd->pqueue, cpu); 185 next_queue = per_cpu_ptr(pd->pqueue, cpu);
191 186
192 if (unlikely(next_nr > pd->max_seq_nr)) {
193 next_nr = next_nr - pd->max_seq_nr - 1;
194 next_index = next_nr % num_cpus;
195 cpu = padata_index_to_cpu(pd, next_index);
196 next_queue = per_cpu_ptr(pd->pqueue, cpu);
197 pd->processed = 0;
198 }
199
200 padata = NULL; 187 padata = NULL;
201 188
202 reorder = &next_queue->reorder; 189 reorder = &next_queue->reorder;
@@ -205,8 +192,6 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
205 padata = list_entry(reorder->list.next, 192 padata = list_entry(reorder->list.next,
206 struct padata_priv, list); 193 struct padata_priv, list);
207 194
208 BUG_ON(next_nr != padata->seq_nr);
209
210 spin_lock(&reorder->lock); 195 spin_lock(&reorder->lock);
211 list_del_init(&padata->list); 196 list_del_init(&padata->list);
212 atomic_dec(&pd->reorder_objects); 197 atomic_dec(&pd->reorder_objects);
@@ -230,6 +215,7 @@ out:
230 215
231static void padata_reorder(struct parallel_data *pd) 216static void padata_reorder(struct parallel_data *pd)
232{ 217{
218 int cb_cpu;
233 struct padata_priv *padata; 219 struct padata_priv *padata;
234 struct padata_serial_queue *squeue; 220 struct padata_serial_queue *squeue;
235 struct padata_instance *pinst = pd->pinst; 221 struct padata_instance *pinst = pd->pinst;
@@ -270,13 +256,14 @@ static void padata_reorder(struct parallel_data *pd)
270 return; 256 return;
271 } 257 }
272 258
273 squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu); 259 cb_cpu = padata->cb_cpu;
260 squeue = per_cpu_ptr(pd->squeue, cb_cpu);
274 261
275 spin_lock(&squeue->serial.lock); 262 spin_lock(&squeue->serial.lock);
276 list_add_tail(&padata->list, &squeue->serial.list); 263 list_add_tail(&padata->list, &squeue->serial.list);
277 spin_unlock(&squeue->serial.lock); 264 spin_unlock(&squeue->serial.lock);
278 265
279 queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work); 266 queue_work_on(cb_cpu, pinst->wq, &squeue->work);
280 } 267 }
281 268
282 spin_unlock_bh(&pd->lock); 269 spin_unlock_bh(&pd->lock);
@@ -400,7 +387,7 @@ static void padata_init_squeues(struct parallel_data *pd)
400/* Initialize all percpu queues used by parallel workers */ 387/* Initialize all percpu queues used by parallel workers */
401static void padata_init_pqueues(struct parallel_data *pd) 388static void padata_init_pqueues(struct parallel_data *pd)
402{ 389{
403 int cpu_index, num_cpus, cpu; 390 int cpu_index, cpu;
404 struct padata_parallel_queue *pqueue; 391 struct padata_parallel_queue *pqueue;
405 392
406 cpu_index = 0; 393 cpu_index = 0;
@@ -415,9 +402,6 @@ static void padata_init_pqueues(struct parallel_data *pd)
415 INIT_WORK(&pqueue->work, padata_parallel_worker); 402 INIT_WORK(&pqueue->work, padata_parallel_worker);
416 atomic_set(&pqueue->num_obj, 0); 403 atomic_set(&pqueue->num_obj, 0);
417 } 404 }
418
419 num_cpus = cpumask_weight(pd->cpumask.pcpu);
420 pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0;
421} 405}
422 406
423/* Allocate and initialize the internal cpumask dependent resources. */ 407
@@ -444,7 +428,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
444 padata_init_pqueues(pd); 428 padata_init_pqueues(pd);
445 padata_init_squeues(pd); 429 padata_init_squeues(pd);
446 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); 430 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
447 atomic_set(&pd->seq_nr, -1); 431 pd->seq_nr = 0;
448 atomic_set(&pd->reorder_objects, 0); 432 atomic_set(&pd->reorder_objects, 0);
449 atomic_set(&pd->refcnt, 0); 433 atomic_set(&pd->refcnt, 0);
450 pd->pinst = pinst; 434 pd->pinst = pinst;
diff --git a/kernel/params.c b/kernel/params.c
index 4bc965d8a1fe..f37d82631347 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -15,7 +15,6 @@
15 along with this program; if not, write to the Free Software 15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/ 17*/
18#include <linux/module.h>
19#include <linux/kernel.h> 18#include <linux/kernel.h>
20#include <linux/string.h> 19#include <linux/string.h>
21#include <linux/errno.h> 20#include <linux/errno.h>
@@ -88,6 +87,8 @@ static int parse_one(char *param,
88 char *val, 87 char *val,
89 const struct kernel_param *params, 88 const struct kernel_param *params,
90 unsigned num_params, 89 unsigned num_params,
90 s16 min_level,
91 s16 max_level,
91 int (*handle_unknown)(char *param, char *val)) 92 int (*handle_unknown)(char *param, char *val))
92{ 93{
93 unsigned int i; 94 unsigned int i;
@@ -96,6 +97,9 @@ static int parse_one(char *param,
96 /* Find parameter */ 97 /* Find parameter */
97 for (i = 0; i < num_params; i++) { 98 for (i = 0; i < num_params; i++) {
98 if (parameq(param, params[i].name)) { 99 if (parameq(param, params[i].name)) {
100 if (params[i].level < min_level
101 || params[i].level > max_level)
102 return 0;
99 /* No one handled NULL, so do it here. */ 103 /* No one handled NULL, so do it here. */
100 if (!val && params[i].ops->set != param_set_bool 104 if (!val && params[i].ops->set != param_set_bool
101 && params[i].ops->set != param_set_bint) 105 && params[i].ops->set != param_set_bint)
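A small stand-alone sketch of the level filter added to parse_one(): parameters whose level lies outside [min_level, max_level] are silently skipped. struct fake_param and handled() are invented for the example; the kernel matches names with parameq(), not strcmp().

#include <stdio.h>
#include <string.h>

struct fake_param {
	const char *name;
	short level;		/* stand-in for the new 'level' member */
};

static int handled(const struct fake_param *p, const char *param,
		   short min_level, short max_level)
{
	if (strcmp(param, p->name) != 0)	/* the kernel uses parameq() here */
		return 0;
	if (p->level < min_level || p->level > max_level)
		return 0;			/* out of range: silently skipped */
	return 1;
}

int main(void)
{
	struct fake_param p = { "debug", 2 };

	printf("%d\n", handled(&p, "debug", 0, 1));	/* 0: level above max_level */
	printf("%d\n", handled(&p, "debug", 0, 7));	/* 1: within [min, max]     */
	return 0;
}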
@@ -175,6 +179,8 @@ int parse_args(const char *name,
175 char *args, 179 char *args,
176 const struct kernel_param *params, 180 const struct kernel_param *params,
177 unsigned num, 181 unsigned num,
182 s16 min_level,
183 s16 max_level,
178 int (*unknown)(char *param, char *val)) 184 int (*unknown)(char *param, char *val))
179{ 185{
180 char *param, *val; 186 char *param, *val;
@@ -190,7 +196,8 @@ int parse_args(const char *name,
190 196
191 args = next_arg(args, &param, &val); 197 args = next_arg(args, &param, &val);
192 irq_was_disabled = irqs_disabled(); 198 irq_was_disabled = irqs_disabled();
193 ret = parse_one(param, val, params, num, unknown); 199 ret = parse_one(param, val, params, num,
200 min_level, max_level, unknown);
194 if (irq_was_disabled && !irqs_disabled()) { 201 if (irq_was_disabled && !irqs_disabled()) {
195 printk(KERN_WARNING "parse_args(): option '%s' enabled " 202 printk(KERN_WARNING "parse_args(): option '%s' enabled "
196 "irq's!\n", param); 203 "irq's!\n", param);
@@ -298,35 +305,18 @@ EXPORT_SYMBOL(param_ops_charp);
298/* Actually could be a bool or an int, for historical reasons. */ 305/* Actually could be a bool or an int, for historical reasons. */
299int param_set_bool(const char *val, const struct kernel_param *kp) 306int param_set_bool(const char *val, const struct kernel_param *kp)
300{ 307{
301 bool v;
302 int ret;
303
304 /* No equals means "set"... */ 308 /* No equals means "set"... */
305 if (!val) val = "1"; 309 if (!val) val = "1";
306 310
307 /* One of =[yYnN01] */ 311 /* One of =[yYnN01] */
308 ret = strtobool(val, &v); 312 return strtobool(val, kp->arg);
309 if (ret)
310 return ret;
311
312 if (kp->flags & KPARAM_ISBOOL)
313 *(bool *)kp->arg = v;
314 else
315 *(int *)kp->arg = v;
316 return 0;
317} 313}
318EXPORT_SYMBOL(param_set_bool); 314EXPORT_SYMBOL(param_set_bool);
319 315
320int param_get_bool(char *buffer, const struct kernel_param *kp) 316int param_get_bool(char *buffer, const struct kernel_param *kp)
321{ 317{
322 bool val;
323 if (kp->flags & KPARAM_ISBOOL)
324 val = *(bool *)kp->arg;
325 else
326 val = *(int *)kp->arg;
327
328 /* Y and N chosen as being relatively non-coder friendly */ 318 /* Y and N chosen as being relatively non-coder friendly */
329 return sprintf(buffer, "%c", val ? 'Y' : 'N'); 319 return sprintf(buffer, "%c", *(bool *)kp->arg ? 'Y' : 'N');
330} 320}
331EXPORT_SYMBOL(param_get_bool); 321EXPORT_SYMBOL(param_get_bool);
332 322
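A hedged module sketch of what this simplification implies for callers: with the KPARAM_ISBOOL flag and the int fallback gone, a bool parameter has to be backed by an actual bool variable. Module and parameter names are made up; this is illustrative, not a tested module.

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/moduleparam.h>

static bool enable_feature = true;	/* an 'int' here would no longer be valid */
module_param(enable_feature, bool, 0644);
MODULE_PARM_DESC(enable_feature, "Enable the example feature (Y/N)");

static int __init bool_param_example_init(void)
{
	pr_info("bool_param_example: enable_feature=%c\n",
		enable_feature ? 'Y' : 'N');
	return 0;
}

static void __exit bool_param_example_exit(void)
{
}

module_init(bool_param_example_init);
module_exit(bool_param_example_exit);
MODULE_LICENSE("GPL");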
@@ -344,7 +334,6 @@ int param_set_invbool(const char *val, const struct kernel_param *kp)
344 struct kernel_param dummy; 334 struct kernel_param dummy;
345 335
346 dummy.arg = &boolval; 336 dummy.arg = &boolval;
347 dummy.flags = KPARAM_ISBOOL;
348 ret = param_set_bool(val, &dummy); 337 ret = param_set_bool(val, &dummy);
349 if (ret == 0) 338 if (ret == 0)
350 *(bool *)kp->arg = !boolval; 339 *(bool *)kp->arg = !boolval;
@@ -373,7 +362,6 @@ int param_set_bint(const char *val, const struct kernel_param *kp)
373 /* Match bool exactly, by re-using it. */ 362 /* Match bool exactly, by re-using it. */
374 boolkp = *kp; 363 boolkp = *kp;
375 boolkp.arg = &v; 364 boolkp.arg = &v;
376 boolkp.flags |= KPARAM_ISBOOL;
377 365
378 ret = param_set_bool(val, &boolkp); 366 ret = param_set_bool(val, &boolkp);
379 if (ret == 0) 367 if (ret == 0)
@@ -394,7 +382,7 @@ static int param_array(const char *name,
394 unsigned int min, unsigned int max, 382 unsigned int min, unsigned int max,
395 void *elem, int elemsize, 383 void *elem, int elemsize,
396 int (*set)(const char *, const struct kernel_param *kp), 384 int (*set)(const char *, const struct kernel_param *kp),
397 u16 flags, 385 s16 level,
398 unsigned int *num) 386 unsigned int *num)
399{ 387{
400 int ret; 388 int ret;
@@ -404,7 +392,7 @@ static int param_array(const char *name,
404 /* Get the name right for errors. */ 392 /* Get the name right for errors. */
405 kp.name = name; 393 kp.name = name;
406 kp.arg = elem; 394 kp.arg = elem;
407 kp.flags = flags; 395 kp.level = level;
408 396
409 *num = 0; 397 *num = 0;
410 /* We expect a comma-separated list of values. */ 398 /* We expect a comma-separated list of values. */
@@ -445,7 +433,7 @@ static int param_array_set(const char *val, const struct kernel_param *kp)
445 unsigned int temp_num; 433 unsigned int temp_num;
446 434
447 return param_array(kp->name, val, 1, arr->max, arr->elem, 435 return param_array(kp->name, val, 1, arr->max, arr->elem,
448 arr->elemsize, arr->ops->set, kp->flags, 436 arr->elemsize, arr->ops->set, kp->level,
449 arr->num ?: &temp_num); 437 arr->num ?: &temp_num);
450} 438}
451 439
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a8968396046d..57bc1fd35b3c 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -15,6 +15,7 @@
15#include <linux/acct.h> 15#include <linux/acct.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/proc_fs.h> 17#include <linux/proc_fs.h>
18#include <linux/reboot.h>
18 19
19#define BITS_PER_PAGE (PAGE_SIZE*8) 20#define BITS_PER_PAGE (PAGE_SIZE*8)
20 21
@@ -168,13 +169,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
168 while (nr > 0) { 169 while (nr > 0) {
169 rcu_read_lock(); 170 rcu_read_lock();
170 171
171 /*
172 * Any nested-container's init processes won't ignore the
173 * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser().
174 */
175 task = pid_task(find_vpid(nr), PIDTYPE_PID); 172 task = pid_task(find_vpid(nr), PIDTYPE_PID);
176 if (task) 173 if (task && !__fatal_signal_pending(task))
177 send_sig_info(SIGKILL, SEND_SIG_NOINFO, task); 174 send_sig_info(SIGKILL, SEND_SIG_FORCED, task);
178 175
179 rcu_read_unlock(); 176 rcu_read_unlock();
180 177
@@ -187,6 +184,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
187 rc = sys_wait4(-1, NULL, __WALL, NULL); 184 rc = sys_wait4(-1, NULL, __WALL, NULL);
188 } while (rc != -ECHILD); 185 } while (rc != -ECHILD);
189 186
187 if (pid_ns->reboot)
188 current->signal->group_exit_code = pid_ns->reboot;
189
190 acct_exit_ns(pid_ns); 190 acct_exit_ns(pid_ns);
191 return; 191 return;
192} 192}
@@ -221,6 +221,35 @@ static struct ctl_table pid_ns_ctl_table[] = {
221 221
222static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; 222static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
223 223
224int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
225{
226 if (pid_ns == &init_pid_ns)
227 return 0;
228
229 switch (cmd) {
230 case LINUX_REBOOT_CMD_RESTART2:
231 case LINUX_REBOOT_CMD_RESTART:
232 pid_ns->reboot = SIGHUP;
233 break;
234
235 case LINUX_REBOOT_CMD_POWER_OFF:
236 case LINUX_REBOOT_CMD_HALT:
237 pid_ns->reboot = SIGINT;
238 break;
239 default:
240 return -EINVAL;
241 }
242
243 read_lock(&tasklist_lock);
244 force_sig(SIGKILL, pid_ns->child_reaper);
245 read_unlock(&tasklist_lock);
246
247 do_exit(0);
248
249 /* Not reached */
250 return 0;
251}
252
224static __init int pid_namespaces_init(void) 253static __init int pid_namespaces_init(void)
225{ 254{
226 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); 255 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
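To see the new reboot_pid_ns() behaviour from the outside, a hedged userspace sketch: the init of a child pid namespace calls reboot(2) and the parent reads the request back as the child's termination signal, SIGHUP for restart and SIGINT for halt/power-off. Running it needs root (CLONE_NEWPID and CAP_SYS_BOOT); the stack size and RB_AUTOBOOT are arbitrary choices for the example.

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <sys/reboot.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

static char child_stack[64 * 1024];

static int ns_init(void *arg)
{
	(void)arg;
	/* Inside a non-initial pid namespace this no longer reboots the box. */
	reboot(RB_AUTOBOOT);		/* RB_AUTOBOOT == LINUX_REBOOT_CMD_RESTART */
	return 0;			/* not reached when the call succeeds */
}

int main(void)
{
	int status;
	pid_t pid = clone(ns_init, child_stack + sizeof(child_stack),
			  CLONE_NEWPID | SIGCHLD, NULL);

	if (pid < 0) {
		perror("clone");
		return 1;
	}
	waitpid(pid, &status, 0);
	if (WIFSIGNALED(status))
		printf("namespace init killed by signal %d (SIGHUP means restart)\n",
		       WTERMSIG(status));
	return 0;
}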
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 07e0e28ffba7..66d808ec5252 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,7 +1,8 @@
1 1
2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG 2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
3 3
4obj-$(CONFIG_PM) += main.o qos.o 4obj-y += qos.o
5obj-$(CONFIG_PM) += main.o
5obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o 6obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o
6obj-$(CONFIG_FREEZER) += process.o 7obj-$(CONFIG_FREEZER) += process.o
7obj-$(CONFIG_SUSPEND) += suspend.o 8obj-$(CONFIG_SUSPEND) += suspend.o
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 6d6d28870335..0a186cfde788 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -245,8 +245,8 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop,
245 * create_image - Create a hibernation image. 245 * create_image - Create a hibernation image.
246 * @platform_mode: Whether or not to use the platform driver. 246 * @platform_mode: Whether or not to use the platform driver.
247 * 247 *
248 * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image 248 * Execute device drivers' "late" and "noirq" freeze callbacks, create a
249 * and execute the drivers' .thaw_noirq() callbacks. 249 * hibernation image and run the drivers' "noirq" and "early" thaw callbacks.
250 * 250 *
251 * Control reappears in this routine after the subsequent restore. 251 * Control reappears in this routine after the subsequent restore.
252 */ 252 */
@@ -254,7 +254,7 @@ static int create_image(int platform_mode)
254{ 254{
255 int error; 255 int error;
256 256
257 error = dpm_suspend_noirq(PMSG_FREEZE); 257 error = dpm_suspend_end(PMSG_FREEZE);
258 if (error) { 258 if (error) {
259 printk(KERN_ERR "PM: Some devices failed to power down, " 259 printk(KERN_ERR "PM: Some devices failed to power down, "
260 "aborting hibernation\n"); 260 "aborting hibernation\n");
@@ -306,7 +306,7 @@ static int create_image(int platform_mode)
306 Platform_finish: 306 Platform_finish:
307 platform_finish(platform_mode); 307 platform_finish(platform_mode);
308 308
309 dpm_resume_noirq(in_suspend ? 309 dpm_resume_start(in_suspend ?
310 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 310 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
311 311
312 return error; 312 return error;
@@ -343,13 +343,13 @@ int hibernation_snapshot(int platform_mode)
343 * successful freezer test. 343 * successful freezer test.
344 */ 344 */
345 freezer_test_done = true; 345 freezer_test_done = true;
346 goto Cleanup; 346 goto Thaw;
347 } 347 }
348 348
349 error = dpm_prepare(PMSG_FREEZE); 349 error = dpm_prepare(PMSG_FREEZE);
350 if (error) { 350 if (error) {
351 dpm_complete(PMSG_RECOVER); 351 dpm_complete(PMSG_RECOVER);
352 goto Cleanup; 352 goto Thaw;
353 } 353 }
354 354
355 suspend_console(); 355 suspend_console();
@@ -385,6 +385,8 @@ int hibernation_snapshot(int platform_mode)
385 platform_end(platform_mode); 385 platform_end(platform_mode);
386 return error; 386 return error;
387 387
388 Thaw:
389 thaw_kernel_threads();
388 Cleanup: 390 Cleanup:
389 swsusp_free(); 391 swsusp_free();
390 goto Close; 392 goto Close;
@@ -394,16 +396,16 @@ int hibernation_snapshot(int platform_mode)
394 * resume_target_kernel - Restore system state from a hibernation image. 396 * resume_target_kernel - Restore system state from a hibernation image.
395 * @platform_mode: Whether or not to use the platform driver. 397 * @platform_mode: Whether or not to use the platform driver.
396 * 398 *
397 * Execute device drivers' .freeze_noirq() callbacks, restore the contents of 399 * Execute device drivers' "noirq" and "late" freeze callbacks, restore the
398 * highmem that have not been restored yet from the image and run the low-level 400 * contents of highmem that have not been restored yet from the image and run
399 * code that will restore the remaining contents of memory and switch to the 401 * the low-level code that will restore the remaining contents of memory and
400 * just restored target kernel. 402 * switch to the just restored target kernel.
401 */ 403 */
402static int resume_target_kernel(bool platform_mode) 404static int resume_target_kernel(bool platform_mode)
403{ 405{
404 int error; 406 int error;
405 407
406 error = dpm_suspend_noirq(PMSG_QUIESCE); 408 error = dpm_suspend_end(PMSG_QUIESCE);
407 if (error) { 409 if (error) {
408 printk(KERN_ERR "PM: Some devices failed to power down, " 410 printk(KERN_ERR "PM: Some devices failed to power down, "
409 "aborting resume\n"); 411 "aborting resume\n");
@@ -460,7 +462,7 @@ static int resume_target_kernel(bool platform_mode)
460 Cleanup: 462 Cleanup:
461 platform_restore_cleanup(platform_mode); 463 platform_restore_cleanup(platform_mode);
462 464
463 dpm_resume_noirq(PMSG_RECOVER); 465 dpm_resume_start(PMSG_RECOVER);
464 466
465 return error; 467 return error;
466} 468}
@@ -518,7 +520,7 @@ int hibernation_platform_enter(void)
518 goto Resume_devices; 520 goto Resume_devices;
519 } 521 }
520 522
521 error = dpm_suspend_noirq(PMSG_HIBERNATE); 523 error = dpm_suspend_end(PMSG_HIBERNATE);
522 if (error) 524 if (error)
523 goto Resume_devices; 525 goto Resume_devices;
524 526
@@ -549,7 +551,7 @@ int hibernation_platform_enter(void)
549 Platform_finish: 551 Platform_finish:
550 hibernation_ops->finish(); 552 hibernation_ops->finish();
551 553
552 dpm_resume_noirq(PMSG_RESTORE); 554 dpm_resume_start(PMSG_RESTORE);
553 555
554 Resume_devices: 556 Resume_devices:
555 entering_platform_hibernation = false; 557 entering_platform_hibernation = false;
@@ -616,7 +618,7 @@ int hibernate(void)
616 /* Allocate memory management structures */ 618 /* Allocate memory management structures */
617 error = create_basic_memory_bitmaps(); 619 error = create_basic_memory_bitmaps();
618 if (error) 620 if (error)
619 goto Exit; 621 goto Enable_umh;
620 622
621 printk(KERN_INFO "PM: Syncing filesystems ... "); 623 printk(KERN_INFO "PM: Syncing filesystems ... ");
622 sys_sync(); 624 sys_sync();
@@ -624,15 +626,11 @@ int hibernate(void)
624 626
625 error = freeze_processes(); 627 error = freeze_processes();
626 if (error) 628 if (error)
627 goto Finish; 629 goto Free_bitmaps;
628 630
629 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); 631 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
630 if (error) 632 if (error || freezer_test_done)
631 goto Thaw;
632 if (freezer_test_done) {
633 freezer_test_done = false;
634 goto Thaw; 633 goto Thaw;
635 }
636 634
637 if (in_suspend) { 635 if (in_suspend) {
638 unsigned int flags = 0; 636 unsigned int flags = 0;
@@ -657,8 +655,13 @@ int hibernate(void)
657 655
658 Thaw: 656 Thaw:
659 thaw_processes(); 657 thaw_processes();
660 Finish: 658
659 /* Don't bother checking whether freezer_test_done is true */
660 freezer_test_done = false;
661
662 Free_bitmaps:
661 free_basic_memory_bitmaps(); 663 free_basic_memory_bitmaps();
664 Enable_umh:
662 usermodehelper_enable(); 665 usermodehelper_enable();
663 Exit: 666 Exit:
664 pm_notifier_call_chain(PM_POST_HIBERNATION); 667 pm_notifier_call_chain(PM_POST_HIBERNATION);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 9824b41e5a18..1c12581f1c62 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -165,16 +165,20 @@ static int suspend_stats_show(struct seq_file *s, void *unused)
165 last_errno %= REC_FAILED_NUM; 165 last_errno %= REC_FAILED_NUM;
166 last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; 166 last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1;
167 last_step %= REC_FAILED_NUM; 167 last_step %= REC_FAILED_NUM;
168 seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n" 168 seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n"
169 "%s: %d\n%s: %d\n%s: %d\n%s: %d\n", 169 "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n",
170 "success", suspend_stats.success, 170 "success", suspend_stats.success,
171 "fail", suspend_stats.fail, 171 "fail", suspend_stats.fail,
172 "failed_freeze", suspend_stats.failed_freeze, 172 "failed_freeze", suspend_stats.failed_freeze,
173 "failed_prepare", suspend_stats.failed_prepare, 173 "failed_prepare", suspend_stats.failed_prepare,
174 "failed_suspend", suspend_stats.failed_suspend, 174 "failed_suspend", suspend_stats.failed_suspend,
175 "failed_suspend_late",
176 suspend_stats.failed_suspend_late,
175 "failed_suspend_noirq", 177 "failed_suspend_noirq",
176 suspend_stats.failed_suspend_noirq, 178 suspend_stats.failed_suspend_noirq,
177 "failed_resume", suspend_stats.failed_resume, 179 "failed_resume", suspend_stats.failed_resume,
180 "failed_resume_early",
181 suspend_stats.failed_resume_early,
178 "failed_resume_noirq", 182 "failed_resume_noirq",
179 suspend_stats.failed_resume_noirq); 183 suspend_stats.failed_resume_noirq);
180 seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", 184 seq_printf(s, "failures:\n last_failed_dev:\t%-s\n",
@@ -287,16 +291,10 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
287 291
288#ifdef CONFIG_SUSPEND 292#ifdef CONFIG_SUSPEND
289 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { 293 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {
290 if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) 294 if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) {
295 error = pm_suspend(state);
291 break; 296 break;
292 } 297 }
293 if (state < PM_SUSPEND_MAX && *s) {
294 error = enter_state(state);
295 if (error) {
296 suspend_stats.fail++;
297 dpm_save_failed_errno(error);
298 } else
299 suspend_stats.success++;
300 } 298 }
301#endif 299#endif
302 300
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 21724eee5206..98f3622d7407 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -177,13 +177,11 @@ extern const char *const pm_states[];
177 177
178extern bool valid_state(suspend_state_t state); 178extern bool valid_state(suspend_state_t state);
179extern int suspend_devices_and_enter(suspend_state_t state); 179extern int suspend_devices_and_enter(suspend_state_t state);
180extern int enter_state(suspend_state_t state);
181#else /* !CONFIG_SUSPEND */ 180#else /* !CONFIG_SUSPEND */
182static inline int suspend_devices_and_enter(suspend_state_t state) 181static inline int suspend_devices_and_enter(suspend_state_t state)
183{ 182{
184 return -ENOSYS; 183 return -ENOSYS;
185} 184}
186static inline int enter_state(suspend_state_t state) { return -ENOSYS; }
187static inline bool valid_state(suspend_state_t state) { return false; } 185static inline bool valid_state(suspend_state_t state) { return false; }
188#endif /* !CONFIG_SUSPEND */ 186#endif /* !CONFIG_SUSPEND */
189 187
@@ -234,16 +232,14 @@ static inline int suspend_freeze_processes(void)
234 int error; 232 int error;
235 233
236 error = freeze_processes(); 234 error = freeze_processes();
237
238 /* 235 /*
239 * freeze_processes() automatically thaws every task if freezing 236 * freeze_processes() automatically thaws every task if freezing
240 * fails. So we need not do anything extra upon error. 237 * fails. So we need not do anything extra upon error.
241 */ 238 */
242 if (error) 239 if (error)
243 goto Finish; 240 return error;
244 241
245 error = freeze_kernel_threads(); 242 error = freeze_kernel_threads();
246
247 /* 243 /*
248 * freeze_kernel_threads() thaws only kernel threads upon freezing 244 * freeze_kernel_threads() thaws only kernel threads upon freezing
249 * failure. So we have to thaw the userspace tasks ourselves. 245 * failure. So we have to thaw the userspace tasks ourselves.
@@ -251,7 +247,6 @@ static inline int suspend_freeze_processes(void)
251 if (error) 247 if (error)
252 thaw_processes(); 248 thaw_processes();
253 249
254 Finish:
255 return error; 250 return error;
256} 251}
257 252
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 7e426459e60a..0d2aeb226108 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -53,11 +53,9 @@ static int try_to_freeze_tasks(bool user_only)
53 * It is "frozen enough". If the task does wake 53 * It is "frozen enough". If the task does wake
54 * up, it will immediately call try_to_freeze. 54 * up, it will immediately call try_to_freeze.
55 * 55 *
56 * Because freeze_task() goes through p's 56 * Because freeze_task() goes through p's scheduler lock, it's
57 * scheduler lock after setting TIF_FREEZE, it's 57 * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING
58 * guaranteed that either we see TASK_RUNNING or 58 * transition can't race with task state testing here.
59 * try_to_stop() after schedule() in ptrace/signal
60 * stop sees TIF_FREEZE.
61 */ 59 */
62 if (!task_is_stopped_or_traced(p) && 60 if (!task_is_stopped_or_traced(p) &&
63 !freezer_should_skip(p)) 61 !freezer_should_skip(p))
@@ -98,13 +96,15 @@ static int try_to_freeze_tasks(bool user_only)
98 elapsed_csecs / 100, elapsed_csecs % 100, 96 elapsed_csecs / 100, elapsed_csecs % 100,
99 todo - wq_busy, wq_busy); 97 todo - wq_busy, wq_busy);
100 98
101 read_lock(&tasklist_lock); 99 if (!wakeup) {
102 do_each_thread(g, p) { 100 read_lock(&tasklist_lock);
103 if (!wakeup && !freezer_should_skip(p) && 101 do_each_thread(g, p) {
104 p != current && freezing(p) && !frozen(p)) 102 if (p != current && !freezer_should_skip(p)
105 sched_show_task(p); 103 && freezing(p) && !frozen(p))
106 } while_each_thread(g, p); 104 sched_show_task(p);
107 read_unlock(&tasklist_lock); 105 } while_each_thread(g, p);
106 read_unlock(&tasklist_lock);
107 }
108 } else { 108 } else {
109 printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, 109 printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100,
110 elapsed_csecs % 100); 110 elapsed_csecs % 100);
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 995e3bd3417b..d6d6dbd1ecc0 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -469,21 +469,18 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
469static int __init pm_qos_power_init(void) 469static int __init pm_qos_power_init(void)
470{ 470{
471 int ret = 0; 471 int ret = 0;
472 int i;
472 473
473 ret = register_pm_qos_misc(&cpu_dma_pm_qos); 474 BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES);
474 if (ret < 0) { 475
475 printk(KERN_ERR "pm_qos_param: cpu_dma_latency setup failed\n"); 476 for (i = 1; i < PM_QOS_NUM_CLASSES; i++) {
476 return ret; 477 ret = register_pm_qos_misc(pm_qos_array[i]);
477 } 478 if (ret < 0) {
478 ret = register_pm_qos_misc(&network_lat_pm_qos); 479 printk(KERN_ERR "pm_qos_param: %s setup failed\n",
479 if (ret < 0) { 480 pm_qos_array[i]->name);
480 printk(KERN_ERR "pm_qos_param: network_latency setup failed\n"); 481 return ret;
481 return ret; 482 }
482 } 483 }
483 ret = register_pm_qos_misc(&network_throughput_pm_qos);
484 if (ret < 0)
485 printk(KERN_ERR
486 "pm_qos_param: network_throughput setup failed\n");
487 484
488 return ret; 485 return ret;
489} 486}
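The same table-driven registration pattern, reduced to a userspace sketch with a mocked register step; only the loop-and-bail-out control flow mirrors pm_qos_power_init(), the array contents are invented.

#include <stdio.h>

struct fake_qos {
	const char *name;
};

static struct fake_qos cpu_dma  = { "cpu_dma_latency" };
static struct fake_qos net_lat  = { "network_latency" };
static struct fake_qos net_tput = { "network_throughput" };

/* Index 0 is unused, as in pm_qos_array. */
static struct fake_qos *fake_qos_array[] = { NULL, &cpu_dma, &net_lat, &net_tput };

static int fake_register(struct fake_qos *q)
{
	printf("registering %s\n", q->name);
	return 0;		/* a real registration could fail */
}

int main(void)
{
	int ret = 0;
	size_t i, n = sizeof(fake_qos_array) / sizeof(fake_qos_array[0]);

	for (i = 1; i < n; i++) {
		ret = fake_register(fake_qos_array[i]);
		if (ret < 0) {
			fprintf(stderr, "%s setup failed\n", fake_qos_array[i]->name);
			break;		/* stop at the first failure, like the hunk */
		}
	}
	return ret;
}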
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 6a768e537001..0de28576807d 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -711,9 +711,10 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
711 list_for_each_entry(region, &nosave_regions, list) { 711 list_for_each_entry(region, &nosave_regions, list) {
712 unsigned long pfn; 712 unsigned long pfn;
713 713
714 pr_debug("PM: Marking nosave pages: %016lx - %016lx\n", 714 pr_debug("PM: Marking nosave pages: [mem %#010llx-%#010llx]\n",
715 region->start_pfn << PAGE_SHIFT, 715 (unsigned long long) region->start_pfn << PAGE_SHIFT,
716 region->end_pfn << PAGE_SHIFT); 716 ((unsigned long long) region->end_pfn << PAGE_SHIFT)
717 - 1);
717 718
718 for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) 719 for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
719 if (pfn_valid(pfn)) { 720 if (pfn_valid(pfn)) {
@@ -1000,20 +1001,20 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
1000 s_page = pfn_to_page(src_pfn); 1001 s_page = pfn_to_page(src_pfn);
1001 d_page = pfn_to_page(dst_pfn); 1002 d_page = pfn_to_page(dst_pfn);
1002 if (PageHighMem(s_page)) { 1003 if (PageHighMem(s_page)) {
1003 src = kmap_atomic(s_page, KM_USER0); 1004 src = kmap_atomic(s_page);
1004 dst = kmap_atomic(d_page, KM_USER1); 1005 dst = kmap_atomic(d_page);
1005 do_copy_page(dst, src); 1006 do_copy_page(dst, src);
1006 kunmap_atomic(dst, KM_USER1); 1007 kunmap_atomic(dst);
1007 kunmap_atomic(src, KM_USER0); 1008 kunmap_atomic(src);
1008 } else { 1009 } else {
1009 if (PageHighMem(d_page)) { 1010 if (PageHighMem(d_page)) {
1010 /* Page pointed to by src may contain some kernel 1011 /* Page pointed to by src may contain some kernel
1011 * data modified by kmap_atomic() 1012 * data modified by kmap_atomic()
1012 */ 1013 */
1013 safe_copy_page(buffer, s_page); 1014 safe_copy_page(buffer, s_page);
1014 dst = kmap_atomic(d_page, KM_USER0); 1015 dst = kmap_atomic(d_page);
1015 copy_page(dst, buffer); 1016 copy_page(dst, buffer);
1016 kunmap_atomic(dst, KM_USER0); 1017 kunmap_atomic(dst);
1017 } else { 1018 } else {
1018 safe_copy_page(page_address(d_page), s_page); 1019 safe_copy_page(page_address(d_page), s_page);
1019 } 1020 }
@@ -1728,9 +1729,9 @@ int snapshot_read_next(struct snapshot_handle *handle)
1728 */ 1729 */
1729 void *kaddr; 1730 void *kaddr;
1730 1731
1731 kaddr = kmap_atomic(page, KM_USER0); 1732 kaddr = kmap_atomic(page);
1732 copy_page(buffer, kaddr); 1733 copy_page(buffer, kaddr);
1733 kunmap_atomic(kaddr, KM_USER0); 1734 kunmap_atomic(kaddr);
1734 handle->buffer = buffer; 1735 handle->buffer = buffer;
1735 } else { 1736 } else {
1736 handle->buffer = page_address(page); 1737 handle->buffer = page_address(page);
@@ -2014,9 +2015,9 @@ static void copy_last_highmem_page(void)
2014 if (last_highmem_page) { 2015 if (last_highmem_page) {
2015 void *dst; 2016 void *dst;
2016 2017
2017 dst = kmap_atomic(last_highmem_page, KM_USER0); 2018 dst = kmap_atomic(last_highmem_page);
2018 copy_page(dst, buffer); 2019 copy_page(dst, buffer);
2019 kunmap_atomic(dst, KM_USER0); 2020 kunmap_atomic(dst);
2020 last_highmem_page = NULL; 2021 last_highmem_page = NULL;
2021 } 2022 }
2022} 2023}
@@ -2309,13 +2310,13 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
2309{ 2310{
2310 void *kaddr1, *kaddr2; 2311 void *kaddr1, *kaddr2;
2311 2312
2312 kaddr1 = kmap_atomic(p1, KM_USER0); 2313 kaddr1 = kmap_atomic(p1);
2313 kaddr2 = kmap_atomic(p2, KM_USER1); 2314 kaddr2 = kmap_atomic(p2);
2314 copy_page(buf, kaddr1); 2315 copy_page(buf, kaddr1);
2315 copy_page(kaddr1, kaddr2); 2316 copy_page(kaddr1, kaddr2);
2316 copy_page(kaddr2, buf); 2317 copy_page(kaddr2, buf);
2317 kunmap_atomic(kaddr2, KM_USER1); 2318 kunmap_atomic(kaddr2);
2318 kunmap_atomic(kaddr1, KM_USER0); 2319 kunmap_atomic(kaddr1);
2319} 2320}
2320 2321
2321/** 2322/**
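For reference, a short kernel-style sketch of the calling convention these hunks convert to; the helper and its caller are hypothetical, and only the kmap_atomic()/kunmap_atomic() lines matter.

#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/string.h>

/* Hypothetical helper, shown only to illustrate the new API shape. */
static void copy_one_page(struct page *dst_page, struct page *src_page)
{
	void *src, *dst;

	src = kmap_atomic(src_page);	/* was: kmap_atomic(src_page, KM_USER0) */
	dst = kmap_atomic(dst_page);	/* was: kmap_atomic(dst_page, KM_USER1) */
	memcpy(dst, src, PAGE_SIZE);	/* snapshot.c itself uses copy_page() */
	kunmap_atomic(dst);		/* unmap takes the address, not a KM_* slot */
	kunmap_atomic(src);
}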
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4fd51beed879..88e5c967370d 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -37,8 +37,8 @@ const char *const pm_states[PM_SUSPEND_MAX] = {
37static const struct platform_suspend_ops *suspend_ops; 37static const struct platform_suspend_ops *suspend_ops;
38 38
39/** 39/**
40 * suspend_set_ops - Set the global suspend method table. 40 * suspend_set_ops - Set the global suspend method table.
41 * @ops: Pointer to ops structure. 41 * @ops: Suspend operations to use.
42 */ 42 */
43void suspend_set_ops(const struct platform_suspend_ops *ops) 43void suspend_set_ops(const struct platform_suspend_ops *ops)
44{ 44{
@@ -58,11 +58,11 @@ bool valid_state(suspend_state_t state)
58} 58}
59 59
60/** 60/**
61 * suspend_valid_only_mem - generic memory-only valid callback 61 * suspend_valid_only_mem - Generic memory-only valid callback.
62 * 62 *
63 * Platform drivers that implement mem suspend only and only need 63 * Platform drivers that implement mem suspend only and only need to check for
64 * to check for that in their .valid callback can use this instead 64 * that in their .valid() callback can use this instead of rolling their own
65 * of rolling their own .valid callback. 65 * .valid() callback.
66 */ 66 */
67int suspend_valid_only_mem(suspend_state_t state) 67int suspend_valid_only_mem(suspend_state_t state)
68{ 68{
@@ -83,10 +83,11 @@ static int suspend_test(int level)
83} 83}
84 84
85/** 85/**
86 * suspend_prepare - Do prep work before entering low-power state. 86 * suspend_prepare - Prepare for entering system sleep state.
87 * 87 *
88 * This is common code that is called for each state that we're entering. 88 * Common code run for every system sleep state that can be entered (except for
89 * Run suspend notifiers, allocate a console and stop all processes. 89 * hibernation). Run suspend notifiers, allocate the "suspend" console and
90 * freeze processes.
90 */ 91 */
91static int suspend_prepare(void) 92static int suspend_prepare(void)
92{ 93{
@@ -131,9 +132,9 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
131} 132}
132 133
133/** 134/**
134 * suspend_enter - enter the desired system sleep state. 135 * suspend_enter - Make the system enter the given sleep state.
135 * @state: State to enter 136 * @state: System sleep state to enter.
136 * @wakeup: Returns information that suspend should not be entered again. 137 * @wakeup: Returns information that the sleep state should not be re-entered.
137 * 138 *
138 * This function should be called after devices have been suspended. 139 * This function should be called after devices have been suspended.
139 */ 140 */
@@ -147,7 +148,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
147 goto Platform_finish; 148 goto Platform_finish;
148 } 149 }
149 150
150 error = dpm_suspend_noirq(PMSG_SUSPEND); 151 error = dpm_suspend_end(PMSG_SUSPEND);
151 if (error) { 152 if (error) {
152 printk(KERN_ERR "PM: Some devices failed to power down\n"); 153 printk(KERN_ERR "PM: Some devices failed to power down\n");
153 goto Platform_finish; 154 goto Platform_finish;
@@ -189,7 +190,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
189 if (suspend_ops->wake) 190 if (suspend_ops->wake)
190 suspend_ops->wake(); 191 suspend_ops->wake();
191 192
192 dpm_resume_noirq(PMSG_RESUME); 193 dpm_resume_start(PMSG_RESUME);
193 194
194 Platform_finish: 195 Platform_finish:
195 if (suspend_ops->finish) 196 if (suspend_ops->finish)
@@ -199,9 +200,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
199} 200}
200 201
201/** 202/**
202 * suspend_devices_and_enter - suspend devices and enter the desired system 203 * suspend_devices_and_enter - Suspend devices and enter system sleep state.
203 * sleep state. 204 * @state: System sleep state to enter.
204 * @state: state to enter
205 */ 205 */
206int suspend_devices_and_enter(suspend_state_t state) 206int suspend_devices_and_enter(suspend_state_t state)
207{ 207{
@@ -251,10 +251,10 @@ int suspend_devices_and_enter(suspend_state_t state)
251} 251}
252 252
253/** 253/**
254 * suspend_finish - Do final work before exiting suspend sequence. 254 * suspend_finish - Clean up before finishing the suspend sequence.
255 * 255 *
256 * Call platform code to clean up, restart processes, and free the 256 * Call platform code to clean up, restart processes, and free the console that
257 * console that we've allocated. This is not called for suspend-to-disk. 257 * we've allocated. This routine is not called for hibernation.
258 */ 258 */
259static void suspend_finish(void) 259static void suspend_finish(void)
260{ 260{
@@ -265,16 +265,14 @@ static void suspend_finish(void)
265} 265}
266 266
267/** 267/**
268 * enter_state - Do common work of entering low-power state. 268 * enter_state - Do common work needed to enter system sleep state.
269 * @state: pm_state structure for state we're entering. 269 * @state: System sleep state to enter.
270 * 270 *
271 * Make sure we're the only ones trying to enter a sleep state. Fail 271 * Make sure that no one else is trying to put the system into a sleep state.
272 * if someone has beat us to it, since we don't want anything weird to 272 * Fail if that's not the case. Otherwise, prepare for system suspend, make the
273 * happen when we wake up. 273 * system enter the given sleep state and clean up after wakeup.
274 * Then, do the setup for suspend, enter the state, and cleaup (after
275 * we've woken up).
276 */ 274 */
277int enter_state(suspend_state_t state) 275static int enter_state(suspend_state_t state)
278{ 276{
279 int error; 277 int error;
280 278
@@ -310,24 +308,26 @@ int enter_state(suspend_state_t state)
310} 308}
311 309
312/** 310/**
313 * pm_suspend - Externally visible function for suspending system. 311 * pm_suspend - Externally visible function for suspending the system.
314 * @state: Enumerated value of state to enter. 312 * @state: System sleep state to enter.
315 * 313 *
316 * Determine whether or not value is within range, get state 314 * Check if the value of @state represents one of the supported states,
317 * structure, and enter (above). 315 * execute enter_state() and update system suspend statistics.
318 */ 316 */
319int pm_suspend(suspend_state_t state) 317int pm_suspend(suspend_state_t state)
320{ 318{
321 int ret; 319 int error;
322 if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) { 320
323 ret = enter_state(state); 321 if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
324 if (ret) { 322 return -EINVAL;
325 suspend_stats.fail++; 323
326 dpm_save_failed_errno(ret); 324 error = enter_state(state);
327 } else 325 if (error) {
328 suspend_stats.success++; 326 suspend_stats.fail++;
329 return ret; 327 dpm_save_failed_errno(error);
328 } else {
329 suspend_stats.success++;
330 } 330 }
331 return -EINVAL; 331 return error;
332} 332}
333EXPORT_SYMBOL(pm_suspend); 333EXPORT_SYMBOL(pm_suspend);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 3e100075b13c..33c4329205af 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -249,16 +249,10 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
249 } 249 }
250 pm_restore_gfp_mask(); 250 pm_restore_gfp_mask();
251 error = hibernation_snapshot(data->platform_support); 251 error = hibernation_snapshot(data->platform_support);
252 if (error) { 252 if (!error) {
253 thaw_kernel_threads();
254 } else {
255 error = put_user(in_suspend, (int __user *)arg); 253 error = put_user(in_suspend, (int __user *)arg);
256 if (!error && !freezer_test_done) 254 data->ready = !freezer_test_done && !error;
257 data->ready = 1; 255 freezer_test_done = false;
258 if (freezer_test_done) {
259 freezer_test_done = false;
260 thaw_kernel_threads();
261 }
262 } 256 }
263 break; 257 break;
264 258
diff --git a/kernel/printk.c b/kernel/printk.c
index b64ce71cb2e5..b663c2c95d39 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -44,6 +44,9 @@
44 44
45#include <asm/uaccess.h> 45#include <asm/uaccess.h>
46 46
47#define CREATE_TRACE_POINTS
48#include <trace/events/printk.h>
49
47/* 50/*
48 * Architectures can override it: 51 * Architectures can override it:
49 */ 52 */
@@ -542,6 +545,8 @@ MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
542static void _call_console_drivers(unsigned start, 545static void _call_console_drivers(unsigned start,
543 unsigned end, int msg_log_level) 546 unsigned end, int msg_log_level)
544{ 547{
548 trace_console(&LOG_BUF(0), start, end, log_buf_len);
549
545 if ((msg_log_level < console_loglevel || ignore_loglevel) && 550 if ((msg_log_level < console_loglevel || ignore_loglevel) &&
546 console_drivers && start != end) { 551 console_drivers && start != end) {
547 if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { 552 if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) {
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 00ab2ca5ed11..ee8d49b9c309 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -231,26 +231,22 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
231} 231}
232 232
233static int ptrace_attach(struct task_struct *task, long request, 233static int ptrace_attach(struct task_struct *task, long request,
234 unsigned long addr,
234 unsigned long flags) 235 unsigned long flags)
235{ 236{
236 bool seize = (request == PTRACE_SEIZE); 237 bool seize = (request == PTRACE_SEIZE);
237 int retval; 238 int retval;
238 239
239 /*
240 * SEIZE will enable new ptrace behaviors which will be implemented
241 * gradually. SEIZE_DEVEL is used to prevent applications
242 * expecting full SEIZE behaviors trapping on kernel commits which
243 * are still in the process of implementing them.
244 *
245 * Only test programs for new ptrace behaviors being implemented
246 * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO.
247 *
248 * Once SEIZE behaviors are completely implemented, this flag and
249 * the following test will be removed.
250 */
251 retval = -EIO; 240 retval = -EIO;
252 if (seize && !(flags & PTRACE_SEIZE_DEVEL)) 241 if (seize) {
253 goto out; 242 if (addr != 0)
243 goto out;
244 if (flags & ~(unsigned long)PTRACE_O_MASK)
245 goto out;
246 flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT);
247 } else {
248 flags = PT_PTRACED;
249 }
254 250
255 audit_ptrace(task); 251 audit_ptrace(task);
256 252
@@ -262,7 +258,7 @@ static int ptrace_attach(struct task_struct *task, long request,
262 258
263 /* 259 /*
264 * Protect exec's credential calculations against our interference; 260 * Protect exec's credential calculations against our interference;
265 * interference; SUID, SGID and LSM creds get determined differently 261 * SUID, SGID and LSM creds get determined differently
266 * under ptrace. 262 * under ptrace.
267 */ 263 */
268 retval = -ERESTARTNOINTR; 264 retval = -ERESTARTNOINTR;
@@ -282,11 +278,11 @@ static int ptrace_attach(struct task_struct *task, long request,
282 if (task->ptrace) 278 if (task->ptrace)
283 goto unlock_tasklist; 279 goto unlock_tasklist;
284 280
285 task->ptrace = PT_PTRACED;
286 if (seize) 281 if (seize)
287 task->ptrace |= PT_SEIZED; 282 flags |= PT_SEIZED;
288 if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) 283 if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE))
289 task->ptrace |= PT_PTRACE_CAP; 284 flags |= PT_PTRACE_CAP;
285 task->ptrace = flags;
290 286
291 __ptrace_link(task, current); 287 __ptrace_link(task, current);
292 288
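A hedged userspace example of attaching under the rules this hunk enforces: addr must be 0 and data is interpreted as a PTRACE_O_* option mask, so the initial options can be set in the PTRACE_SEIZE call itself. Assumes a libc new enough to expose PTRACE_SEIZE.

#include <stdio.h>
#include <stdlib.h>
#include <sys/ptrace.h>
#include <sys/types.h>

int main(int argc, char **argv)
{
	pid_t pid;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}
	pid = (pid_t)atoi(argv[1]);

	/* addr (3rd argument) must be 0, or the kernel now returns -EIO. */
	if (ptrace(PTRACE_SEIZE, pid, 0L,
		   (void *)(long)PTRACE_O_TRACESYSGOOD) == -1) {
		perror("PTRACE_SEIZE");
		return 1;
	}
	printf("seized %d without stopping it\n", (int)pid);
	return 0;
}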
@@ -528,30 +524,18 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
528 524
529static int ptrace_setoptions(struct task_struct *child, unsigned long data) 525static int ptrace_setoptions(struct task_struct *child, unsigned long data)
530{ 526{
531 child->ptrace &= ~PT_TRACE_MASK; 527 unsigned flags;
532 528
533 if (data & PTRACE_O_TRACESYSGOOD) 529 if (data & ~(unsigned long)PTRACE_O_MASK)
534 child->ptrace |= PT_TRACESYSGOOD; 530 return -EINVAL;
535
536 if (data & PTRACE_O_TRACEFORK)
537 child->ptrace |= PT_TRACE_FORK;
538
539 if (data & PTRACE_O_TRACEVFORK)
540 child->ptrace |= PT_TRACE_VFORK;
541
542 if (data & PTRACE_O_TRACECLONE)
543 child->ptrace |= PT_TRACE_CLONE;
544
545 if (data & PTRACE_O_TRACEEXEC)
546 child->ptrace |= PT_TRACE_EXEC;
547
548 if (data & PTRACE_O_TRACEVFORKDONE)
549 child->ptrace |= PT_TRACE_VFORK_DONE;
550 531
551 if (data & PTRACE_O_TRACEEXIT) 532 /* Avoid intermediate state when all opts are cleared */
552 child->ptrace |= PT_TRACE_EXIT; 533 flags = child->ptrace;
534 flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT);
535 flags |= (data << PT_OPT_FLAG_SHIFT);
536 child->ptrace = flags;
553 537
554 return (data & ~PTRACE_O_MASK) ? -EINVAL : 0; 538 return 0;
555} 539}
556 540
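The option-packing trick used by the new ptrace_setoptions() (and by ptrace_attach() above), reduced to a userspace sketch; the constant values are stand-ins chosen for the example, not the kernel's actual PT_* definitions.

#include <stdio.h>

#define PT_PTRACED		0x00000001u
#define PT_OPT_FLAG_SHIFT	3		/* assumption: where option bits start */
#define PTRACE_O_MASK		0x0000007fu	/* assumption: seven option bits */

static unsigned int set_options(unsigned int ptrace_word, unsigned long data)
{
	unsigned int flags = ptrace_word;

	if (data & ~(unsigned long)PTRACE_O_MASK)
		return ptrace_word;	/* the kernel returns -EINVAL here */

	/* Clear the old option bits, then install the new set in one store,
	 * so no reader of the word ever sees a half-updated option mask. */
	flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT);
	flags |= (unsigned int)(data << PT_OPT_FLAG_SHIFT);
	return flags;
}

int main(void)
{
	unsigned int w = PT_PTRACED;

	w = set_options(w, 0x5);	/* pretend two PTRACE_O_* bits are set */
	printf("ptrace word: %#x\n", w);
	return 0;
}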
557static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) 541static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info)
@@ -891,7 +875,7 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
891 } 875 }
892 876
893 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { 877 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
894 ret = ptrace_attach(child, request, data); 878 ret = ptrace_attach(child, request, addr, data);
895 /* 879 /*
896 * Some architectures need to do book-keeping after 880 * Some architectures need to do book-keeping after
897 * a ptrace attach. 881 * a ptrace attach.
@@ -1034,7 +1018,7 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
1034 } 1018 }
1035 1019
1036 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { 1020 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
1037 ret = ptrace_attach(child, request, data); 1021 ret = ptrace_attach(child, request, addr, data);
1038 /* 1022 /*
1039 * Some architectures need to do book-keeping after 1023 * Some architectures need to do book-keeping after
1040 * a ptrace attach. 1024 * a ptrace attach.
diff --git a/kernel/rcu.h b/kernel/rcu.h
index aa88baab5f78..8ba99cdc6515 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu.h
@@ -33,8 +33,27 @@
33 * Process-level increment to ->dynticks_nesting field. This allows for 33 * Process-level increment to ->dynticks_nesting field. This allows for
34 * architectures that use half-interrupts and half-exceptions from 34 * architectures that use half-interrupts and half-exceptions from
35 * process context. 35 * process context.
36 *
37 * DYNTICK_TASK_NEST_MASK defines a field of width DYNTICK_TASK_NEST_WIDTH
38 * that counts the number of process-based reasons why RCU cannot
39 * consider the corresponding CPU to be idle, and DYNTICK_TASK_NEST_VALUE
40 * is the value used to increment or decrement this field.
41 *
42 * The rest of the bits could in principle be used to count interrupts,
43 * but this would mean that a negative-one value in the interrupt
44 * field could incorrectly zero out the DYNTICK_TASK_NEST_MASK field.
45 * We therefore provide a two-bit guard field defined by DYNTICK_TASK_MASK
46 * that is set to DYNTICK_TASK_FLAG upon initial exit from idle.
47 * The DYNTICK_TASK_EXIT_IDLE value is thus the combined value used upon
48 * initial exit from idle.
36 */ 49 */
37#define DYNTICK_TASK_NESTING (LLONG_MAX / 2 - 1) 50#define DYNTICK_TASK_NEST_WIDTH 7
51#define DYNTICK_TASK_NEST_VALUE ((LLONG_MAX >> DYNTICK_TASK_NEST_WIDTH) + 1)
52#define DYNTICK_TASK_NEST_MASK (LLONG_MAX - DYNTICK_TASK_NEST_VALUE + 1)
53#define DYNTICK_TASK_FLAG ((DYNTICK_TASK_NEST_VALUE / 8) * 2)
54#define DYNTICK_TASK_MASK ((DYNTICK_TASK_NEST_VALUE / 8) * 3)
55#define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \
56 DYNTICK_TASK_FLAG)
38 57
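A throwaway userspace program that simply evaluates the new DYNTICK_TASK_* macros, which can help when checking the bit layout described in the comment; the definitions are copied from the hunk and only the printing scaffold is added.

#include <limits.h>
#include <stdio.h>

#define DYNTICK_TASK_NEST_WIDTH 7
#define DYNTICK_TASK_NEST_VALUE ((LLONG_MAX >> DYNTICK_TASK_NEST_WIDTH) + 1)
#define DYNTICK_TASK_NEST_MASK  (LLONG_MAX - DYNTICK_TASK_NEST_VALUE + 1)
#define DYNTICK_TASK_FLAG       ((DYNTICK_TASK_NEST_VALUE / 8) * 2)
#define DYNTICK_TASK_MASK       ((DYNTICK_TASK_NEST_VALUE / 8) * 3)
#define DYNTICK_TASK_EXIT_IDLE  (DYNTICK_TASK_NEST_VALUE + DYNTICK_TASK_FLAG)

int main(void)
{
	printf("NEST_VALUE = %#llx\n", (unsigned long long)DYNTICK_TASK_NEST_VALUE);
	printf("NEST_MASK  = %#llx\n", (unsigned long long)DYNTICK_TASK_NEST_MASK);
	printf("FLAG       = %#llx\n", (unsigned long long)DYNTICK_TASK_FLAG);
	printf("TASK_MASK  = %#llx\n", (unsigned long long)DYNTICK_TASK_MASK);
	printf("EXIT_IDLE  = %#llx\n", (unsigned long long)DYNTICK_TASK_EXIT_IDLE);
	return 0;
}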
39/* 58/*
40 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally 59 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
@@ -50,7 +69,6 @@ extern struct debug_obj_descr rcuhead_debug_descr;
50 69
51static inline void debug_rcu_head_queue(struct rcu_head *head) 70static inline void debug_rcu_head_queue(struct rcu_head *head)
52{ 71{
53 WARN_ON_ONCE((unsigned long)head & 0x3);
54 debug_object_activate(head, &rcuhead_debug_descr); 72 debug_object_activate(head, &rcuhead_debug_descr);
55 debug_object_active_state(head, &rcuhead_debug_descr, 73 debug_object_active_state(head, &rcuhead_debug_descr,
56 STATE_RCU_HEAD_READY, 74 STATE_RCU_HEAD_READY,
@@ -76,16 +94,18 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
76 94
77extern void kfree(const void *); 95extern void kfree(const void *);
78 96
79static inline void __rcu_reclaim(char *rn, struct rcu_head *head) 97static inline bool __rcu_reclaim(char *rn, struct rcu_head *head)
80{ 98{
81 unsigned long offset = (unsigned long)head->func; 99 unsigned long offset = (unsigned long)head->func;
82 100
83 if (__is_kfree_rcu_offset(offset)) { 101 if (__is_kfree_rcu_offset(offset)) {
84 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); 102 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
85 kfree((void *)head - offset); 103 kfree((void *)head - offset);
104 return 1;
86 } else { 105 } else {
87 RCU_TRACE(trace_rcu_invoke_callback(rn, head)); 106 RCU_TRACE(trace_rcu_invoke_callback(rn, head));
88 head->func(head); 107 head->func(head);
108 return 0;
89 } 109 }
90} 110}
91 111
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 2bc4e135ff23..a86f1741cc27 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -88,6 +88,9 @@ EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
88 * section. 88 * section.
89 * 89 *
90 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. 90 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
91 *
92 * Note that rcu_read_lock() is disallowed if the CPU is either idle or
93 * offline from an RCU perspective, so check for those as well.
91 */ 94 */
92int rcu_read_lock_bh_held(void) 95int rcu_read_lock_bh_held(void)
93{ 96{
@@ -95,6 +98,8 @@ int rcu_read_lock_bh_held(void)
95 return 1; 98 return 1;
96 if (rcu_is_cpu_idle()) 99 if (rcu_is_cpu_idle())
97 return 0; 100 return 0;
101 if (!rcu_lockdep_current_cpu_online())
102 return 0;
98 return in_softirq() || irqs_disabled(); 103 return in_softirq() || irqs_disabled();
99} 104}
100EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); 105EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 977296dca0a4..37a5444204d2 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -53,7 +53,7 @@ static void __call_rcu(struct rcu_head *head,
53 53
54#include "rcutiny_plugin.h" 54#include "rcutiny_plugin.h"
55 55
56static long long rcu_dynticks_nesting = DYNTICK_TASK_NESTING; 56static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
57 57
58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ 58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
59static void rcu_idle_enter_common(long long oldval) 59static void rcu_idle_enter_common(long long oldval)
@@ -88,10 +88,16 @@ void rcu_idle_enter(void)
88 88
89 local_irq_save(flags); 89 local_irq_save(flags);
90 oldval = rcu_dynticks_nesting; 90 oldval = rcu_dynticks_nesting;
91 rcu_dynticks_nesting = 0; 91 WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0);
92 if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) ==
93 DYNTICK_TASK_NEST_VALUE)
94 rcu_dynticks_nesting = 0;
95 else
96 rcu_dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
92 rcu_idle_enter_common(oldval); 97 rcu_idle_enter_common(oldval);
93 local_irq_restore(flags); 98 local_irq_restore(flags);
94} 99}
100EXPORT_SYMBOL_GPL(rcu_idle_enter);
95 101
96/* 102/*
97 * Exit an interrupt handler towards idle. 103 * Exit an interrupt handler towards idle.
@@ -140,11 +146,15 @@ void rcu_idle_exit(void)
140 146
141 local_irq_save(flags); 147 local_irq_save(flags);
142 oldval = rcu_dynticks_nesting; 148 oldval = rcu_dynticks_nesting;
143 WARN_ON_ONCE(oldval != 0); 149 WARN_ON_ONCE(rcu_dynticks_nesting < 0);
144 rcu_dynticks_nesting = DYNTICK_TASK_NESTING; 150 if (rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK)
151 rcu_dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
152 else
153 rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
145 rcu_idle_exit_common(oldval); 154 rcu_idle_exit_common(oldval);
146 local_irq_restore(flags); 155 local_irq_restore(flags);
147} 156}
157EXPORT_SYMBOL_GPL(rcu_idle_exit);
148 158
149/* 159/*
150 * Enter an interrupt handler, moving away from idle. 160 * Enter an interrupt handler, moving away from idle.
@@ -258,7 +268,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
258 268
259 /* If no RCU callbacks ready to invoke, just return. */ 269 /* If no RCU callbacks ready to invoke, just return. */
260 if (&rcp->rcucblist == rcp->donetail) { 270 if (&rcp->rcucblist == rcp->donetail) {
261 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); 271 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1));
262 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, 272 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,
263 ACCESS_ONCE(rcp->rcucblist), 273 ACCESS_ONCE(rcp->rcucblist),
264 need_resched(), 274 need_resched(),
@@ -269,7 +279,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
269 279
270 /* Move the ready-to-invoke callbacks to a local list. */ 280 /* Move the ready-to-invoke callbacks to a local list. */
271 local_irq_save(flags); 281 local_irq_save(flags);
272 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); 282 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1));
273 list = rcp->rcucblist; 283 list = rcp->rcucblist;
274 rcp->rcucblist = *rcp->donetail; 284 rcp->rcucblist = *rcp->donetail;
275 *rcp->donetail = NULL; 285 *rcp->donetail = NULL;
@@ -319,6 +329,10 @@ static void rcu_process_callbacks(struct softirq_action *unused)
319 */ 329 */
320void synchronize_sched(void) 330void synchronize_sched(void)
321{ 331{
332 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
333 !lock_is_held(&rcu_lock_map) &&
334 !lock_is_held(&rcu_sched_lock_map),
335 "Illegal synchronize_sched() in RCU read-side critical section");
322 cond_resched(); 336 cond_resched();
323} 337}
324EXPORT_SYMBOL_GPL(synchronize_sched); 338EXPORT_SYMBOL_GPL(synchronize_sched);
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 9cb1ae4aabdd..22ecea0dfb62 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -132,6 +132,7 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
132 RCU_TRACE(.rcb.name = "rcu_preempt") 132 RCU_TRACE(.rcb.name = "rcu_preempt")
133}; 133};
134 134
135static void rcu_read_unlock_special(struct task_struct *t);
135static int rcu_preempted_readers_exp(void); 136static int rcu_preempted_readers_exp(void);
136static void rcu_report_exp_done(void); 137static void rcu_report_exp_done(void);
137 138
@@ -146,6 +147,16 @@ static int rcu_cpu_blocking_cur_gp(void)
146/* 147/*
147 * Check for a running RCU reader. Because there is only one CPU, 148 * Check for a running RCU reader. Because there is only one CPU,
148 * there can be but one running RCU reader at a time. ;-) 149 * there can be but one running RCU reader at a time. ;-)
150 *
151 * Returns zero if there are no running readers. Returns a positive
152 * number if there is at least one reader within its RCU read-side
153 * critical section. Returns a negative number if an outermost reader
 154 * is in the midst of exiting from its RCU read-side critical section.
149 */ 160 */
150static int rcu_preempt_running_reader(void) 161static int rcu_preempt_running_reader(void)
151{ 162{
@@ -307,7 +318,6 @@ static int rcu_boost(void)
307 t = container_of(tb, struct task_struct, rcu_node_entry); 318 t = container_of(tb, struct task_struct, rcu_node_entry);
308 rt_mutex_init_proxy_locked(&mtx, t); 319 rt_mutex_init_proxy_locked(&mtx, t);
309 t->rcu_boost_mutex = &mtx; 320 t->rcu_boost_mutex = &mtx;
310 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
311 raw_local_irq_restore(flags); 321 raw_local_irq_restore(flags);
312 rt_mutex_lock(&mtx); 322 rt_mutex_lock(&mtx);
313 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 323 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
@@ -475,7 +485,7 @@ void rcu_preempt_note_context_switch(void)
475 unsigned long flags; 485 unsigned long flags;
476 486
477 local_irq_save(flags); /* must exclude scheduler_tick(). */ 487 local_irq_save(flags); /* must exclude scheduler_tick(). */
478 if (rcu_preempt_running_reader() && 488 if (rcu_preempt_running_reader() > 0 &&
479 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 489 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
480 490
481 /* Possibly blocking in an RCU read-side critical section. */ 491 /* Possibly blocking in an RCU read-side critical section. */
@@ -494,6 +504,13 @@ void rcu_preempt_note_context_switch(void)
494 list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks); 504 list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks);
495 if (rcu_cpu_blocking_cur_gp()) 505 if (rcu_cpu_blocking_cur_gp())
496 rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry; 506 rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry;
507 } else if (rcu_preempt_running_reader() < 0 &&
508 t->rcu_read_unlock_special) {
509 /*
510 * Complete exit from RCU read-side critical section on
511 * behalf of preempted instance of __rcu_read_unlock().
512 */
513 rcu_read_unlock_special(t);
497 } 514 }
498 515
499 /* 516 /*
@@ -526,12 +543,15 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock);
526 * notify RCU core processing or task having blocked during the RCU 543 * notify RCU core processing or task having blocked during the RCU
527 * read-side critical section. 544 * read-side critical section.
528 */ 545 */
529static void rcu_read_unlock_special(struct task_struct *t) 546static noinline void rcu_read_unlock_special(struct task_struct *t)
530{ 547{
531 int empty; 548 int empty;
532 int empty_exp; 549 int empty_exp;
533 unsigned long flags; 550 unsigned long flags;
534 struct list_head *np; 551 struct list_head *np;
552#ifdef CONFIG_RCU_BOOST
553 struct rt_mutex *rbmp = NULL;
554#endif /* #ifdef CONFIG_RCU_BOOST */
535 int special; 555 int special;
536 556
537 /* 557 /*
@@ -552,7 +572,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
552 rcu_preempt_cpu_qs(); 572 rcu_preempt_cpu_qs();
553 573
554 /* Hardware IRQ handlers cannot block. */ 574 /* Hardware IRQ handlers cannot block. */
555 if (in_irq()) { 575 if (in_irq() || in_serving_softirq()) {
556 local_irq_restore(flags); 576 local_irq_restore(flags);
557 return; 577 return;
558 } 578 }
@@ -597,10 +617,10 @@ static void rcu_read_unlock_special(struct task_struct *t)
597 } 617 }
598#ifdef CONFIG_RCU_BOOST 618#ifdef CONFIG_RCU_BOOST
599 /* Unboost self if was boosted. */ 619 /* Unboost self if was boosted. */
600 if (special & RCU_READ_UNLOCK_BOOSTED) { 620 if (t->rcu_boost_mutex != NULL) {
601 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED; 621 rbmp = t->rcu_boost_mutex;
602 rt_mutex_unlock(t->rcu_boost_mutex);
603 t->rcu_boost_mutex = NULL; 622 t->rcu_boost_mutex = NULL;
623 rt_mutex_unlock(rbmp);
604 } 624 }
605#endif /* #ifdef CONFIG_RCU_BOOST */ 625#endif /* #ifdef CONFIG_RCU_BOOST */
606 local_irq_restore(flags); 626 local_irq_restore(flags);
@@ -618,13 +638,22 @@ void __rcu_read_unlock(void)
618 struct task_struct *t = current; 638 struct task_struct *t = current;
619 639
620 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */ 640 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */
621 --t->rcu_read_lock_nesting; 641 if (t->rcu_read_lock_nesting != 1)
622 barrier(); /* decrement before load of ->rcu_read_unlock_special */ 642 --t->rcu_read_lock_nesting;
623 if (t->rcu_read_lock_nesting == 0 && 643 else {
624 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 644 t->rcu_read_lock_nesting = INT_MIN;
625 rcu_read_unlock_special(t); 645 barrier(); /* assign before ->rcu_read_unlock_special load */
646 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
647 rcu_read_unlock_special(t);
648 barrier(); /* ->rcu_read_unlock_special load before assign */
649 t->rcu_read_lock_nesting = 0;
650 }
626#ifdef CONFIG_PROVE_LOCKING 651#ifdef CONFIG_PROVE_LOCKING
627 WARN_ON_ONCE(t->rcu_read_lock_nesting < 0); 652 {
653 int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
654
655 WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
656 }
628#endif /* #ifdef CONFIG_PROVE_LOCKING */ 657#endif /* #ifdef CONFIG_PROVE_LOCKING */
629} 658}
630EXPORT_SYMBOL_GPL(__rcu_read_unlock); 659EXPORT_SYMBOL_GPL(__rcu_read_unlock);
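The new __rcu_read_unlock() keeps the nesting count parked at INT_MIN while the outermost unlock handles ->rcu_read_unlock_special, so any reader entered from that path sees a negative count and stays out of the special handling. A minimal user-space sketch of just that counter protocol (illustrative only; the kernel's compiler and memory barriers are omitted):

#include <limits.h>
#include <stdio.h>

static int nesting;		/* models t->rcu_read_lock_nesting */
static int unlock_special;	/* models t->rcu_read_unlock_special */

static void model_read_lock(void)
{
	nesting++;
}

static void handle_unlock_special(void)
{
	printf("special handling runs with nesting=%d\n", nesting);
	unlock_special = 0;
}

static void model_read_unlock(void)
{
	if (nesting != 1) {
		--nesting;		/* not the outermost unlock */
	} else {
		nesting = INT_MIN;	/* park: outermost unlock in progress */
		if (unlock_special)
			handle_unlock_special();
		nesting = 0;		/* now fully outside the section */
	}
}

int main(void)
{
	model_read_lock();
	model_read_lock();
	unlock_special = 1;		/* pretend we were preempted/boosted */
	model_read_unlock();		/* inner unlock: plain decrement */
	model_read_unlock();		/* outer unlock: runs the special path */
	printf("final nesting=%d special=%d\n", nesting, unlock_special);
	return 0;
}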
@@ -649,7 +678,7 @@ static void rcu_preempt_check_callbacks(void)
649 invoke_rcu_callbacks(); 678 invoke_rcu_callbacks();
650 if (rcu_preempt_gp_in_progress() && 679 if (rcu_preempt_gp_in_progress() &&
651 rcu_cpu_blocking_cur_gp() && 680 rcu_cpu_blocking_cur_gp() &&
652 rcu_preempt_running_reader()) 681 rcu_preempt_running_reader() > 0)
653 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; 682 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
654} 683}
655 684
@@ -706,6 +735,11 @@ EXPORT_SYMBOL_GPL(call_rcu);
706 */ 735 */
707void synchronize_rcu(void) 736void synchronize_rcu(void)
708{ 737{
738 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
739 !lock_is_held(&rcu_lock_map) &&
740 !lock_is_held(&rcu_sched_lock_map),
741 "Illegal synchronize_rcu() in RCU read-side critical section");
742
709#ifdef CONFIG_DEBUG_LOCK_ALLOC 743#ifdef CONFIG_DEBUG_LOCK_ALLOC
710 if (!rcu_scheduler_active) 744 if (!rcu_scheduler_active)
711 return; 745 return;
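The new rcu_lockdep_assert() calls flag a self-deadlock pattern: waiting for a grace period from inside the very read-side critical section that the grace period must wait for. A tiny user-space model of the check (illustrative only; lock_is_held() is stood in for by a plain counter):

#include <assert.h>
#include <stdio.h>

static int rcu_read_depth;		/* models rcu_read_lock() nesting */

static void model_synchronize_rcu(void)
{
	/* models: rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), ...) */
	assert(rcu_read_depth == 0 &&
	       "Illegal synchronize_rcu() in RCU read-side critical section");
	puts("grace period elapsed");
}

int main(void)
{
	model_synchronize_rcu();	/* legal: no reader active */
	rcu_read_depth++;		/* enter a modeled read-side section */
	/* model_synchronize_rcu();	   would trip the assertion here */
	rcu_read_depth--;
	return 0;
}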
@@ -882,7 +916,8 @@ static void rcu_preempt_process_callbacks(void)
882static void invoke_rcu_callbacks(void) 916static void invoke_rcu_callbacks(void)
883{ 917{
884 have_rcu_kthread_work = 1; 918 have_rcu_kthread_work = 1;
885 wake_up(&rcu_kthread_wq); 919 if (rcu_kthread_task != NULL)
920 wake_up(&rcu_kthread_wq);
886} 921}
887 922
888#ifdef CONFIG_RCU_TRACE 923#ifdef CONFIG_RCU_TRACE
@@ -943,12 +978,16 @@ early_initcall(rcu_spawn_kthreads);
943 978
944#else /* #ifdef CONFIG_RCU_BOOST */ 979#else /* #ifdef CONFIG_RCU_BOOST */
945 980
981/* Hold off callback invocation until early_initcall() time. */
982static int rcu_scheduler_fully_active __read_mostly;
983
946/* 984/*
947 * Start up softirq processing of callbacks. 985 * Start up softirq processing of callbacks.
948 */ 986 */
949void invoke_rcu_callbacks(void) 987void invoke_rcu_callbacks(void)
950{ 988{
951 raise_softirq(RCU_SOFTIRQ); 989 if (rcu_scheduler_fully_active)
990 raise_softirq(RCU_SOFTIRQ);
952} 991}
953 992
954#ifdef CONFIG_RCU_TRACE 993#ifdef CONFIG_RCU_TRACE
@@ -963,10 +1002,14 @@ static bool rcu_is_callbacks_kthread(void)
963 1002
964#endif /* #ifdef CONFIG_RCU_TRACE */ 1003#endif /* #ifdef CONFIG_RCU_TRACE */
965 1004
966void rcu_init(void) 1005static int __init rcu_scheduler_really_started(void)
967{ 1006{
1007 rcu_scheduler_fully_active = 1;
968 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 1008 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1009 raise_softirq(RCU_SOFTIRQ); /* Invoke any callbacks from early boot. */
1010 return 0;
969} 1011}
1012early_initcall(rcu_scheduler_really_started);
970 1013
971#endif /* #else #ifdef CONFIG_RCU_BOOST */ 1014#endif /* #else #ifdef CONFIG_RCU_BOOST */
972 1015
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index a58ac285fc69..a89b381a8c6e 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -65,7 +65,10 @@ static int fqs_duration; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff; /* Hold time within burst (us). */ 65static int fqs_holdoff; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */ 66static int fqs_stutter = 3; /* Wait time between bursts (s). */
67static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ 67static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */
68static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */
68static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ 69static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */
70static int stall_cpu; /* CPU-stall duration (s). 0 for no stall. */
71static int stall_cpu_holdoff = 10; /* Time to wait until stall (s). */
69static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ 72static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
70static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ 73static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
71static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ 74static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
@@ -95,8 +98,14 @@ module_param(fqs_stutter, int, 0444);
95MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 98MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
96module_param(onoff_interval, int, 0444); 99module_param(onoff_interval, int, 0444);
97MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); 100MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
101module_param(onoff_holdoff, int, 0444);
102MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)");
98module_param(shutdown_secs, int, 0444); 103module_param(shutdown_secs, int, 0444);
99MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable."); 104MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable.");
105module_param(stall_cpu, int, 0444);
106MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable.");
107module_param(stall_cpu_holdoff, int, 0444);
108MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s).");
100module_param(test_boost, int, 0444); 109module_param(test_boost, int, 0444);
101MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); 110MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
102module_param(test_boost_interval, int, 0444); 111module_param(test_boost_interval, int, 0444);
@@ -129,6 +138,7 @@ static struct task_struct *shutdown_task;
129#ifdef CONFIG_HOTPLUG_CPU 138#ifdef CONFIG_HOTPLUG_CPU
130static struct task_struct *onoff_task; 139static struct task_struct *onoff_task;
131#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 140#endif /* #ifdef CONFIG_HOTPLUG_CPU */
141static struct task_struct *stall_task;
132 142
133#define RCU_TORTURE_PIPE_LEN 10 143#define RCU_TORTURE_PIPE_LEN 10
134 144
@@ -990,12 +1000,12 @@ static void rcu_torture_timer(unsigned long unused)
990 rcu_read_lock_bh_held() || 1000 rcu_read_lock_bh_held() ||
991 rcu_read_lock_sched_held() || 1001 rcu_read_lock_sched_held() ||
992 srcu_read_lock_held(&srcu_ctl)); 1002 srcu_read_lock_held(&srcu_ctl));
993 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
994 if (p == NULL) { 1003 if (p == NULL) {
995 /* Leave because rcu_torture_writer is not yet underway */ 1004 /* Leave because rcu_torture_writer is not yet underway */
996 cur_ops->readunlock(idx); 1005 cur_ops->readunlock(idx);
997 return; 1006 return;
998 } 1007 }
1008 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
999 if (p->rtort_mbtest == 0) 1009 if (p->rtort_mbtest == 0)
1000 atomic_inc(&n_rcu_torture_mberror); 1010 atomic_inc(&n_rcu_torture_mberror);
1001 spin_lock(&rand_lock); 1011 spin_lock(&rand_lock);
@@ -1053,13 +1063,13 @@ rcu_torture_reader(void *arg)
1053 rcu_read_lock_bh_held() || 1063 rcu_read_lock_bh_held() ||
1054 rcu_read_lock_sched_held() || 1064 rcu_read_lock_sched_held() ||
1055 srcu_read_lock_held(&srcu_ctl)); 1065 srcu_read_lock_held(&srcu_ctl));
1056 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
1057 if (p == NULL) { 1066 if (p == NULL) {
1058 /* Wait for rcu_torture_writer to get underway */ 1067 /* Wait for rcu_torture_writer to get underway */
1059 cur_ops->readunlock(idx); 1068 cur_ops->readunlock(idx);
1060 schedule_timeout_interruptible(HZ); 1069 schedule_timeout_interruptible(HZ);
1061 continue; 1070 continue;
1062 } 1071 }
1072 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
1063 if (p->rtort_mbtest == 0) 1073 if (p->rtort_mbtest == 0)
1064 atomic_inc(&n_rcu_torture_mberror); 1074 atomic_inc(&n_rcu_torture_mberror);
1065 cur_ops->read_delay(&rand); 1075 cur_ops->read_delay(&rand);
@@ -1300,13 +1310,13 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1300 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " 1310 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1301 "test_boost=%d/%d test_boost_interval=%d " 1311 "test_boost=%d/%d test_boost_interval=%d "
1302 "test_boost_duration=%d shutdown_secs=%d " 1312 "test_boost_duration=%d shutdown_secs=%d "
1303 "onoff_interval=%d\n", 1313 "onoff_interval=%d onoff_holdoff=%d\n",
1304 torture_type, tag, nrealreaders, nfakewriters, 1314 torture_type, tag, nrealreaders, nfakewriters,
1305 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1315 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1306 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, 1316 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1307 test_boost, cur_ops->can_boost, 1317 test_boost, cur_ops->can_boost,
1308 test_boost_interval, test_boost_duration, shutdown_secs, 1318 test_boost_interval, test_boost_duration, shutdown_secs,
1309 onoff_interval); 1319 onoff_interval, onoff_holdoff);
1310} 1320}
1311 1321
1312static struct notifier_block rcutorture_shutdown_nb = { 1322static struct notifier_block rcutorture_shutdown_nb = {
@@ -1410,6 +1420,11 @@ rcu_torture_onoff(void *arg)
1410 for_each_online_cpu(cpu) 1420 for_each_online_cpu(cpu)
1411 maxcpu = cpu; 1421 maxcpu = cpu;
1412 WARN_ON(maxcpu < 0); 1422 WARN_ON(maxcpu < 0);
1423 if (onoff_holdoff > 0) {
1424 VERBOSE_PRINTK_STRING("rcu_torture_onoff begin holdoff");
1425 schedule_timeout_interruptible(onoff_holdoff * HZ);
1426 VERBOSE_PRINTK_STRING("rcu_torture_onoff end holdoff");
1427 }
1413 while (!kthread_should_stop()) { 1428 while (!kthread_should_stop()) {
1414 cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); 1429 cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1);
1415 if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { 1430 if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
@@ -1450,12 +1465,15 @@ rcu_torture_onoff(void *arg)
1450static int __cpuinit 1465static int __cpuinit
1451rcu_torture_onoff_init(void) 1466rcu_torture_onoff_init(void)
1452{ 1467{
1468 int ret;
1469
1453 if (onoff_interval <= 0) 1470 if (onoff_interval <= 0)
1454 return 0; 1471 return 0;
1455 onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff"); 1472 onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff");
1456 if (IS_ERR(onoff_task)) { 1473 if (IS_ERR(onoff_task)) {
1474 ret = PTR_ERR(onoff_task);
1457 onoff_task = NULL; 1475 onoff_task = NULL;
1458 return PTR_ERR(onoff_task); 1476 return ret;
1459 } 1477 }
1460 return 0; 1478 return 0;
1461} 1479}
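The error-path change above fixes a classic ordering bug: PTR_ERR() was evaluated after the pointer had already been overwritten with NULL, so the caller saw 0 (success). A small user-space illustration, with ERR_PTR()/PTR_ERR() re-implemented locally only so the example builds on its own:

#include <errno.h>
#include <stdio.h>

/* Local stand-ins for the kernel's ERR_PTR()/PTR_ERR() helpers. */
static inline void *ERR_PTR(long error)	{ return (void *)error; }
static inline long PTR_ERR(const void *ptr)	{ return (long)ptr; }

static void *task;

static int buggy_init(void)
{
	task = ERR_PTR(-EINTR);		/* pretend kthread_run() failed */
	task = NULL;
	return PTR_ERR(task);		/* returns 0: the error is lost */
}

static int fixed_init(void)
{
	int ret;

	task = ERR_PTR(-EINTR);
	ret = PTR_ERR(task);		/* capture the error first */
	task = NULL;
	return ret;
}

int main(void)
{
	printf("buggy: %d, fixed: %d\n", buggy_init(), fixed_init());
	return 0;
}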
@@ -1481,6 +1499,63 @@ static void rcu_torture_onoff_cleanup(void)
1481 1499
1482#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ 1500#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
1483 1501
1502/*
1503 * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then
1504 * induces a CPU stall for the time specified by stall_cpu.
1505 */
1506static int __cpuinit rcu_torture_stall(void *args)
1507{
1508 unsigned long stop_at;
1509
1510 VERBOSE_PRINTK_STRING("rcu_torture_stall task started");
1511 if (stall_cpu_holdoff > 0) {
1512 VERBOSE_PRINTK_STRING("rcu_torture_stall begin holdoff");
1513 schedule_timeout_interruptible(stall_cpu_holdoff * HZ);
1514 VERBOSE_PRINTK_STRING("rcu_torture_stall end holdoff");
1515 }
1516 if (!kthread_should_stop()) {
1517 stop_at = get_seconds() + stall_cpu;
 1518 /* RCU CPU stall is expected behavior in the following code. */
1519 printk(KERN_ALERT "rcu_torture_stall start.\n");
1520 rcu_read_lock();
1521 preempt_disable();
1522 while (ULONG_CMP_LT(get_seconds(), stop_at))
1523 continue; /* Induce RCU CPU stall warning. */
1524 preempt_enable();
1525 rcu_read_unlock();
1526 printk(KERN_ALERT "rcu_torture_stall end.\n");
1527 }
1528 rcutorture_shutdown_absorb("rcu_torture_stall");
1529 while (!kthread_should_stop())
1530 schedule_timeout_interruptible(10 * HZ);
1531 return 0;
1532}
1533
1534/* Spawn CPU-stall kthread, if stall_cpu specified. */
1535static int __init rcu_torture_stall_init(void)
1536{
1537 int ret;
1538
1539 if (stall_cpu <= 0)
1540 return 0;
1541 stall_task = kthread_run(rcu_torture_stall, NULL, "rcu_torture_stall");
1542 if (IS_ERR(stall_task)) {
1543 ret = PTR_ERR(stall_task);
1544 stall_task = NULL;
1545 return ret;
1546 }
1547 return 0;
1548}
1549
1550/* Clean up after the CPU-stall kthread, if one was spawned. */
1551static void rcu_torture_stall_cleanup(void)
1552{
1553 if (stall_task == NULL)
1554 return;
1555 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task.");
1556 kthread_stop(stall_task);
1557}
1558
1484static int rcutorture_cpu_notify(struct notifier_block *self, 1559static int rcutorture_cpu_notify(struct notifier_block *self,
1485 unsigned long action, void *hcpu) 1560 unsigned long action, void *hcpu)
1486{ 1561{
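With this change a stall can be requested by loading rcutorture with the new stall_cpu (and optionally stall_cpu_holdoff) module parameters defined above. A user-space sketch of the core stall loop, with time() standing in for get_seconds() and no preemption control (illustrative only):

#include <stdio.h>
#include <time.h>

int main(void)
{
	int stall_secs = 2;			/* models the stall_cpu parameter */
	time_t stop_at = time(NULL) + stall_secs;

	printf("stall start\n");
	while (time(NULL) < stop_at)
		;				/* busy-wait induces the "stall" */
	printf("stall end\n");
	return 0;
}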
@@ -1523,6 +1598,7 @@ rcu_torture_cleanup(void)
1523 fullstop = FULLSTOP_RMMOD; 1598 fullstop = FULLSTOP_RMMOD;
1524 mutex_unlock(&fullstop_mutex); 1599 mutex_unlock(&fullstop_mutex);
1525 unregister_reboot_notifier(&rcutorture_shutdown_nb); 1600 unregister_reboot_notifier(&rcutorture_shutdown_nb);
1601 rcu_torture_stall_cleanup();
1526 if (stutter_task) { 1602 if (stutter_task) {
1527 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); 1603 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
1528 kthread_stop(stutter_task); 1604 kthread_stop(stutter_task);
@@ -1602,6 +1678,10 @@ rcu_torture_cleanup(void)
1602 cur_ops->cleanup(); 1678 cur_ops->cleanup();
1603 if (atomic_read(&n_rcu_torture_error)) 1679 if (atomic_read(&n_rcu_torture_error))
1604 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); 1680 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1681 else if (n_online_successes != n_online_attempts ||
1682 n_offline_successes != n_offline_attempts)
1683 rcu_torture_print_module_parms(cur_ops,
1684 "End of test: RCU_HOTPLUG");
1605 else 1685 else
1606 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); 1686 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
1607} 1687}
@@ -1819,6 +1899,7 @@ rcu_torture_init(void)
1819 } 1899 }
1820 rcu_torture_onoff_init(); 1900 rcu_torture_onoff_init();
1821 register_reboot_notifier(&rcutorture_shutdown_nb); 1901 register_reboot_notifier(&rcutorture_shutdown_nb);
1902 rcu_torture_stall_init();
1822 rcutorture_record_test_transition(); 1903 rcutorture_record_test_transition();
1823 mutex_unlock(&fullstop_mutex); 1904 mutex_unlock(&fullstop_mutex);
1824 return 0; 1905 return 0;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 6c4a6722abfd..1050d6d3922c 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -50,6 +50,8 @@
50#include <linux/wait.h> 50#include <linux/wait.h>
51#include <linux/kthread.h> 51#include <linux/kthread.h>
52#include <linux/prefetch.h> 52#include <linux/prefetch.h>
53#include <linux/delay.h>
54#include <linux/stop_machine.h>
53 55
54#include "rcutree.h" 56#include "rcutree.h"
55#include <trace/events/rcu.h> 57#include <trace/events/rcu.h>
@@ -196,7 +198,7 @@ void rcu_note_context_switch(int cpu)
196EXPORT_SYMBOL_GPL(rcu_note_context_switch); 198EXPORT_SYMBOL_GPL(rcu_note_context_switch);
197 199
198DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 200DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
199 .dynticks_nesting = DYNTICK_TASK_NESTING, 201 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
200 .dynticks = ATOMIC_INIT(1), 202 .dynticks = ATOMIC_INIT(1),
201}; 203};
202 204
@@ -208,8 +210,11 @@ module_param(blimit, int, 0);
208module_param(qhimark, int, 0); 210module_param(qhimark, int, 0);
209module_param(qlowmark, int, 0); 211module_param(qlowmark, int, 0);
210 212
211int rcu_cpu_stall_suppress __read_mostly; 213int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
214int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
215
212module_param(rcu_cpu_stall_suppress, int, 0644); 216module_param(rcu_cpu_stall_suppress, int, 0644);
217module_param(rcu_cpu_stall_timeout, int, 0644);
213 218
214static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 219static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
215static int rcu_pending(int cpu); 220static int rcu_pending(int cpu);
@@ -301,8 +306,6 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
301 return &rsp->node[0]; 306 return &rsp->node[0];
302} 307}
303 308
304#ifdef CONFIG_SMP
305
306/* 309/*
307 * If the specified CPU is offline, tell the caller that it is in 310 * If the specified CPU is offline, tell the caller that it is in
308 * a quiescent state. Otherwise, whack it with a reschedule IPI. 311 * a quiescent state. Otherwise, whack it with a reschedule IPI.
@@ -317,30 +320,21 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
317static int rcu_implicit_offline_qs(struct rcu_data *rdp) 320static int rcu_implicit_offline_qs(struct rcu_data *rdp)
318{ 321{
319 /* 322 /*
320 * If the CPU is offline, it is in a quiescent state. We can 323 * If the CPU is offline for more than a jiffy, it is in a quiescent
321 * trust its state not to change because interrupts are disabled. 324 * state. We can trust its state not to change because interrupts
325 * are disabled. The reason for the jiffy's worth of slack is to
326 * handle CPUs initializing on the way up and finding their way
327 * to the idle loop on the way down.
322 */ 328 */
323 if (cpu_is_offline(rdp->cpu)) { 329 if (cpu_is_offline(rdp->cpu) &&
330 ULONG_CMP_LT(rdp->rsp->gp_start + 2, jiffies)) {
324 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); 331 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl");
325 rdp->offline_fqs++; 332 rdp->offline_fqs++;
326 return 1; 333 return 1;
327 } 334 }
328
329 /*
330 * The CPU is online, so send it a reschedule IPI. This forces
331 * it through the scheduler, and (inefficiently) also handles cases
332 * where idle loops fail to inform RCU about the CPU being idle.
333 */
334 if (rdp->cpu != smp_processor_id())
335 smp_send_reschedule(rdp->cpu);
336 else
337 set_need_resched();
338 rdp->resched_ipi++;
339 return 0; 335 return 0;
340} 336}
341 337
342#endif /* #ifdef CONFIG_SMP */
343
344/* 338/*
345 * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle 339 * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle
346 * 340 *
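The offline check above relies on a wraparound-safe jiffies comparison: ULONG_CMP_LT() looks at the unsigned difference, so the jiffy's worth of slack keeps working when the counter wraps. A small user-space restatement (the macro is re-declared here for the example and is intended to match the kernel's definition):

#include <limits.h>
#include <stdio.h>

/* Local restatement of the kernel's wraparound-safe "a < b" for the example. */
#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))

int main(void)
{
	unsigned long gp_start = ULONG_MAX - 1;	/* GP started just before wrap */
	unsigned long jiffies = 1;		/* "now" is just after the wrap */

	/* The offline check fires only once gp_start + 2 is in the past. */
	printf("gp_start+2 already passed? %d\n",
	       ULONG_CMP_LT(gp_start + 2, jiffies));
	printf("gp_start+4 already passed? %d\n",
	       ULONG_CMP_LT(gp_start + 4, jiffies));
	return 0;
}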
@@ -366,6 +360,17 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
366 atomic_inc(&rdtp->dynticks); 360 atomic_inc(&rdtp->dynticks);
367 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ 361 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
368 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 362 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
363
364 /*
365 * The idle task is not permitted to enter the idle loop while
366 * in an RCU read-side critical section.
367 */
368 rcu_lockdep_assert(!lock_is_held(&rcu_lock_map),
369 "Illegal idle entry in RCU read-side critical section.");
370 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map),
371 "Illegal idle entry in RCU-bh read-side critical section.");
372 rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map),
373 "Illegal idle entry in RCU-sched read-side critical section.");
369} 374}
370 375
371/** 376/**
@@ -389,10 +394,15 @@ void rcu_idle_enter(void)
389 local_irq_save(flags); 394 local_irq_save(flags);
390 rdtp = &__get_cpu_var(rcu_dynticks); 395 rdtp = &__get_cpu_var(rcu_dynticks);
391 oldval = rdtp->dynticks_nesting; 396 oldval = rdtp->dynticks_nesting;
392 rdtp->dynticks_nesting = 0; 397 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
398 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE)
399 rdtp->dynticks_nesting = 0;
400 else
401 rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
393 rcu_idle_enter_common(rdtp, oldval); 402 rcu_idle_enter_common(rdtp, oldval);
394 local_irq_restore(flags); 403 local_irq_restore(flags);
395} 404}
405EXPORT_SYMBOL_GPL(rcu_idle_enter);
396 406
397/** 407/**
398 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle 408 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
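rcu_idle_enter()/rcu_idle_exit() now keep a nesting count in the upper bits of ->dynticks_nesting, so nested idle exits (for example, usermode upcalls) balance correctly. A user-space model of just the enter/exit arithmetic; the NEST_VALUE/EXIT_IDLE constants below are illustrative stand-ins, not the kernel's DYNTICK_TASK_* values:

#include <stdio.h>

#define NEST_VALUE	0x10000LL		/* one task-level nesting unit */
#define NEST_MASK	(~(NEST_VALUE - 1))	/* selects the nest-count bits */
#define EXIT_IDLE	(NEST_VALUE + 1)	/* one level plus a "not idle" flag */

static long long nesting = EXIT_IDLE;		/* models ->dynticks_nesting */

static void model_idle_enter(void)
{
	if ((nesting & NEST_MASK) == NEST_VALUE)
		nesting = 0;			/* last level: really go idle */
	else
		nesting -= NEST_VALUE;		/* just pop one nesting level */
}

static void model_idle_exit(void)
{
	if (nesting & NEST_MASK)
		nesting += NEST_VALUE;		/* already non-idle: push a level */
	else
		nesting = EXIT_IDLE;		/* leaving idle for real */
}

int main(void)
{
	model_idle_exit();			/* nested exit, e.g. usermode upcall */
	printf("after nested exit:  %#llx\n", (unsigned long long)nesting);
	model_idle_enter();			/* pops back to a single level */
	printf("after nested enter: %#llx\n", (unsigned long long)nesting);
	model_idle_enter();			/* outermost enter: truly idle now */
	printf("idle:               %#llx\n", (unsigned long long)nesting);
	return 0;
}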
@@ -462,7 +472,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
462 * Exit idle mode, in other words, -enter- the mode in which RCU 472 * Exit idle mode, in other words, -enter- the mode in which RCU
463 * read-side critical sections can occur. 473 * read-side critical sections can occur.
464 * 474 *
465 * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NESTING to 475 * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to
466 * allow for the possibility of usermode upcalls messing up our count 476 * allow for the possibility of usermode upcalls messing up our count
467 * of interrupt nesting level during the busy period that is just 477 * of interrupt nesting level during the busy period that is just
468 * now starting. 478 * now starting.
@@ -476,11 +486,15 @@ void rcu_idle_exit(void)
476 local_irq_save(flags); 486 local_irq_save(flags);
477 rdtp = &__get_cpu_var(rcu_dynticks); 487 rdtp = &__get_cpu_var(rcu_dynticks);
478 oldval = rdtp->dynticks_nesting; 488 oldval = rdtp->dynticks_nesting;
479 WARN_ON_ONCE(oldval != 0); 489 WARN_ON_ONCE(oldval < 0);
480 rdtp->dynticks_nesting = DYNTICK_TASK_NESTING; 490 if (oldval & DYNTICK_TASK_NEST_MASK)
491 rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
492 else
493 rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
481 rcu_idle_exit_common(rdtp, oldval); 494 rcu_idle_exit_common(rdtp, oldval);
482 local_irq_restore(flags); 495 local_irq_restore(flags);
483} 496}
497EXPORT_SYMBOL_GPL(rcu_idle_exit);
484 498
485/** 499/**
486 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle 500 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
@@ -581,6 +595,49 @@ int rcu_is_cpu_idle(void)
581} 595}
582EXPORT_SYMBOL(rcu_is_cpu_idle); 596EXPORT_SYMBOL(rcu_is_cpu_idle);
583 597
598#ifdef CONFIG_HOTPLUG_CPU
599
600/*
601 * Is the current CPU online? Disable preemption to avoid false positives
602 * that could otherwise happen due to the current CPU number being sampled,
603 * this task being preempted, its old CPU being taken offline, resuming
604 * on some other CPU, then determining that its old CPU is now offline.
605 * It is OK to use RCU on an offline processor during initial boot, hence
606 * the check for rcu_scheduler_fully_active. Note also that it is OK
607 * for a CPU coming online to use RCU for one jiffy prior to marking itself
608 * online in the cpu_online_mask. Similarly, it is OK for a CPU going
609 * offline to continue to use RCU for one jiffy after marking itself
610 * offline in the cpu_online_mask. This leniency is necessary given the
611 * non-atomic nature of the online and offline processing, for example,
612 * the fact that a CPU enters the scheduler after completing the CPU_DYING
613 * notifiers.
614 *
615 * This is also why RCU internally marks CPUs online during the
616 * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase.
617 *
618 * Disable checking if in an NMI handler because we cannot safely report
619 * errors from NMI handlers anyway.
620 */
621bool rcu_lockdep_current_cpu_online(void)
622{
623 struct rcu_data *rdp;
624 struct rcu_node *rnp;
625 bool ret;
626
627 if (in_nmi())
628 return 1;
629 preempt_disable();
630 rdp = &__get_cpu_var(rcu_sched_data);
631 rnp = rdp->mynode;
632 ret = (rdp->grpmask & rnp->qsmaskinit) ||
633 !rcu_scheduler_fully_active;
634 preempt_enable();
635 return ret;
636}
637EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
638
639#endif /* #ifdef CONFIG_HOTPLUG_CPU */
640
584#endif /* #ifdef CONFIG_PROVE_RCU */ 641#endif /* #ifdef CONFIG_PROVE_RCU */
585 642
586/** 643/**
@@ -595,8 +652,6 @@ int rcu_is_cpu_rrupt_from_idle(void)
595 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; 652 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
596} 653}
597 654
598#ifdef CONFIG_SMP
599
600/* 655/*
601 * Snapshot the specified CPU's dynticks counter so that we can later 656 * Snapshot the specified CPU's dynticks counter so that we can later
602 * credit them with an implicit quiescent state. Return 1 if this CPU 657 * credit them with an implicit quiescent state. Return 1 if this CPU
@@ -640,12 +695,28 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
640 return rcu_implicit_offline_qs(rdp); 695 return rcu_implicit_offline_qs(rdp);
641} 696}
642 697
643#endif /* #ifdef CONFIG_SMP */ 698static int jiffies_till_stall_check(void)
699{
700 int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
701
702 /*
703 * Limit check must be consistent with the Kconfig limits
704 * for CONFIG_RCU_CPU_STALL_TIMEOUT.
705 */
706 if (till_stall_check < 3) {
707 ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
708 till_stall_check = 3;
709 } else if (till_stall_check > 300) {
710 ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
711 till_stall_check = 300;
712 }
713 return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
714}
644 715
645static void record_gp_stall_check_time(struct rcu_state *rsp) 716static void record_gp_stall_check_time(struct rcu_state *rsp)
646{ 717{
647 rsp->gp_start = jiffies; 718 rsp->gp_start = jiffies;
648 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; 719 rsp->jiffies_stall = jiffies + jiffies_till_stall_check();
649} 720}
650 721
651static void print_other_cpu_stall(struct rcu_state *rsp) 722static void print_other_cpu_stall(struct rcu_state *rsp)
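jiffies_till_stall_check() clamps the new rcu_cpu_stall_timeout module parameter (writable at runtime, mode 0644 above) to the 3..300 second range before converting to jiffies; the kernel version also writes the clamped value back to the parameter. A user-space restatement with illustrative HZ and delta values:

#include <stdio.h>

#define HZ			1000	/* illustrative tick rate, not the kernel's */
#define RCU_STALL_DELAY_DELTA	0	/* illustrative; depends on kernel config */

static int rcu_cpu_stall_timeout = 60;	/* models the module parameter */

static int jiffies_till_stall_check(void)
{
	int till_stall_check = rcu_cpu_stall_timeout;

	/* Clamp to the same 3..300 second range as the patch. */
	if (till_stall_check < 3)
		till_stall_check = 3;
	else if (till_stall_check > 300)
		till_stall_check = 300;
	return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
}

int main(void)
{
	int vals[] = { -1, 0, 21, 60, 1000 };
	int i;

	for (i = 0; i < 5; i++) {
		rcu_cpu_stall_timeout = vals[i];
		printf("timeout=%4d s -> stall check after %d jiffies\n",
		       vals[i], jiffies_till_stall_check());
	}
	return 0;
}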
@@ -664,13 +735,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
664 raw_spin_unlock_irqrestore(&rnp->lock, flags); 735 raw_spin_unlock_irqrestore(&rnp->lock, flags);
665 return; 736 return;
666 } 737 }
667 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 738 rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3;
668
669 /*
670 * Now rat on any tasks that got kicked up to the root rcu_node
671 * due to CPU offlining.
672 */
673 ndetected = rcu_print_task_stall(rnp);
674 raw_spin_unlock_irqrestore(&rnp->lock, flags); 739 raw_spin_unlock_irqrestore(&rnp->lock, flags);
675 740
676 /* 741 /*
@@ -678,8 +743,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
678 * See Documentation/RCU/stallwarn.txt for info on how to debug 743 * See Documentation/RCU/stallwarn.txt for info on how to debug
679 * RCU CPU stall warnings. 744 * RCU CPU stall warnings.
680 */ 745 */
681 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", 746 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks:",
682 rsp->name); 747 rsp->name);
748 print_cpu_stall_info_begin();
683 rcu_for_each_leaf_node(rsp, rnp) { 749 rcu_for_each_leaf_node(rsp, rnp) {
684 raw_spin_lock_irqsave(&rnp->lock, flags); 750 raw_spin_lock_irqsave(&rnp->lock, flags);
685 ndetected += rcu_print_task_stall(rnp); 751 ndetected += rcu_print_task_stall(rnp);
@@ -688,11 +754,22 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
688 continue; 754 continue;
689 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) 755 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
690 if (rnp->qsmask & (1UL << cpu)) { 756 if (rnp->qsmask & (1UL << cpu)) {
691 printk(" %d", rnp->grplo + cpu); 757 print_cpu_stall_info(rsp, rnp->grplo + cpu);
692 ndetected++; 758 ndetected++;
693 } 759 }
694 } 760 }
695 printk("} (detected by %d, t=%ld jiffies)\n", 761
762 /*
763 * Now rat on any tasks that got kicked up to the root rcu_node
764 * due to CPU offlining.
765 */
766 rnp = rcu_get_root(rsp);
767 raw_spin_lock_irqsave(&rnp->lock, flags);
768 ndetected = rcu_print_task_stall(rnp);
769 raw_spin_unlock_irqrestore(&rnp->lock, flags);
770
771 print_cpu_stall_info_end();
772 printk(KERN_CONT "(detected by %d, t=%ld jiffies)\n",
696 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 773 smp_processor_id(), (long)(jiffies - rsp->gp_start));
697 if (ndetected == 0) 774 if (ndetected == 0)
698 printk(KERN_ERR "INFO: Stall ended before state dump start\n"); 775 printk(KERN_ERR "INFO: Stall ended before state dump start\n");
@@ -716,15 +793,18 @@ static void print_cpu_stall(struct rcu_state *rsp)
716 * See Documentation/RCU/stallwarn.txt for info on how to debug 793 * See Documentation/RCU/stallwarn.txt for info on how to debug
717 * RCU CPU stall warnings. 794 * RCU CPU stall warnings.
718 */ 795 */
719 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", 796 printk(KERN_ERR "INFO: %s self-detected stall on CPU", rsp->name);
720 rsp->name, smp_processor_id(), jiffies - rsp->gp_start); 797 print_cpu_stall_info_begin();
798 print_cpu_stall_info(rsp, smp_processor_id());
799 print_cpu_stall_info_end();
800 printk(KERN_CONT " (t=%lu jiffies)\n", jiffies - rsp->gp_start);
721 if (!trigger_all_cpu_backtrace()) 801 if (!trigger_all_cpu_backtrace())
722 dump_stack(); 802 dump_stack();
723 803
724 raw_spin_lock_irqsave(&rnp->lock, flags); 804 raw_spin_lock_irqsave(&rnp->lock, flags);
725 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) 805 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
726 rsp->jiffies_stall = 806 rsp->jiffies_stall = jiffies +
727 jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 807 3 * jiffies_till_stall_check() + 3;
728 raw_spin_unlock_irqrestore(&rnp->lock, flags); 808 raw_spin_unlock_irqrestore(&rnp->lock, flags);
729 809
730 set_need_resched(); /* kick ourselves to get things going. */ 810 set_need_resched(); /* kick ourselves to get things going. */
@@ -807,6 +887,7 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct
807 rdp->passed_quiesce = 0; 887 rdp->passed_quiesce = 0;
808 } else 888 } else
809 rdp->qs_pending = 0; 889 rdp->qs_pending = 0;
890 zero_cpu_stall_ticks(rdp);
810 } 891 }
811} 892}
812 893
@@ -943,6 +1024,10 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
943 * in preparation for detecting the next grace period. The caller must hold 1024 * in preparation for detecting the next grace period. The caller must hold
944 * the root node's ->lock, which is released before return. Hard irqs must 1025 * the root node's ->lock, which is released before return. Hard irqs must
945 * be disabled. 1026 * be disabled.
1027 *
1028 * Note that it is legal for a dying CPU (which is marked as offline) to
1029 * invoke this function. This can happen when the dying CPU reports its
1030 * quiescent state.
946 */ 1031 */
947static void 1032static void
948rcu_start_gp(struct rcu_state *rsp, unsigned long flags) 1033rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
@@ -980,26 +1065,8 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
980 rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */ 1065 rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */
981 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 1066 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
982 record_gp_stall_check_time(rsp); 1067 record_gp_stall_check_time(rsp);
983
984 /* Special-case the common single-level case. */
985 if (NUM_RCU_NODES == 1) {
986 rcu_preempt_check_blocked_tasks(rnp);
987 rnp->qsmask = rnp->qsmaskinit;
988 rnp->gpnum = rsp->gpnum;
989 rnp->completed = rsp->completed;
990 rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */
991 rcu_start_gp_per_cpu(rsp, rnp, rdp);
992 rcu_preempt_boost_start_gp(rnp);
993 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
994 rnp->level, rnp->grplo,
995 rnp->grphi, rnp->qsmask);
996 raw_spin_unlock_irqrestore(&rnp->lock, flags);
997 return;
998 }
999
1000 raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ 1068 raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */
1001 1069
1002
1003 /* Exclude any concurrent CPU-hotplug operations. */ 1070 /* Exclude any concurrent CPU-hotplug operations. */
1004 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ 1071 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
1005 1072
@@ -1245,53 +1312,115 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1245 1312
1246/* 1313/*
1247 * Move a dying CPU's RCU callbacks to online CPU's callback list. 1314 * Move a dying CPU's RCU callbacks to online CPU's callback list.
1248 * Synchronization is not required because this function executes 1315 * Also record a quiescent state for this CPU for the current grace period.
1249 * in stop_machine() context. 1316 * Synchronization and interrupt disabling are not required because
1317 * this function executes in stop_machine() context. Therefore, cleanup
1318 * operations that might block must be done later from the CPU_DEAD
1319 * notifier.
1320 *
1321 * Note that the outgoing CPU's bit has already been cleared in the
1322 * cpu_online_mask. This allows us to randomly pick a callback
1323 * destination from the bits set in that mask.
1250 */ 1324 */
1251static void rcu_send_cbs_to_online(struct rcu_state *rsp) 1325static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1252{ 1326{
1253 int i; 1327 int i;
1254 /* current DYING CPU is cleared in the cpu_online_mask */ 1328 unsigned long mask;
1255 int receive_cpu = cpumask_any(cpu_online_mask); 1329 int receive_cpu = cpumask_any(cpu_online_mask);
1256 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1330 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1257 struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); 1331 struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
1332 RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */
1333
1334 /* First, adjust the counts. */
1335 if (rdp->nxtlist != NULL) {
1336 receive_rdp->qlen_lazy += rdp->qlen_lazy;
1337 receive_rdp->qlen += rdp->qlen;
1338 rdp->qlen_lazy = 0;
1339 rdp->qlen = 0;
1340 }
1258 1341
1259 if (rdp->nxtlist == NULL) 1342 /*
1260 return; /* irqs disabled, so comparison is stable. */ 1343 * Next, move ready-to-invoke callbacks to be invoked on some
1344 * other CPU. These will not be required to pass through another
1345 * grace period: They are done, regardless of CPU.
1346 */
1347 if (rdp->nxtlist != NULL &&
1348 rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) {
1349 struct rcu_head *oldhead;
1350 struct rcu_head **oldtail;
1351 struct rcu_head **newtail;
1352
1353 oldhead = rdp->nxtlist;
1354 oldtail = receive_rdp->nxttail[RCU_DONE_TAIL];
1355 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
1356 *rdp->nxttail[RCU_DONE_TAIL] = *oldtail;
1357 *receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead;
1358 newtail = rdp->nxttail[RCU_DONE_TAIL];
1359 for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) {
1360 if (receive_rdp->nxttail[i] == oldtail)
1361 receive_rdp->nxttail[i] = newtail;
1362 if (rdp->nxttail[i] == newtail)
1363 rdp->nxttail[i] = &rdp->nxtlist;
1364 }
1365 }
1261 1366
1262 *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; 1367 /*
1263 receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 1368 * Finally, put the rest of the callbacks at the end of the list.
1264 receive_rdp->qlen += rdp->qlen; 1369 * The ones that made it partway through get to start over: We
1265 receive_rdp->n_cbs_adopted += rdp->qlen; 1370 * cannot assume that grace periods are synchronized across CPUs.
1266 rdp->n_cbs_orphaned += rdp->qlen; 1371 * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but
1372 * this does not seem compelling. Not yet, anyway.)
1373 */
1374 if (rdp->nxtlist != NULL) {
1375 *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
1376 receive_rdp->nxttail[RCU_NEXT_TAIL] =
1377 rdp->nxttail[RCU_NEXT_TAIL];
1378 receive_rdp->n_cbs_adopted += rdp->qlen;
1379 rdp->n_cbs_orphaned += rdp->qlen;
1380
1381 rdp->nxtlist = NULL;
1382 for (i = 0; i < RCU_NEXT_SIZE; i++)
1383 rdp->nxttail[i] = &rdp->nxtlist;
1384 }
1267 1385
1268 rdp->nxtlist = NULL; 1386 /*
1269 for (i = 0; i < RCU_NEXT_SIZE; i++) 1387 * Record a quiescent state for the dying CPU. This is safe
1270 rdp->nxttail[i] = &rdp->nxtlist; 1388 * only because we have already cleared out the callbacks.
1271 rdp->qlen = 0; 1389 * (Otherwise, the RCU core might try to schedule the invocation
1390 * of callbacks on this now-offline CPU, which would be bad.)
1391 */
1392 mask = rdp->grpmask; /* rnp->grplo is constant. */
1393 trace_rcu_grace_period(rsp->name,
1394 rnp->gpnum + 1 - !!(rnp->qsmask & mask),
1395 "cpuofl");
1396 rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum);
1397 /* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */
1272} 1398}
1273 1399
1274/* 1400/*
1275 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy 1401 * The CPU has been completely removed, and some other CPU is reporting
1276 * and move all callbacks from the outgoing CPU to the current one. 1402 * this fact from process context. Do the remainder of the cleanup.
1277 * There can only be one CPU hotplug operation at a time, so no other 1403 * There can only be one CPU hotplug operation at a time, so no other
1278 * CPU can be attempting to update rcu_cpu_kthread_task. 1404 * CPU can be attempting to update rcu_cpu_kthread_task.
1279 */ 1405 */
1280static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) 1406static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1281{ 1407{
1282 unsigned long flags; 1408 unsigned long flags;
1283 unsigned long mask; 1409 unsigned long mask;
1284 int need_report = 0; 1410 int need_report = 0;
1285 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 1411 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1286 struct rcu_node *rnp; 1412 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rnp. */
1287 1413
1414 /* Adjust any no-longer-needed kthreads. */
1288 rcu_stop_cpu_kthread(cpu); 1415 rcu_stop_cpu_kthread(cpu);
1416 rcu_node_kthread_setaffinity(rnp, -1);
1417
1418 /* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */
1289 1419
1290 /* Exclude any attempts to start a new grace period. */ 1420 /* Exclude any attempts to start a new grace period. */
1291 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1421 raw_spin_lock_irqsave(&rsp->onofflock, flags);
1292 1422
1293 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 1423 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
1294 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */
1295 mask = rdp->grpmask; /* rnp->grplo is constant. */ 1424 mask = rdp->grpmask; /* rnp->grplo is constant. */
1296 do { 1425 do {
1297 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1426 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
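The splice in rcu_cleanup_dying_cpu() works purely through each list's head pointer and tail pointer-to-pointer. A user-space sketch of the simple "adopt everything" case (the patch above additionally moves the already-done segment separately so those callbacks need not wait for another grace period):

#include <stdio.h>

struct cb {
	struct cb *next;
	int id;
};

struct cblist {
	struct cb *head;
	struct cb **tail;	/* always points at the terminating NULL link */
};

static void cblist_init(struct cblist *l)
{
	l->head = NULL;
	l->tail = &l->head;
}

static void cblist_enqueue(struct cblist *l, struct cb *c)
{
	c->next = NULL;
	*l->tail = c;
	l->tail = &c->next;
}

/* Move every callback from src to the end of dst, leaving src empty. */
static void cblist_adopt(struct cblist *dst, struct cblist *src)
{
	if (src->head == NULL)
		return;
	*dst->tail = src->head;
	dst->tail = src->tail;
	cblist_init(src);
}

int main(void)
{
	struct cblist dying, receiver;
	struct cb cbs[4];
	struct cb *c;
	int i;

	cblist_init(&dying);
	cblist_init(&receiver);
	for (i = 0; i < 4; i++) {
		cbs[i].id = i;
		cblist_enqueue(i < 3 ? &dying : &receiver, &cbs[i]);
	}
	cblist_adopt(&receiver, &dying);	/* receiver now ends with 0, 1, 2 */
	for (c = receiver.head; c != NULL; c = c->next)
		printf("callback %d\n", c->id);
	return 0;
}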
@@ -1299,20 +1428,11 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1299 if (rnp->qsmaskinit != 0) { 1428 if (rnp->qsmaskinit != 0) {
1300 if (rnp != rdp->mynode) 1429 if (rnp != rdp->mynode)
1301 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1430 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1302 else
1303 trace_rcu_grace_period(rsp->name,
1304 rnp->gpnum + 1 -
1305 !!(rnp->qsmask & mask),
1306 "cpuofl");
1307 break; 1431 break;
1308 } 1432 }
1309 if (rnp == rdp->mynode) { 1433 if (rnp == rdp->mynode)
1310 trace_rcu_grace_period(rsp->name,
1311 rnp->gpnum + 1 -
1312 !!(rnp->qsmask & mask),
1313 "cpuofl");
1314 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); 1434 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
1315 } else 1435 else
1316 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1436 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1317 mask = rnp->grpmask; 1437 mask = rnp->grpmask;
1318 rnp = rnp->parent; 1438 rnp = rnp->parent;
@@ -1332,29 +1452,15 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1332 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1452 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1333 if (need_report & RCU_OFL_TASKS_EXP_GP) 1453 if (need_report & RCU_OFL_TASKS_EXP_GP)
1334 rcu_report_exp_rnp(rsp, rnp, true); 1454 rcu_report_exp_rnp(rsp, rnp, true);
1335 rcu_node_kthread_setaffinity(rnp, -1);
1336}
1337
1338/*
1339 * Remove the specified CPU from the RCU hierarchy and move any pending
1340 * callbacks that it might have to the current CPU. This code assumes
1341 * that at least one CPU in the system will remain running at all times.
1342 * Any attempt to offline -all- CPUs is likely to strand RCU callbacks.
1343 */
1344static void rcu_offline_cpu(int cpu)
1345{
1346 __rcu_offline_cpu(cpu, &rcu_sched_state);
1347 __rcu_offline_cpu(cpu, &rcu_bh_state);
1348 rcu_preempt_offline_cpu(cpu);
1349} 1455}
1350 1456
1351#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1457#else /* #ifdef CONFIG_HOTPLUG_CPU */
1352 1458
1353static void rcu_send_cbs_to_online(struct rcu_state *rsp) 1459static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1354{ 1460{
1355} 1461}
1356 1462
1357static void rcu_offline_cpu(int cpu) 1463static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1358{ 1464{
1359} 1465}
1360 1466
@@ -1368,11 +1474,11 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1368{ 1474{
1369 unsigned long flags; 1475 unsigned long flags;
1370 struct rcu_head *next, *list, **tail; 1476 struct rcu_head *next, *list, **tail;
1371 int bl, count; 1477 int bl, count, count_lazy;
1372 1478
1373 /* If no callbacks are ready, just return.*/ 1479 /* If no callbacks are ready, just return.*/
1374 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 1480 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
1375 trace_rcu_batch_start(rsp->name, 0, 0); 1481 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);
1376 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), 1482 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
1377 need_resched(), is_idle_task(current), 1483 need_resched(), is_idle_task(current),
1378 rcu_is_callbacks_kthread()); 1484 rcu_is_callbacks_kthread());
@@ -1384,8 +1490,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1384 * races with call_rcu() from interrupt handlers. 1490 * races with call_rcu() from interrupt handlers.
1385 */ 1491 */
1386 local_irq_save(flags); 1492 local_irq_save(flags);
1493 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
1387 bl = rdp->blimit; 1494 bl = rdp->blimit;
1388 trace_rcu_batch_start(rsp->name, rdp->qlen, bl); 1495 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl);
1389 list = rdp->nxtlist; 1496 list = rdp->nxtlist;
1390 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; 1497 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
1391 *rdp->nxttail[RCU_DONE_TAIL] = NULL; 1498 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
@@ -1396,12 +1503,13 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1396 local_irq_restore(flags); 1503 local_irq_restore(flags);
1397 1504
1398 /* Invoke callbacks. */ 1505 /* Invoke callbacks. */
1399 count = 0; 1506 count = count_lazy = 0;
1400 while (list) { 1507 while (list) {
1401 next = list->next; 1508 next = list->next;
1402 prefetch(next); 1509 prefetch(next);
1403 debug_rcu_head_unqueue(list); 1510 debug_rcu_head_unqueue(list);
1404 __rcu_reclaim(rsp->name, list); 1511 if (__rcu_reclaim(rsp->name, list))
1512 count_lazy++;
1405 list = next; 1513 list = next;
1406 /* Stop only if limit reached and CPU has something to do. */ 1514 /* Stop only if limit reached and CPU has something to do. */
1407 if (++count >= bl && 1515 if (++count >= bl &&
@@ -1416,6 +1524,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1416 rcu_is_callbacks_kthread()); 1524 rcu_is_callbacks_kthread());
1417 1525
1418 /* Update count, and requeue any remaining callbacks. */ 1526 /* Update count, and requeue any remaining callbacks. */
1527 rdp->qlen_lazy -= count_lazy;
1419 rdp->qlen -= count; 1528 rdp->qlen -= count;
1420 rdp->n_cbs_invoked += count; 1529 rdp->n_cbs_invoked += count;
1421 if (list != NULL) { 1530 if (list != NULL) {
@@ -1458,6 +1567,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1458void rcu_check_callbacks(int cpu, int user) 1567void rcu_check_callbacks(int cpu, int user)
1459{ 1568{
1460 trace_rcu_utilization("Start scheduler-tick"); 1569 trace_rcu_utilization("Start scheduler-tick");
1570 increment_cpu_stall_ticks();
1461 if (user || rcu_is_cpu_rrupt_from_idle()) { 1571 if (user || rcu_is_cpu_rrupt_from_idle()) {
1462 1572
1463 /* 1573 /*
@@ -1492,8 +1602,6 @@ void rcu_check_callbacks(int cpu, int user)
1492 trace_rcu_utilization("End scheduler-tick"); 1602 trace_rcu_utilization("End scheduler-tick");
1493} 1603}
1494 1604
1495#ifdef CONFIG_SMP
1496
1497/* 1605/*
1498 * Scan the leaf rcu_node structures, processing dyntick state for any that 1606 * Scan the leaf rcu_node structures, processing dyntick state for any that
1499 * have not yet encountered a quiescent state, using the function specified. 1607 * have not yet encountered a quiescent state, using the function specified.
@@ -1616,15 +1724,6 @@ unlock_fqs_ret:
1616 trace_rcu_utilization("End fqs"); 1724 trace_rcu_utilization("End fqs");
1617} 1725}
1618 1726
1619#else /* #ifdef CONFIG_SMP */
1620
1621static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1622{
1623 set_need_resched();
1624}
1625
1626#endif /* #else #ifdef CONFIG_SMP */
1627
1628/* 1727/*
1629 * This does the RCU core processing work for the specified rcu_state 1728 * This does the RCU core processing work for the specified rcu_state
1630 * and rcu_data structures. This may be called only from the CPU to 1729 * and rcu_data structures. This may be called only from the CPU to
@@ -1702,11 +1801,12 @@ static void invoke_rcu_core(void)
1702 1801
1703static void 1802static void
1704__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), 1803__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1705 struct rcu_state *rsp) 1804 struct rcu_state *rsp, bool lazy)
1706{ 1805{
1707 unsigned long flags; 1806 unsigned long flags;
1708 struct rcu_data *rdp; 1807 struct rcu_data *rdp;
1709 1808
1809 WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */
1710 debug_rcu_head_queue(head); 1810 debug_rcu_head_queue(head);
1711 head->func = func; 1811 head->func = func;
1712 head->next = NULL; 1812 head->next = NULL;
@@ -1720,18 +1820,21 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1720 * a quiescent state betweentimes. 1820 * a quiescent state betweentimes.
1721 */ 1821 */
1722 local_irq_save(flags); 1822 local_irq_save(flags);
1823 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
1723 rdp = this_cpu_ptr(rsp->rda); 1824 rdp = this_cpu_ptr(rsp->rda);
1724 1825
1725 /* Add the callback to our list. */ 1826 /* Add the callback to our list. */
1726 *rdp->nxttail[RCU_NEXT_TAIL] = head; 1827 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1727 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 1828 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1728 rdp->qlen++; 1829 rdp->qlen++;
1830 if (lazy)
1831 rdp->qlen_lazy++;
1729 1832
1730 if (__is_kfree_rcu_offset((unsigned long)func)) 1833 if (__is_kfree_rcu_offset((unsigned long)func))
1731 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, 1834 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
1732 rdp->qlen); 1835 rdp->qlen_lazy, rdp->qlen);
1733 else 1836 else
1734 trace_rcu_callback(rsp->name, head, rdp->qlen); 1837 trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen);
1735 1838
1736 /* If interrupts were disabled, don't dive into RCU core. */ 1839 /* If interrupts were disabled, don't dive into RCU core. */
1737 if (irqs_disabled_flags(flags)) { 1840 if (irqs_disabled_flags(flags)) {
@@ -1778,16 +1881,16 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1778 */ 1881 */
1779void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 1882void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1780{ 1883{
1781 __call_rcu(head, func, &rcu_sched_state); 1884 __call_rcu(head, func, &rcu_sched_state, 0);
1782} 1885}
1783EXPORT_SYMBOL_GPL(call_rcu_sched); 1886EXPORT_SYMBOL_GPL(call_rcu_sched);
1784 1887
1785/* 1888/*
1786 * Queue an RCU for invocation after a quicker grace period. 1889 * Queue an RCU callback for invocation after a quicker grace period.
1787 */ 1890 */
1788void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 1891void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1789{ 1892{
1790 __call_rcu(head, func, &rcu_bh_state); 1893 __call_rcu(head, func, &rcu_bh_state, 0);
1791} 1894}
1792EXPORT_SYMBOL_GPL(call_rcu_bh); 1895EXPORT_SYMBOL_GPL(call_rcu_bh);
1793 1896
@@ -1816,6 +1919,10 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
1816 */ 1919 */
1817void synchronize_sched(void) 1920void synchronize_sched(void)
1818{ 1921{
1922 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
1923 !lock_is_held(&rcu_lock_map) &&
1924 !lock_is_held(&rcu_sched_lock_map),
1925 "Illegal synchronize_sched() in RCU-sched read-side critical section");
1819 if (rcu_blocking_is_gp()) 1926 if (rcu_blocking_is_gp())
1820 return; 1927 return;
1821 wait_rcu_gp(call_rcu_sched); 1928 wait_rcu_gp(call_rcu_sched);
@@ -1833,12 +1940,137 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
1833 */ 1940 */
1834void synchronize_rcu_bh(void) 1941void synchronize_rcu_bh(void)
1835{ 1942{
1943 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
1944 !lock_is_held(&rcu_lock_map) &&
1945 !lock_is_held(&rcu_sched_lock_map),
1946 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
1836 if (rcu_blocking_is_gp()) 1947 if (rcu_blocking_is_gp())
1837 return; 1948 return;
1838 wait_rcu_gp(call_rcu_bh); 1949 wait_rcu_gp(call_rcu_bh);
1839} 1950}
1840EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 1951EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
1841 1952
1953static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
1954static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
1955
1956static int synchronize_sched_expedited_cpu_stop(void *data)
1957{
1958 /*
1959 * There must be a full memory barrier on each affected CPU
1960 * between the time that try_stop_cpus() is called and the
1961 * time that it returns.
1962 *
1963 * In the current initial implementation of cpu_stop, the
1964 * above condition is already met when the control reaches
1965 * this point and the following smp_mb() is not strictly
1966 * necessary. Do smp_mb() anyway for documentation and
1967 * robustness against future implementation changes.
1968 */
1969 smp_mb(); /* See above comment block. */
1970 return 0;
1971}
1972
1973/**
1974 * synchronize_sched_expedited - Brute-force RCU-sched grace period
1975 *
1976 * Wait for an RCU-sched grace period to elapse, but use a "big hammer"
1977 * approach to force the grace period to end quickly. This consumes
1978 * significant time on all CPUs and is unfriendly to real-time workloads,
 1979 * so it is not recommended for any sort of common-case code. In fact,
1980 * if you are using synchronize_sched_expedited() in a loop, please
1981 * restructure your code to batch your updates, and then use a single
1982 * synchronize_sched() instead.
1983 *
1984 * Note that it is illegal to call this function while holding any lock
1985 * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
1986 * to call this function from a CPU-hotplug notifier. Failing to observe
 1987 * these restrictions will result in deadlock.
1988 *
1989 * This implementation can be thought of as an application of ticket
1990 * locking to RCU, with sync_sched_expedited_started and
1991 * sync_sched_expedited_done taking on the roles of the halves
1992 * of the ticket-lock word. Each task atomically increments
1993 * sync_sched_expedited_started upon entry, snapshotting the old value,
1994 * then attempts to stop all the CPUs. If this succeeds, then each
1995 * CPU will have executed a context switch, resulting in an RCU-sched
1996 * grace period. We are then done, so we use atomic_cmpxchg() to
1997 * update sync_sched_expedited_done to match our snapshot -- but
1998 * only if someone else has not already advanced past our snapshot.
1999 *
2000 * On the other hand, if try_stop_cpus() fails, we check the value
2001 * of sync_sched_expedited_done. If it has advanced past our
2002 * initial snapshot, then someone else must have forced a grace period
2003 * some time after we took our snapshot. In this case, our work is
2004 * done for us, and we can simply return. Otherwise, we try again,
2005 * but keep our initial snapshot for purposes of checking for someone
2006 * doing our work for us.
2007 *
2008 * If we fail too many times in a row, we fall back to synchronize_sched().
2009 */
2010void synchronize_sched_expedited(void)
2011{
2012 int firstsnap, s, snap, trycount = 0;
2013
2014 /* Note that atomic_inc_return() implies full memory barrier. */
2015 firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
2016 get_online_cpus();
2017 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
2018
2019 /*
2020 * Each pass through the following loop attempts to force a
2021 * context switch on each CPU.
2022 */
2023 while (try_stop_cpus(cpu_online_mask,
2024 synchronize_sched_expedited_cpu_stop,
2025 NULL) == -EAGAIN) {
2026 put_online_cpus();
2027
2028 /* No joy, try again later. Or just synchronize_sched(). */
2029 if (trycount++ < 10)
2030 udelay(trycount * num_online_cpus());
2031 else {
2032 synchronize_sched();
2033 return;
2034 }
2035
2036 /* Check to see if someone else did our work for us. */
2037 s = atomic_read(&sync_sched_expedited_done);
2038 if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
2039 smp_mb(); /* ensure test happens before caller kfree */
2040 return;
2041 }
2042
2043 /*
2044 * Refetching sync_sched_expedited_started allows later
2045 * callers to piggyback on our grace period. We subtract
2046 * 1 to get the same token that the last incrementer got.
2047 * We retry after they started, so our grace period works
2048 * for them, and they started after our first try, so their
2049 * grace period works for us.
2050 */
2051 get_online_cpus();
2052 snap = atomic_read(&sync_sched_expedited_started);
2053 smp_mb(); /* ensure read is before try_stop_cpus(). */
2054 }
2055
2056 /*
2057 * Everyone up to our most recent fetch is covered by our grace
2058 * period. Update the counter, but only if our work is still
2059 * relevant -- which it won't be if someone who started later
2060 * than we did beat us to the punch.
2061 */
2062 do {
2063 s = atomic_read(&sync_sched_expedited_done);
2064 if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
2065 smp_mb(); /* ensure test happens before caller kfree */
2066 break;
2067 }
2068 } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
2069
2070 put_online_cpus();
2071}
2072EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
2073
1842/* 2074/*
1843 * Check to see if there is any immediate RCU-related work to be done 2075 * Check to see if there is any immediate RCU-related work to be done
1844 * by the current CPU, for the specified type of RCU, returning 1 if so. 2076 * by the current CPU, for the specified type of RCU, returning 1 if so.
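The comment block above describes synchronize_sched_expedited() as ticket locking: started and done counters, a snapshot taken on entry, and a compare-and-swap that advances done only if nobody has already passed our snapshot. A user-space model of that bookkeeping, with try_stop_cpus() replaced by a stub that always succeeds and the wraparound-safe comparison simplified to a plain >= (illustrative only):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int started = ATOMIC_VAR_INIT(0);	/* tickets handed out */
static atomic_int done = ATOMIC_VAR_INIT(0);	/* tickets known complete */

/* Stands in for try_stop_cpus(); 0 means every CPU context-switched. */
static int force_context_switches(void)
{
	return 0;
}

static void model_sync_expedited(void)
{
	int firstsnap, snap, s;

	firstsnap = snap = atomic_fetch_add(&started, 1) + 1;	/* take a ticket */
	while (force_context_switches() != 0) {
		/* Did a later caller's grace period already cover us? */
		s = atomic_load(&done);
		if (s >= firstsnap)	/* kernel uses a wraparound-safe compare */
			return;
		/* Retry, refetching started so later callers can piggyback. */
		snap = atomic_load(&started);
	}
	/* Advance done to our snapshot unless someone already got further. */
	do {
		s = atomic_load(&done);
		if (s >= snap)
			return;
	} while (!atomic_compare_exchange_weak(&done, &s, snap));
}

int main(void)
{
	model_sync_expedited();
	model_sync_expedited();
	printf("started=%d done=%d\n",
	       atomic_load(&started), atomic_load(&done));
	return 0;
}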
@@ -1932,7 +2164,7 @@ static int rcu_cpu_has_callbacks(int cpu)
1932 /* RCU callbacks either ready or pending? */ 2164 /* RCU callbacks either ready or pending? */
1933 return per_cpu(rcu_sched_data, cpu).nxtlist || 2165 return per_cpu(rcu_sched_data, cpu).nxtlist ||
1934 per_cpu(rcu_bh_data, cpu).nxtlist || 2166 per_cpu(rcu_bh_data, cpu).nxtlist ||
1935 rcu_preempt_needs_cpu(cpu); 2167 rcu_preempt_cpu_has_callbacks(cpu);
1936} 2168}
1937 2169
1938static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; 2170static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
@@ -2027,9 +2259,10 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
2027 rdp->nxtlist = NULL; 2259 rdp->nxtlist = NULL;
2028 for (i = 0; i < RCU_NEXT_SIZE; i++) 2260 for (i = 0; i < RCU_NEXT_SIZE; i++)
2029 rdp->nxttail[i] = &rdp->nxtlist; 2261 rdp->nxttail[i] = &rdp->nxtlist;
2262 rdp->qlen_lazy = 0;
2030 rdp->qlen = 0; 2263 rdp->qlen = 0;
2031 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 2264 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
2032 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING); 2265 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
2033 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); 2266 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
2034 rdp->cpu = cpu; 2267 rdp->cpu = cpu;
2035 rdp->rsp = rsp; 2268 rdp->rsp = rsp;
@@ -2057,7 +2290,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2057 rdp->qlen_last_fqs_check = 0; 2290 rdp->qlen_last_fqs_check = 0;
2058 rdp->n_force_qs_snap = rsp->n_force_qs; 2291 rdp->n_force_qs_snap = rsp->n_force_qs;
2059 rdp->blimit = blimit; 2292 rdp->blimit = blimit;
2060 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING; 2293 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
2061 atomic_set(&rdp->dynticks->dynticks, 2294 atomic_set(&rdp->dynticks->dynticks,
2062 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); 2295 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
2063 rcu_prepare_for_idle_init(cpu); 2296 rcu_prepare_for_idle_init(cpu);
@@ -2139,16 +2372,18 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2139 * touch any data without introducing corruption. We send the 2372 * touch any data without introducing corruption. We send the
2140 * dying CPU's callbacks to an arbitrarily chosen online CPU. 2373 * dying CPU's callbacks to an arbitrarily chosen online CPU.
2141 */ 2374 */
2142 rcu_send_cbs_to_online(&rcu_bh_state); 2375 rcu_cleanup_dying_cpu(&rcu_bh_state);
2143 rcu_send_cbs_to_online(&rcu_sched_state); 2376 rcu_cleanup_dying_cpu(&rcu_sched_state);
2144 rcu_preempt_send_cbs_to_online(); 2377 rcu_preempt_cleanup_dying_cpu();
2145 rcu_cleanup_after_idle(cpu); 2378 rcu_cleanup_after_idle(cpu);
2146 break; 2379 break;
2147 case CPU_DEAD: 2380 case CPU_DEAD:
2148 case CPU_DEAD_FROZEN: 2381 case CPU_DEAD_FROZEN:
2149 case CPU_UP_CANCELED: 2382 case CPU_UP_CANCELED:
2150 case CPU_UP_CANCELED_FROZEN: 2383 case CPU_UP_CANCELED_FROZEN:
2151 rcu_offline_cpu(cpu); 2384 rcu_cleanup_dead_cpu(cpu, &rcu_bh_state);
2385 rcu_cleanup_dead_cpu(cpu, &rcu_sched_state);
2386 rcu_preempt_cleanup_dead_cpu(cpu);
2152 break; 2387 break;
2153 default: 2388 default:
2154 break; 2389 break;
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index fddff92d6676..cdd1be0a4072 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -239,6 +239,12 @@ struct rcu_data {
239 bool preemptible; /* Preemptible RCU? */ 239 bool preemptible; /* Preemptible RCU? */
240 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ 240 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
241 unsigned long grpmask; /* Mask to apply to leaf qsmask. */ 241 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
242#ifdef CONFIG_RCU_CPU_STALL_INFO
243 unsigned long ticks_this_gp; /* The number of scheduling-clock */
244 /* ticks this CPU has handled */
245 /* during and after the last grace */
246 /* period it is aware of. */
247#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
242 248
243 /* 2) batch handling */ 249 /* 2) batch handling */
244 /* 250 /*
@@ -265,7 +271,8 @@ struct rcu_data {
265 */ 271 */
266 struct rcu_head *nxtlist; 272 struct rcu_head *nxtlist;
267 struct rcu_head **nxttail[RCU_NEXT_SIZE]; 273 struct rcu_head **nxttail[RCU_NEXT_SIZE];
268 long qlen; /* # of queued callbacks */ 274 long qlen_lazy; /* # of lazy queued callbacks */
275 long qlen; /* # of queued callbacks, incl lazy */
269 long qlen_last_fqs_check; 276 long qlen_last_fqs_check;
270 /* qlen at last check for QS forcing */ 277 /* qlen at last check for QS forcing */
271 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ 278 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
@@ -282,7 +289,6 @@ struct rcu_data {
282 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ 289 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
283 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ 290 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
284 unsigned long offline_fqs; /* Kicked due to being offline. */ 291 unsigned long offline_fqs; /* Kicked due to being offline. */
285 unsigned long resched_ipi; /* Sent a resched IPI. */
286 292
287 /* 5) __rcu_pending() statistics. */ 293 /* 5) __rcu_pending() statistics. */
288 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ 294 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
@@ -313,12 +319,6 @@ struct rcu_data {
313#else 319#else
314#define RCU_STALL_DELAY_DELTA 0 320#define RCU_STALL_DELAY_DELTA 0
315#endif 321#endif
316
317#define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \
318 RCU_STALL_DELAY_DELTA)
319 /* for rsp->jiffies_stall */
320#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30)
321 /* for rsp->jiffies_stall */
322#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ 322#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
323 /* to take at least one */ 323 /* to take at least one */
324 /* scheduling clock irq */ 324 /* scheduling clock irq */
@@ -438,8 +438,8 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
438static int rcu_preempt_offline_tasks(struct rcu_state *rsp, 438static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
439 struct rcu_node *rnp, 439 struct rcu_node *rnp,
440 struct rcu_data *rdp); 440 struct rcu_data *rdp);
441static void rcu_preempt_offline_cpu(int cpu);
442#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 441#endif /* #ifdef CONFIG_HOTPLUG_CPU */
442static void rcu_preempt_cleanup_dead_cpu(int cpu);
443static void rcu_preempt_check_callbacks(int cpu); 443static void rcu_preempt_check_callbacks(int cpu);
444static void rcu_preempt_process_callbacks(void); 444static void rcu_preempt_process_callbacks(void);
445void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 445void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
@@ -448,9 +448,9 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
448 bool wake); 448 bool wake);
449#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ 449#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
450static int rcu_preempt_pending(int cpu); 450static int rcu_preempt_pending(int cpu);
451static int rcu_preempt_needs_cpu(int cpu); 451static int rcu_preempt_cpu_has_callbacks(int cpu);
452static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 452static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
453static void rcu_preempt_send_cbs_to_online(void); 453static void rcu_preempt_cleanup_dying_cpu(void);
454static void __init __rcu_init_preempt(void); 454static void __init __rcu_init_preempt(void);
455static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 455static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
456static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 456static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
@@ -471,5 +471,10 @@ static void __cpuinit rcu_prepare_kthreads(int cpu);
471static void rcu_prepare_for_idle_init(int cpu); 471static void rcu_prepare_for_idle_init(int cpu);
472static void rcu_cleanup_after_idle(int cpu); 472static void rcu_cleanup_after_idle(int cpu);
473static void rcu_prepare_for_idle(int cpu); 473static void rcu_prepare_for_idle(int cpu);
474static void print_cpu_stall_info_begin(void);
475static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
476static void print_cpu_stall_info_end(void);
477static void zero_cpu_stall_ticks(struct rcu_data *rdp);
478static void increment_cpu_stall_ticks(void);
474 479
475#endif /* #ifndef RCU_TREE_NONCORE */ 480#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 8bb35d73e1f9..c023464816be 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,7 +25,6 @@
25 */ 25 */
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28#include <linux/stop_machine.h>
29 28
30#define RCU_KTHREAD_PRIO 1 29#define RCU_KTHREAD_PRIO 1
31 30
@@ -63,7 +62,10 @@ static void __init rcu_bootup_announce_oddness(void)
63 printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); 62 printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
64#endif 63#endif
65#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) 64#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
66 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); 65 printk(KERN_INFO "\tDump stacks of tasks blocking RCU-preempt GP.\n");
66#endif
67#if defined(CONFIG_RCU_CPU_STALL_INFO)
68 printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n");
67#endif 69#endif
68#if NUM_RCU_LVL_4 != 0 70#if NUM_RCU_LVL_4 != 0
69 printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n"); 71 printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n");
@@ -490,6 +492,31 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
490 492
491#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ 493#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
492 494
495#ifdef CONFIG_RCU_CPU_STALL_INFO
496
497static void rcu_print_task_stall_begin(struct rcu_node *rnp)
498{
499 printk(KERN_ERR "\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
500 rnp->level, rnp->grplo, rnp->grphi);
501}
502
503static void rcu_print_task_stall_end(void)
504{
505 printk(KERN_CONT "\n");
506}
507
508#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
509
510static void rcu_print_task_stall_begin(struct rcu_node *rnp)
511{
512}
513
514static void rcu_print_task_stall_end(void)
515{
516}
517
518#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
519
493/* 520/*
494 * Scan the current list of tasks blocked within RCU read-side critical 521 * Scan the current list of tasks blocked within RCU read-side critical
495 * sections, printing out the tid of each. 522 * sections, printing out the tid of each.
@@ -501,12 +528,14 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
501 528
502 if (!rcu_preempt_blocked_readers_cgp(rnp)) 529 if (!rcu_preempt_blocked_readers_cgp(rnp))
503 return 0; 530 return 0;
531 rcu_print_task_stall_begin(rnp);
504 t = list_entry(rnp->gp_tasks, 532 t = list_entry(rnp->gp_tasks,
505 struct task_struct, rcu_node_entry); 533 struct task_struct, rcu_node_entry);
506 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { 534 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
507 printk(" P%d", t->pid); 535 printk(KERN_CONT " P%d", t->pid);
508 ndetected++; 536 ndetected++;
509 } 537 }
538 rcu_print_task_stall_end();
510 return ndetected; 539 return ndetected;
511} 540}
512 541
@@ -581,7 +610,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
581 * absolutely necessary, but this is a good performance/complexity 610 * absolutely necessary, but this is a good performance/complexity
582 * tradeoff. 611 * tradeoff.
583 */ 612 */
584 if (rcu_preempt_blocked_readers_cgp(rnp)) 613 if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0)
585 retval |= RCU_OFL_TASKS_NORM_GP; 614 retval |= RCU_OFL_TASKS_NORM_GP;
586 if (rcu_preempted_readers_exp(rnp)) 615 if (rcu_preempted_readers_exp(rnp))
587 retval |= RCU_OFL_TASKS_EXP_GP; 616 retval |= RCU_OFL_TASKS_EXP_GP;
@@ -618,16 +647,16 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
618 return retval; 647 return retval;
619} 648}
620 649
650#endif /* #ifdef CONFIG_HOTPLUG_CPU */
651
621/* 652/*
622 * Do CPU-offline processing for preemptible RCU. 653 * Do CPU-offline processing for preemptible RCU.
623 */ 654 */
624static void rcu_preempt_offline_cpu(int cpu) 655static void rcu_preempt_cleanup_dead_cpu(int cpu)
625{ 656{
626 __rcu_offline_cpu(cpu, &rcu_preempt_state); 657 rcu_cleanup_dead_cpu(cpu, &rcu_preempt_state);
627} 658}
628 659
629#endif /* #ifdef CONFIG_HOTPLUG_CPU */
630
631/* 660/*
632 * Check for a quiescent state from the current CPU. When a task blocks, 661 * Check for a quiescent state from the current CPU. When a task blocks,
633 * the task is recorded in the corresponding CPU's rcu_node structure, 662 * the task is recorded in the corresponding CPU's rcu_node structure,
@@ -671,10 +700,24 @@ static void rcu_preempt_do_callbacks(void)
671 */ 700 */
672void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 701void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
673{ 702{
674 __call_rcu(head, func, &rcu_preempt_state); 703 __call_rcu(head, func, &rcu_preempt_state, 0);
675} 704}
676EXPORT_SYMBOL_GPL(call_rcu); 705EXPORT_SYMBOL_GPL(call_rcu);
677 706
707/*
708 * Queue an RCU callback for lazy invocation after a grace period.
709 * This will likely be later named something like "call_rcu_lazy()",
710 * but this change will require some way of tagging the lazy RCU
711 * callbacks in the list of pending callbacks. Until then, this
712 * function may only be called from __kfree_rcu().
713 */
714void kfree_call_rcu(struct rcu_head *head,
715 void (*func)(struct rcu_head *rcu))
716{
717 __call_rcu(head, func, &rcu_preempt_state, 1);
718}
719EXPORT_SYMBOL_GPL(kfree_call_rcu);
720
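For context, callers do not normally invoke kfree_call_rcu() directly; the usual entry point is the kfree_rcu() wrapper, which goes through __kfree_rcu() and ends up here. A minimal usage sketch, with a hypothetical structure name:

struct foo {				/* hypothetical example structure */
	int data;
	struct rcu_head rcu;
};

static void retire_foo(struct foo *fp)
{
	/* Queues a lazy callback; fp is kfree()d after a grace period. */
	kfree_rcu(fp, rcu);
}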
678/** 721/**
679 * synchronize_rcu - wait until a grace period has elapsed. 722 * synchronize_rcu - wait until a grace period has elapsed.
680 * 723 *
@@ -688,6 +731,10 @@ EXPORT_SYMBOL_GPL(call_rcu);
688 */ 731 */
689void synchronize_rcu(void) 732void synchronize_rcu(void)
690{ 733{
734 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
735 !lock_is_held(&rcu_lock_map) &&
736 !lock_is_held(&rcu_sched_lock_map),
737 "Illegal synchronize_rcu() in RCU read-side critical section");
691 if (!rcu_scheduler_active) 738 if (!rcu_scheduler_active)
692 return; 739 return;
693 wait_rcu_gp(call_rcu); 740 wait_rcu_gp(call_rcu);
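As a usage sketch of the rule the new rcu_lockdep_assert() enforces: the updater below is legal because synchronize_rcu() is called outside any RCU read-side critical section. The names gp, gp_lock and struct foo are hypothetical, and the usual rcupdate/spinlock/slab includes are assumed.

static struct foo __rcu *gp;
static DEFINE_SPINLOCK(gp_lock);

static void retire_gp(void)
{
	struct foo *old;

	spin_lock(&gp_lock);
	old = rcu_dereference_protected(gp, lockdep_is_held(&gp_lock));
	rcu_assign_pointer(gp, NULL);
	spin_unlock(&gp_lock);

	synchronize_rcu();	/* wait for pre-existing readers */
	kfree(old);		/* no reader can still reference 'old' */
}

Calling synchronize_rcu() with rcu_read_lock() held would instead trigger the assertion (and, without lockdep, deadlock).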
@@ -788,10 +835,22 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
788 rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ 835 rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */
789} 836}
790 837
791/* 838/**
792 * Wait for an rcu-preempt grace period, but expedite it. The basic idea 839 * synchronize_rcu_expedited - Brute-force RCU grace period
793 * is to invoke synchronize_sched_expedited() to push all the tasks to 840 *
794 * the ->blkd_tasks lists and wait for this list to drain. 841 * Wait for an RCU-preempt grace period, but expedite it. The basic
842 * idea is to invoke synchronize_sched_expedited() to push all the tasks to
 794 * the ->blkd_tasks lists and wait for this list to drain. 843 * the ->blkd_tasks lists and wait for these lists to drain. This consumes
844 * significant time on all CPUs and is unfriendly to real-time workloads,
 845 * and is thus not recommended for any sort of common-case code.
846 * In fact, if you are using synchronize_rcu_expedited() in a loop,
 847 * please restructure your code to batch your updates, and then use a
848 * single synchronize_rcu() instead.
849 *
850 * Note that it is illegal to call this function while holding any lock
851 * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
852 * to call this function from a CPU-hotplug notifier. Failing to observe
 853 * these restrictions will result in deadlock.
795 */ 854 */
796void synchronize_rcu_expedited(void) 855void synchronize_rcu_expedited(void)
797{ 856{
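The batching advice in the comment above, sketched with hypothetical names (struct item, items, update_one()):

struct item { struct list_head list; /* ... payload ... */ };
static LIST_HEAD(items);
void update_one(struct item *p);	/* hypothetical updater */

static void update_all_slow(void)
{
	struct item *p;

	/* Anti-pattern: one expedited grace period per update. */
	list_for_each_entry(p, &items, list) {
		update_one(p);
		synchronize_rcu_expedited();	/* disturbs every CPU each pass */
	}
}

static void update_all_batched(void)
{
	struct item *p;

	/* Preferred: batch the updates, then wait once. */
	list_for_each_entry(p, &items, list)
		update_one(p);
	synchronize_rcu();			/* one grace period covers them all */
}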
@@ -869,9 +928,9 @@ static int rcu_preempt_pending(int cpu)
869} 928}
870 929
871/* 930/*
872 * Does preemptible RCU need the CPU to stay out of dynticks mode? 931 * Does preemptible RCU have callbacks on this CPU?
873 */ 932 */
874static int rcu_preempt_needs_cpu(int cpu) 933static int rcu_preempt_cpu_has_callbacks(int cpu)
875{ 934{
876 return !!per_cpu(rcu_preempt_data, cpu).nxtlist; 935 return !!per_cpu(rcu_preempt_data, cpu).nxtlist;
877} 936}
@@ -894,11 +953,12 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
894} 953}
895 954
896/* 955/*
897 * Move preemptible RCU's callbacks from dying CPU to other online CPU. 956 * Move preemptible RCU's callbacks from dying CPU to other online CPU
957 * and record a quiescent state.
898 */ 958 */
899static void rcu_preempt_send_cbs_to_online(void) 959static void rcu_preempt_cleanup_dying_cpu(void)
900{ 960{
901 rcu_send_cbs_to_online(&rcu_preempt_state); 961 rcu_cleanup_dying_cpu(&rcu_preempt_state);
902} 962}
903 963
904/* 964/*
@@ -1034,16 +1094,16 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
1034 return 0; 1094 return 0;
1035} 1095}
1036 1096
1097#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1098
1037/* 1099/*
1038 * Because preemptible RCU does not exist, it never needs CPU-offline 1100 * Because preemptible RCU does not exist, it never needs CPU-offline
1039 * processing. 1101 * processing.
1040 */ 1102 */
1041static void rcu_preempt_offline_cpu(int cpu) 1103static void rcu_preempt_cleanup_dead_cpu(int cpu)
1042{ 1104{
1043} 1105}
1044 1106
1045#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1046
1047/* 1107/*
1048 * Because preemptible RCU does not exist, it never has any callbacks 1108 * Because preemptible RCU does not exist, it never has any callbacks
1049 * to check. 1109 * to check.
@@ -1061,6 +1121,22 @@ static void rcu_preempt_process_callbacks(void)
1061} 1121}
1062 1122
1063/* 1123/*
1124 * Queue an RCU callback for lazy invocation after a grace period.
1125 * This will likely be later named something like "call_rcu_lazy()",
1126 * but this change will require some way of tagging the lazy RCU
1127 * callbacks in the list of pending callbacks. Until then, this
1128 * function may only be called from __kfree_rcu().
1129 *
1130 * Because there is no preemptible RCU, we use RCU-sched instead.
1131 */
1132void kfree_call_rcu(struct rcu_head *head,
1133 void (*func)(struct rcu_head *rcu))
1134{
1135 __call_rcu(head, func, &rcu_sched_state, 1);
1136}
1137EXPORT_SYMBOL_GPL(kfree_call_rcu);
1138
1139/*
1064 * Wait for an rcu-preempt grace period, but make it happen quickly. 1140 * Wait for an rcu-preempt grace period, but make it happen quickly.
1065 * But because preemptible RCU does not exist, map to rcu-sched. 1141 * But because preemptible RCU does not exist, map to rcu-sched.
1066 */ 1142 */
@@ -1093,9 +1169,9 @@ static int rcu_preempt_pending(int cpu)
1093} 1169}
1094 1170
1095/* 1171/*
 1096 * Because preemptible RCU does not exist, it never needs any CPU. 1172 * Because preemptible RCU does not exist, it never has callbacks.
1097 */ 1173 */
1098static int rcu_preempt_needs_cpu(int cpu) 1174static int rcu_preempt_cpu_has_callbacks(int cpu)
1099{ 1175{
1100 return 0; 1176 return 0;
1101} 1177}
@@ -1119,9 +1195,9 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
1119} 1195}
1120 1196
1121/* 1197/*
1122 * Because there is no preemptible RCU, there are no callbacks to move. 1198 * Because there is no preemptible RCU, there is no cleanup to do.
1123 */ 1199 */
1124static void rcu_preempt_send_cbs_to_online(void) 1200static void rcu_preempt_cleanup_dying_cpu(void)
1125{ 1201{
1126} 1202}
1127 1203
@@ -1823,132 +1899,6 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
1823 1899
1824#endif /* #else #ifdef CONFIG_RCU_BOOST */ 1900#endif /* #else #ifdef CONFIG_RCU_BOOST */
1825 1901
1826#ifndef CONFIG_SMP
1827
1828void synchronize_sched_expedited(void)
1829{
1830 cond_resched();
1831}
1832EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1833
1834#else /* #ifndef CONFIG_SMP */
1835
1836static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
1837static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
1838
1839static int synchronize_sched_expedited_cpu_stop(void *data)
1840{
1841 /*
1842 * There must be a full memory barrier on each affected CPU
1843 * between the time that try_stop_cpus() is called and the
1844 * time that it returns.
1845 *
1846 * In the current initial implementation of cpu_stop, the
1847 * above condition is already met when the control reaches
1848 * this point and the following smp_mb() is not strictly
1849 * necessary. Do smp_mb() anyway for documentation and
1850 * robustness against future implementation changes.
1851 */
1852 smp_mb(); /* See above comment block. */
1853 return 0;
1854}
1855
1856/*
1857 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
1858 * approach to force grace period to end quickly. This consumes
1859 * significant time on all CPUs, and is thus not recommended for
1860 * any sort of common-case code.
1861 *
1862 * Note that it is illegal to call this function while holding any
1863 * lock that is acquired by a CPU-hotplug notifier. Failing to
1864 * observe this restriction will result in deadlock.
1865 *
1866 * This implementation can be thought of as an application of ticket
1867 * locking to RCU, with sync_sched_expedited_started and
1868 * sync_sched_expedited_done taking on the roles of the halves
1869 * of the ticket-lock word. Each task atomically increments
1870 * sync_sched_expedited_started upon entry, snapshotting the old value,
1871 * then attempts to stop all the CPUs. If this succeeds, then each
1872 * CPU will have executed a context switch, resulting in an RCU-sched
1873 * grace period. We are then done, so we use atomic_cmpxchg() to
1874 * update sync_sched_expedited_done to match our snapshot -- but
1875 * only if someone else has not already advanced past our snapshot.
1876 *
1877 * On the other hand, if try_stop_cpus() fails, we check the value
1878 * of sync_sched_expedited_done. If it has advanced past our
1879 * initial snapshot, then someone else must have forced a grace period
1880 * some time after we took our snapshot. In this case, our work is
1881 * done for us, and we can simply return. Otherwise, we try again,
1882 * but keep our initial snapshot for purposes of checking for someone
1883 * doing our work for us.
1884 *
1885 * If we fail too many times in a row, we fall back to synchronize_sched().
1886 */
1887void synchronize_sched_expedited(void)
1888{
1889 int firstsnap, s, snap, trycount = 0;
1890
1891 /* Note that atomic_inc_return() implies full memory barrier. */
1892 firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
1893 get_online_cpus();
1894
1895 /*
1896 * Each pass through the following loop attempts to force a
1897 * context switch on each CPU.
1898 */
1899 while (try_stop_cpus(cpu_online_mask,
1900 synchronize_sched_expedited_cpu_stop,
1901 NULL) == -EAGAIN) {
1902 put_online_cpus();
1903
1904 /* No joy, try again later. Or just synchronize_sched(). */
1905 if (trycount++ < 10)
1906 udelay(trycount * num_online_cpus());
1907 else {
1908 synchronize_sched();
1909 return;
1910 }
1911
1912 /* Check to see if someone else did our work for us. */
1913 s = atomic_read(&sync_sched_expedited_done);
1914 if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
1915 smp_mb(); /* ensure test happens before caller kfree */
1916 return;
1917 }
1918
1919 /*
1920 * Refetching sync_sched_expedited_started allows later
1921 * callers to piggyback on our grace period. We subtract
1922 * 1 to get the same token that the last incrementer got.
1923 * We retry after they started, so our grace period works
1924 * for them, and they started after our first try, so their
1925 * grace period works for us.
1926 */
1927 get_online_cpus();
1928 snap = atomic_read(&sync_sched_expedited_started);
1929 smp_mb(); /* ensure read is before try_stop_cpus(). */
1930 }
1931
1932 /*
1933 * Everyone up to our most recent fetch is covered by our grace
1934 * period. Update the counter, but only if our work is still
1935 * relevant -- which it won't be if someone who started later
1936 * than we did beat us to the punch.
1937 */
1938 do {
1939 s = atomic_read(&sync_sched_expedited_done);
1940 if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
1941 smp_mb(); /* ensure test happens before caller kfree */
1942 break;
1943 }
1944 } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
1945
1946 put_online_cpus();
1947}
1948EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1949
1950#endif /* #else #ifndef CONFIG_SMP */
1951
1952#if !defined(CONFIG_RCU_FAST_NO_HZ) 1902#if !defined(CONFIG_RCU_FAST_NO_HZ)
1953 1903
1954/* 1904/*
@@ -1981,7 +1931,7 @@ static void rcu_cleanup_after_idle(int cpu)
1981} 1931}
1982 1932
1983/* 1933/*
1984 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=y, 1934 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
1985 * is nothing. 1935 * is nothing.
1986 */ 1936 */
1987static void rcu_prepare_for_idle(int cpu) 1937static void rcu_prepare_for_idle(int cpu)
@@ -2015,6 +1965,9 @@ static void rcu_prepare_for_idle(int cpu)
2015 * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your 1965 * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
2016 * system. And if you are -that- concerned about energy efficiency, 1966 * system. And if you are -that- concerned about energy efficiency,
2017 * just power the system down and be done with it! 1967 * just power the system down and be done with it!
1968 * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is
1969 * permitted to sleep in dyntick-idle mode with only lazy RCU
1970 * callbacks pending. Setting this too high can OOM your system.
2018 * 1971 *
2019 * The values below work well in practice. If future workloads require 1972 * The values below work well in practice. If future workloads require
2020 * adjustment, they can be converted into kernel config parameters, though 1973 * adjustment, they can be converted into kernel config parameters, though
@@ -2023,11 +1976,13 @@ static void rcu_prepare_for_idle(int cpu)
2023#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ 1976#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */
2024#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ 1977#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */
2025#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ 1978#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */
1979#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
2026 1980
2027static DEFINE_PER_CPU(int, rcu_dyntick_drain); 1981static DEFINE_PER_CPU(int, rcu_dyntick_drain);
2028static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); 1982static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
2029static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); 1983static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer);
2030static ktime_t rcu_idle_gp_wait; 1984static ktime_t rcu_idle_gp_wait; /* If some non-lazy callbacks. */
1985static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */
2031 1986
2032/* 1987/*
2033 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no 1988 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
@@ -2048,6 +2003,48 @@ int rcu_needs_cpu(int cpu)
2048} 2003}
2049 2004
2050/* 2005/*
2006 * Does the specified flavor of RCU have non-lazy callbacks pending on
2007 * the specified CPU? Both RCU flavor and CPU are specified by the
2008 * rcu_data structure.
2009 */
2010static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp)
2011{
2012 return rdp->qlen != rdp->qlen_lazy;
2013}
2014
2015#ifdef CONFIG_TREE_PREEMPT_RCU
2016
2017/*
2018 * Are there non-lazy RCU-preempt callbacks? (There cannot be if there
2019 * is no RCU-preempt in the kernel.)
2020 */
2021static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu)
2022{
2023 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
2024
2025 return __rcu_cpu_has_nonlazy_callbacks(rdp);
2026}
2027
2028#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
2029
2030static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu)
2031{
2032 return 0;
2033}
2034
2035#endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */
2036
2037/*
2038 * Does any flavor of RCU have non-lazy callbacks on the specified CPU?
2039 */
2040static bool rcu_cpu_has_nonlazy_callbacks(int cpu)
2041{
2042 return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) ||
2043 __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) ||
2044 rcu_preempt_cpu_has_nonlazy_callbacks(cpu);
2045}
2046
2047/*
2051 * Timer handler used to force CPU to start pushing its remaining RCU 2048 * Timer handler used to force CPU to start pushing its remaining RCU
2052 * callbacks in the case where it entered dyntick-idle mode with callbacks 2049 * callbacks in the case where it entered dyntick-idle mode with callbacks
 2053 * pending. The handler doesn't really need to do anything because the 2050
@@ -2074,6 +2071,8 @@ static void rcu_prepare_for_idle_init(int cpu)
2074 unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY); 2071 unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY);
2075 2072
2076 rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000); 2073 rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000);
2074 upj = jiffies_to_usecs(RCU_IDLE_LAZY_GP_DELAY);
2075 rcu_idle_lazy_gp_wait = ns_to_ktime(upj * (u64)1000);
2077 firsttime = 0; 2076 firsttime = 0;
2078 } 2077 }
2079} 2078}
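To put numbers on the two timeouts: with HZ=1000 (an assumption for illustration), the non-lazy case uses jiffies_to_usecs(RCU_IDLE_GP_DELAY) = jiffies_to_usecs(6) = 6000, so rcu_idle_gp_wait = ns_to_ktime(6000 * 1000) = 6 ms, while the lazy case uses jiffies_to_usecs(6 * HZ) = 6,000,000, so rcu_idle_lazy_gp_wait = 6 s. A CPU with only lazy (kfree_rcu()) callbacks pending can therefore sleep roughly a thousand times longer before being forced to process them.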
@@ -2109,10 +2108,6 @@ static void rcu_cleanup_after_idle(int cpu)
2109 */ 2108 */
2110static void rcu_prepare_for_idle(int cpu) 2109static void rcu_prepare_for_idle(int cpu)
2111{ 2110{
2112 unsigned long flags;
2113
2114 local_irq_save(flags);
2115
2116 /* 2111 /*
2117 * If there are no callbacks on this CPU, enter dyntick-idle mode. 2112 * If there are no callbacks on this CPU, enter dyntick-idle mode.
2118 * Also reset state to avoid prejudicing later attempts. 2113 * Also reset state to avoid prejudicing later attempts.
@@ -2120,7 +2115,6 @@ static void rcu_prepare_for_idle(int cpu)
2120 if (!rcu_cpu_has_callbacks(cpu)) { 2115 if (!rcu_cpu_has_callbacks(cpu)) {
2121 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 2116 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
2122 per_cpu(rcu_dyntick_drain, cpu) = 0; 2117 per_cpu(rcu_dyntick_drain, cpu) = 0;
2123 local_irq_restore(flags);
2124 trace_rcu_prep_idle("No callbacks"); 2118 trace_rcu_prep_idle("No callbacks");
2125 return; 2119 return;
2126 } 2120 }
@@ -2130,7 +2124,6 @@ static void rcu_prepare_for_idle(int cpu)
2130 * refrained from disabling the scheduling-clock tick. 2124 * refrained from disabling the scheduling-clock tick.
2131 */ 2125 */
2132 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { 2126 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) {
2133 local_irq_restore(flags);
2134 trace_rcu_prep_idle("In holdoff"); 2127 trace_rcu_prep_idle("In holdoff");
2135 return; 2128 return;
2136 } 2129 }
@@ -2140,18 +2133,22 @@ static void rcu_prepare_for_idle(int cpu)
2140 /* First time through, initialize the counter. */ 2133 /* First time through, initialize the counter. */
2141 per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; 2134 per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES;
2142 } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && 2135 } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES &&
2143 !rcu_pending(cpu)) { 2136 !rcu_pending(cpu) &&
2137 !local_softirq_pending()) {
2144 /* Can we go dyntick-idle despite still having callbacks? */ 2138 /* Can we go dyntick-idle despite still having callbacks? */
2145 trace_rcu_prep_idle("Dyntick with callbacks"); 2139 trace_rcu_prep_idle("Dyntick with callbacks");
2146 per_cpu(rcu_dyntick_drain, cpu) = 0; 2140 per_cpu(rcu_dyntick_drain, cpu) = 0;
2147 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 2141 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
2148 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), 2142 if (rcu_cpu_has_nonlazy_callbacks(cpu))
2149 rcu_idle_gp_wait, HRTIMER_MODE_REL); 2143 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu),
2144 rcu_idle_gp_wait, HRTIMER_MODE_REL);
2145 else
2146 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu),
2147 rcu_idle_lazy_gp_wait, HRTIMER_MODE_REL);
2150 return; /* Nothing more to do immediately. */ 2148 return; /* Nothing more to do immediately. */
2151 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2149 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
2152 /* We have hit the limit, so time to give up. */ 2150 /* We have hit the limit, so time to give up. */
2153 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; 2151 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
2154 local_irq_restore(flags);
2155 trace_rcu_prep_idle("Begin holdoff"); 2152 trace_rcu_prep_idle("Begin holdoff");
2156 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ 2153 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
2157 return; 2154 return;
@@ -2163,23 +2160,17 @@ static void rcu_prepare_for_idle(int cpu)
2163 */ 2160 */
2164#ifdef CONFIG_TREE_PREEMPT_RCU 2161#ifdef CONFIG_TREE_PREEMPT_RCU
2165 if (per_cpu(rcu_preempt_data, cpu).nxtlist) { 2162 if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
2166 local_irq_restore(flags);
2167 rcu_preempt_qs(cpu); 2163 rcu_preempt_qs(cpu);
2168 force_quiescent_state(&rcu_preempt_state, 0); 2164 force_quiescent_state(&rcu_preempt_state, 0);
2169 local_irq_save(flags);
2170 } 2165 }
2171#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 2166#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
2172 if (per_cpu(rcu_sched_data, cpu).nxtlist) { 2167 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
2173 local_irq_restore(flags);
2174 rcu_sched_qs(cpu); 2168 rcu_sched_qs(cpu);
2175 force_quiescent_state(&rcu_sched_state, 0); 2169 force_quiescent_state(&rcu_sched_state, 0);
2176 local_irq_save(flags);
2177 } 2170 }
2178 if (per_cpu(rcu_bh_data, cpu).nxtlist) { 2171 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
2179 local_irq_restore(flags);
2180 rcu_bh_qs(cpu); 2172 rcu_bh_qs(cpu);
2181 force_quiescent_state(&rcu_bh_state, 0); 2173 force_quiescent_state(&rcu_bh_state, 0);
2182 local_irq_save(flags);
2183 } 2174 }
2184 2175
2185 /* 2176 /*
@@ -2187,13 +2178,124 @@ static void rcu_prepare_for_idle(int cpu)
2187 * So try forcing the callbacks through the grace period. 2178 * So try forcing the callbacks through the grace period.
2188 */ 2179 */
2189 if (rcu_cpu_has_callbacks(cpu)) { 2180 if (rcu_cpu_has_callbacks(cpu)) {
2190 local_irq_restore(flags);
2191 trace_rcu_prep_idle("More callbacks"); 2181 trace_rcu_prep_idle("More callbacks");
2192 invoke_rcu_core(); 2182 invoke_rcu_core();
2193 } else { 2183 } else
2194 local_irq_restore(flags);
2195 trace_rcu_prep_idle("Callbacks drained"); 2184 trace_rcu_prep_idle("Callbacks drained");
2196 }
2197} 2185}
2198 2186
2199#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 2187#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
2188
2189#ifdef CONFIG_RCU_CPU_STALL_INFO
2190
2191#ifdef CONFIG_RCU_FAST_NO_HZ
2192
2193static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2194{
2195 struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu);
2196
2197 sprintf(cp, "drain=%d %c timer=%lld",
2198 per_cpu(rcu_dyntick_drain, cpu),
2199 per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.',
2200 hrtimer_active(hrtp)
2201 ? ktime_to_us(hrtimer_get_remaining(hrtp))
2202 : -1);
2203}
2204
2205#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
2206
2207static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2208{
2209}
2210
2211#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
2212
2213/* Initiate the stall-info list. */
2214static void print_cpu_stall_info_begin(void)
2215{
2216 printk(KERN_CONT "\n");
2217}
2218
2219/*
2220 * Print out diagnostic information for the specified stalled CPU.
2221 *
2222 * If the specified CPU is aware of the current RCU grace period
2223 * (flavor specified by rsp), then print the number of scheduling
2224 * clock interrupts the CPU has taken during the time that it has
2225 * been aware. Otherwise, print the number of RCU grace periods
2226 * that this CPU is ignorant of, for example, "1" if the CPU was
2227 * aware of the previous grace period.
2228 *
2229 * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info.
2230 */
2231static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
2232{
2233 char fast_no_hz[72];
2234 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2235 struct rcu_dynticks *rdtp = rdp->dynticks;
2236 char *ticks_title;
2237 unsigned long ticks_value;
2238
2239 if (rsp->gpnum == rdp->gpnum) {
2240 ticks_title = "ticks this GP";
2241 ticks_value = rdp->ticks_this_gp;
2242 } else {
2243 ticks_title = "GPs behind";
2244 ticks_value = rsp->gpnum - rdp->gpnum;
2245 }
2246 print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
2247 printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n",
2248 cpu, ticks_value, ticks_title,
2249 atomic_read(&rdtp->dynticks) & 0xfff,
2250 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
2251 fast_no_hz);
2252}
2253
2254/* Terminate the stall-info list. */
2255static void print_cpu_stall_info_end(void)
2256{
2257 printk(KERN_ERR "\t");
2258}
2259
2260/* Zero ->ticks_this_gp for all flavors of RCU. */
2261static void zero_cpu_stall_ticks(struct rcu_data *rdp)
2262{
2263 rdp->ticks_this_gp = 0;
2264}
2265
2266/* Increment ->ticks_this_gp for all flavors of RCU. */
2267static void increment_cpu_stall_ticks(void)
2268{
2269 __get_cpu_var(rcu_sched_data).ticks_this_gp++;
2270 __get_cpu_var(rcu_bh_data).ticks_this_gp++;
2271#ifdef CONFIG_TREE_PREEMPT_RCU
2272 __get_cpu_var(rcu_preempt_data).ticks_this_gp++;
2273#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
2274}
2275
2276#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
2277
2278static void print_cpu_stall_info_begin(void)
2279{
2280 printk(KERN_CONT " {");
2281}
2282
2283static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
2284{
2285 printk(KERN_CONT " %d", cpu);
2286}
2287
2288static void print_cpu_stall_info_end(void)
2289{
2290 printk(KERN_CONT "} ");
2291}
2292
2293static void zero_cpu_stall_ticks(struct rcu_data *rdp)
2294{
2295}
2296
2297static void increment_cpu_stall_ticks(void)
2298{
2299}
2300
2301#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
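Putting the pieces together, each per-CPU line emitted by print_cpu_stall_info() has the shape (placeholders substituted for the printk arguments; no real values implied):

	<cpu>: (<ticks_value> <ticks_title>) idle=<dynticks & 0xfff>/<dynticks_nesting>/<dynticks_nmi_nesting> <fast_no_hz>

where <ticks_title> is either "ticks this GP" or "GPs behind", and, under CONFIG_RCU_FAST_NO_HZ, <fast_no_hz> expands to "drain=<rcu_dyntick_drain> <H or .> timer=<remaining usecs, or -1 if inactive>". Without CONFIG_RCU_CPU_STALL_INFO, the old terse "{ cpu cpu ... }" list is printed instead.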
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 654cfe67f0d1..ed459edeff43 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -72,9 +72,9 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
72 rdp->dynticks->dynticks_nesting, 72 rdp->dynticks->dynticks_nesting,
73 rdp->dynticks->dynticks_nmi_nesting, 73 rdp->dynticks->dynticks_nmi_nesting,
74 rdp->dynticks_fqs); 74 rdp->dynticks_fqs);
75 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); 75 seq_printf(m, " of=%lu", rdp->offline_fqs);
76 seq_printf(m, " ql=%ld qs=%c%c%c%c", 76 seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c",
77 rdp->qlen, 77 rdp->qlen_lazy, rdp->qlen,
78 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != 78 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
79 rdp->nxttail[RCU_NEXT_TAIL]], 79 rdp->nxttail[RCU_NEXT_TAIL]],
80 ".R"[rdp->nxttail[RCU_WAIT_TAIL] != 80 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
@@ -144,8 +144,8 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
144 rdp->dynticks->dynticks_nesting, 144 rdp->dynticks->dynticks_nesting,
145 rdp->dynticks->dynticks_nmi_nesting, 145 rdp->dynticks->dynticks_nmi_nesting,
146 rdp->dynticks_fqs); 146 rdp->dynticks_fqs);
147 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); 147 seq_printf(m, ",%lu", rdp->offline_fqs);
148 seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, 148 seq_printf(m, ",%ld,%ld,\"%c%c%c%c\"", rdp->qlen_lazy, rdp->qlen,
149 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != 149 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
150 rdp->nxttail[RCU_NEXT_TAIL]], 150 rdp->nxttail[RCU_NEXT_TAIL]],
151 ".R"[rdp->nxttail[RCU_WAIT_TAIL] != 151 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
@@ -168,7 +168,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
168{ 168{
169 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); 169 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\",");
170 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); 170 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
171 seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); 171 seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\"");
172#ifdef CONFIG_RCU_BOOST 172#ifdef CONFIG_RCU_BOOST
173 seq_puts(m, "\"kt\",\"ktl\""); 173 seq_puts(m, "\"kt\",\"ktl\"");
174#endif /* #ifdef CONFIG_RCU_BOOST */ 174#endif /* #ifdef CONFIG_RCU_BOOST */
diff --git a/kernel/resource.c b/kernel/resource.c
index 7640b3a947d0..7e8ea66a8c01 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -749,6 +749,7 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
749 write_unlock(&resource_lock); 749 write_unlock(&resource_lock);
750 return result; 750 return result;
751} 751}
752EXPORT_SYMBOL(adjust_resource);
752 753
753static void __init __reserve_region_with_split(struct resource *root, 754static void __init __reserve_region_with_split(struct resource *root,
754 resource_size_t start, resource_size_t end, 755 resource_size_t start, resource_size_t end,
@@ -792,8 +793,6 @@ void __init reserve_region_with_split(struct resource *root,
792 write_unlock(&resource_lock); 793 write_unlock(&resource_lock);
793} 794}
794 795
795EXPORT_SYMBOL(adjust_resource);
796
797/** 796/**
798 * resource_alignment - calculate resource's alignment 797 * resource_alignment - calculate resource's alignment
799 * @res: resource pointer 798 * @res: resource pointer
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index b152f74f02de..6850f53e02d8 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -10,7 +10,6 @@
10#include <linux/export.h> 10#include <linux/export.h>
11#include <linux/rwsem.h> 11#include <linux/rwsem.h>
12 12
13#include <asm/system.h>
14#include <linux/atomic.h> 13#include <linux/atomic.h>
15 14
16/* 15/*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9c1629c90b2d..e3ed0ecee7c7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -71,7 +71,9 @@
71#include <linux/ftrace.h> 71#include <linux/ftrace.h>
72#include <linux/slab.h> 72#include <linux/slab.h>
73#include <linux/init_task.h> 73#include <linux/init_task.h>
74#include <linux/binfmts.h>
74 75
76#include <asm/switch_to.h>
75#include <asm/tlb.h> 77#include <asm/tlb.h>
76#include <asm/irq_regs.h> 78#include <asm/irq_regs.h>
77#include <asm/mutex.h> 79#include <asm/mutex.h>
@@ -162,13 +164,13 @@ static int sched_feat_show(struct seq_file *m, void *v)
162 164
163#ifdef HAVE_JUMP_LABEL 165#ifdef HAVE_JUMP_LABEL
164 166
165#define jump_label_key__true jump_label_key_enabled 167#define jump_label_key__true STATIC_KEY_INIT_TRUE
166#define jump_label_key__false jump_label_key_disabled 168#define jump_label_key__false STATIC_KEY_INIT_FALSE
167 169
168#define SCHED_FEAT(name, enabled) \ 170#define SCHED_FEAT(name, enabled) \
169 jump_label_key__##enabled , 171 jump_label_key__##enabled ,
170 172
171struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { 173struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
172#include "features.h" 174#include "features.h"
173}; 175};
174 176
@@ -176,14 +178,14 @@ struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = {
176 178
177static void sched_feat_disable(int i) 179static void sched_feat_disable(int i)
178{ 180{
179 if (jump_label_enabled(&sched_feat_keys[i])) 181 if (static_key_enabled(&sched_feat_keys[i]))
180 jump_label_dec(&sched_feat_keys[i]); 182 static_key_slow_dec(&sched_feat_keys[i]);
181} 183}
182 184
183static void sched_feat_enable(int i) 185static void sched_feat_enable(int i)
184{ 186{
185 if (!jump_label_enabled(&sched_feat_keys[i])) 187 if (!static_key_enabled(&sched_feat_keys[i]))
186 jump_label_inc(&sched_feat_keys[i]); 188 static_key_slow_inc(&sched_feat_keys[i]);
187} 189}
188#else 190#else
189static void sched_feat_disable(int i) { }; 191static void sched_feat_disable(int i) { };
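For readers unfamiliar with the renamed API, a minimal sketch of the pattern this hunk converts to (my_key, my_fast_path() and do_rare_work() are hypothetical):

#include <linux/static_key.h>
#include <linux/types.h>

static struct static_key my_key = STATIC_KEY_INIT_FALSE;

void do_rare_work(void);		/* hypothetical slow-path helper */

static void my_fast_path(void)
{
	if (static_key_false(&my_key))	/* out-of-line branch, default off */
		do_rare_work();
}

static void my_feature_set(bool on)
{
	if (on && !static_key_enabled(&my_key))
		static_key_slow_inc(&my_key);
	else if (!on && static_key_enabled(&my_key))
		static_key_slow_dec(&my_key);
}

static_key_true() is the complementary helper for keys initialized with STATIC_KEY_INIT_TRUE, as used by static_branch__true() in kernel/sched/sched.h below.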
@@ -894,7 +896,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
894 delta -= irq_delta; 896 delta -= irq_delta;
895#endif 897#endif
896#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING 898#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
897 if (static_branch((&paravirt_steal_rq_enabled))) { 899 if (static_key_false((&paravirt_steal_rq_enabled))) {
898 u64 st; 900 u64 st;
899 901
900 steal = paravirt_steal_clock(cpu_of(rq)); 902 steal = paravirt_steal_clock(cpu_of(rq));
@@ -2784,7 +2786,7 @@ void account_idle_time(cputime_t cputime)
2784static __always_inline bool steal_account_process_tick(void) 2786static __always_inline bool steal_account_process_tick(void)
2785{ 2787{
2786#ifdef CONFIG_PARAVIRT 2788#ifdef CONFIG_PARAVIRT
2787 if (static_branch(&paravirt_steal_enabled)) { 2789 if (static_key_false(&paravirt_steal_enabled)) {
2788 u64 steal, st = 0; 2790 u64 steal, st = 0;
2789 2791
2790 steal = paravirt_steal_clock(smp_processor_id()); 2792 steal = paravirt_steal_clock(smp_processor_id());
@@ -7601,8 +7603,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7601 struct task_group, css); 7603 struct task_group, css);
7602} 7604}
7603 7605
7604static struct cgroup_subsys_state * 7606static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)
7605cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
7606{ 7607{
7607 struct task_group *tg, *parent; 7608 struct task_group *tg, *parent;
7608 7609
@@ -7619,15 +7620,14 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
7619 return &tg->css; 7620 return &tg->css;
7620} 7621}
7621 7622
7622static void 7623static void cpu_cgroup_destroy(struct cgroup *cgrp)
7623cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
7624{ 7624{
7625 struct task_group *tg = cgroup_tg(cgrp); 7625 struct task_group *tg = cgroup_tg(cgrp);
7626 7626
7627 sched_destroy_group(tg); 7627 sched_destroy_group(tg);
7628} 7628}
7629 7629
7630static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 7630static int cpu_cgroup_can_attach(struct cgroup *cgrp,
7631 struct cgroup_taskset *tset) 7631 struct cgroup_taskset *tset)
7632{ 7632{
7633 struct task_struct *task; 7633 struct task_struct *task;
@@ -7645,7 +7645,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7645 return 0; 7645 return 0;
7646} 7646}
7647 7647
7648static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 7648static void cpu_cgroup_attach(struct cgroup *cgrp,
7649 struct cgroup_taskset *tset) 7649 struct cgroup_taskset *tset)
7650{ 7650{
7651 struct task_struct *task; 7651 struct task_struct *task;
@@ -7655,8 +7655,8 @@ static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7655} 7655}
7656 7656
7657static void 7657static void
7658cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, 7658cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7659 struct cgroup *old_cgrp, struct task_struct *task) 7659 struct task_struct *task)
7660{ 7660{
7661 /* 7661 /*
7662 * cgroup_exit() is called in the copy_process() failure path. 7662 * cgroup_exit() is called in the copy_process() failure path.
@@ -8006,8 +8006,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
8006 */ 8006 */
8007 8007
8008/* create a new cpu accounting group */ 8008/* create a new cpu accounting group */
8009static struct cgroup_subsys_state *cpuacct_create( 8009static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp)
8010 struct cgroup_subsys *ss, struct cgroup *cgrp)
8011{ 8010{
8012 struct cpuacct *ca; 8011 struct cpuacct *ca;
8013 8012
@@ -8037,8 +8036,7 @@ out:
8037} 8036}
8038 8037
8039/* destroy an existing cpu accounting group */ 8038/* destroy an existing cpu accounting group */
8040static void 8039static void cpuacct_destroy(struct cgroup *cgrp)
8041cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
8042{ 8040{
8043 struct cpuacct *ca = cgroup_ca(cgrp); 8041 struct cpuacct *ca = cgroup_ca(cgrp);
8044 8042
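To summarize the interface change running through this file: every cgroup callback loses its struct cgroup_subsys argument, leaving prototypes of the form below (reconstructed from the hunks above; the callbacks can still reach their subsystem state through the cgroup itself, e.g. via cgroup_tg(cgrp)):

static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp);
static void cpu_cgroup_destroy(struct cgroup *cgrp);
static int  cpu_cgroup_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset);
static void cpu_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset);
static void cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
			    struct task_struct *task);
static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp);
static void cpuacct_destroy(struct cgroup *cgrp);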
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 258f430d71a5..0d97ebdc58f0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1386,20 +1386,20 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1386#ifdef CONFIG_CFS_BANDWIDTH 1386#ifdef CONFIG_CFS_BANDWIDTH
1387 1387
1388#ifdef HAVE_JUMP_LABEL 1388#ifdef HAVE_JUMP_LABEL
1389static struct jump_label_key __cfs_bandwidth_used; 1389static struct static_key __cfs_bandwidth_used;
1390 1390
1391static inline bool cfs_bandwidth_used(void) 1391static inline bool cfs_bandwidth_used(void)
1392{ 1392{
1393 return static_branch(&__cfs_bandwidth_used); 1393 return static_key_false(&__cfs_bandwidth_used);
1394} 1394}
1395 1395
1396void account_cfs_bandwidth_used(int enabled, int was_enabled) 1396void account_cfs_bandwidth_used(int enabled, int was_enabled)
1397{ 1397{
1398 /* only need to count groups transitioning between enabled/!enabled */ 1398 /* only need to count groups transitioning between enabled/!enabled */
1399 if (enabled && !was_enabled) 1399 if (enabled && !was_enabled)
1400 jump_label_inc(&__cfs_bandwidth_used); 1400 static_key_slow_inc(&__cfs_bandwidth_used);
1401 else if (!enabled && was_enabled) 1401 else if (!enabled && was_enabled)
1402 jump_label_dec(&__cfs_bandwidth_used); 1402 static_key_slow_dec(&__cfs_bandwidth_used);
1403} 1403}
1404#else /* HAVE_JUMP_LABEL */ 1404#else /* HAVE_JUMP_LABEL */
1405static bool cfs_bandwidth_used(void) 1405static bool cfs_bandwidth_used(void)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 753bdd567416..42b1f304b044 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -600,7 +600,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
600 * Tunables that become constants when CONFIG_SCHED_DEBUG is off: 600 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
601 */ 601 */
602#ifdef CONFIG_SCHED_DEBUG 602#ifdef CONFIG_SCHED_DEBUG
603# include <linux/jump_label.h> 603# include <linux/static_key.h>
604# define const_debug __read_mostly 604# define const_debug __read_mostly
605#else 605#else
606# define const_debug const 606# define const_debug const
@@ -619,18 +619,18 @@ enum {
619#undef SCHED_FEAT 619#undef SCHED_FEAT
620 620
621#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) 621#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
622static __always_inline bool static_branch__true(struct jump_label_key *key) 622static __always_inline bool static_branch__true(struct static_key *key)
623{ 623{
624 return likely(static_branch(key)); /* Not out of line branch. */ 624 return static_key_true(key); /* Not out of line branch. */
625} 625}
626 626
627static __always_inline bool static_branch__false(struct jump_label_key *key) 627static __always_inline bool static_branch__false(struct static_key *key)
628{ 628{
629 return unlikely(static_branch(key)); /* Out of line branch. */ 629 return static_key_false(key); /* Out of line branch. */
630} 630}
631 631
632#define SCHED_FEAT(name, enabled) \ 632#define SCHED_FEAT(name, enabled) \
633static __always_inline bool static_branch_##name(struct jump_label_key *key) \ 633static __always_inline bool static_branch_##name(struct static_key *key) \
634{ \ 634{ \
635 return static_branch__##enabled(key); \ 635 return static_branch__##enabled(key); \
636} 636}
@@ -639,7 +639,7 @@ static __always_inline bool static_branch_##name(struct jump_label_key *key) \
639 639
640#undef SCHED_FEAT 640#undef SCHED_FEAT
641 641
642extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR]; 642extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
643#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) 643#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
644#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ 644#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
645#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) 645#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
diff --git a/kernel/signal.c b/kernel/signal.c
index c73c4284160e..17afcaf582d0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -36,6 +36,7 @@
36#include <asm/uaccess.h> 36#include <asm/uaccess.h>
37#include <asm/unistd.h> 37#include <asm/unistd.h>
38#include <asm/siginfo.h> 38#include <asm/siginfo.h>
39#include <asm/cacheflush.h>
39#include "audit.h" /* audit_signal_info() */ 40#include "audit.h" /* audit_signal_info() */
40 41
41/* 42/*
@@ -58,21 +59,20 @@ static int sig_handler_ignored(void __user *handler, int sig)
58 (handler == SIG_DFL && sig_kernel_ignore(sig)); 59 (handler == SIG_DFL && sig_kernel_ignore(sig));
59} 60}
60 61
61static int sig_task_ignored(struct task_struct *t, int sig, 62static int sig_task_ignored(struct task_struct *t, int sig, bool force)
62 int from_ancestor_ns)
63{ 63{
64 void __user *handler; 64 void __user *handler;
65 65
66 handler = sig_handler(t, sig); 66 handler = sig_handler(t, sig);
67 67
68 if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && 68 if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
69 handler == SIG_DFL && !from_ancestor_ns) 69 handler == SIG_DFL && !force)
70 return 1; 70 return 1;
71 71
72 return sig_handler_ignored(handler, sig); 72 return sig_handler_ignored(handler, sig);
73} 73}
74 74
75static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns) 75static int sig_ignored(struct task_struct *t, int sig, bool force)
76{ 76{
77 /* 77 /*
78 * Blocked signals are never ignored, since the 78 * Blocked signals are never ignored, since the
@@ -82,7 +82,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
82 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) 82 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
83 return 0; 83 return 0;
84 84
85 if (!sig_task_ignored(t, sig, from_ancestor_ns)) 85 if (!sig_task_ignored(t, sig, force))
86 return 0; 86 return 0;
87 87
88 /* 88 /*
@@ -855,7 +855,7 @@ static void ptrace_trap_notify(struct task_struct *t)
855 * Returns true if the signal should be actually delivered, otherwise 855 * Returns true if the signal should be actually delivered, otherwise
856 * it should be dropped. 856 * it should be dropped.
857 */ 857 */
858static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) 858static int prepare_signal(int sig, struct task_struct *p, bool force)
859{ 859{
860 struct signal_struct *signal = p->signal; 860 struct signal_struct *signal = p->signal;
861 struct task_struct *t; 861 struct task_struct *t;
@@ -915,7 +915,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
915 } 915 }
916 } 916 }
917 917
918 return !sig_ignored(p, sig, from_ancestor_ns); 918 return !sig_ignored(p, sig, force);
919} 919}
920 920
921/* 921/*
@@ -1054,13 +1054,14 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1054 struct sigpending *pending; 1054 struct sigpending *pending;
1055 struct sigqueue *q; 1055 struct sigqueue *q;
1056 int override_rlimit; 1056 int override_rlimit;
1057 1057 int ret = 0, result;
1058 trace_signal_generate(sig, info, t);
1059 1058
1060 assert_spin_locked(&t->sighand->siglock); 1059 assert_spin_locked(&t->sighand->siglock);
1061 1060
1062 if (!prepare_signal(sig, t, from_ancestor_ns)) 1061 result = TRACE_SIGNAL_IGNORED;
1063 return 0; 1062 if (!prepare_signal(sig, t,
1063 from_ancestor_ns || (info == SEND_SIG_FORCED)))
1064 goto ret;
1064 1065
1065 pending = group ? &t->signal->shared_pending : &t->pending; 1066 pending = group ? &t->signal->shared_pending : &t->pending;
1066 /* 1067 /*
@@ -1068,8 +1069,11 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1068 * exactly one non-rt signal, so that we can get more 1069 * exactly one non-rt signal, so that we can get more
1069 * detailed information about the cause of the signal. 1070 * detailed information about the cause of the signal.
1070 */ 1071 */
1072 result = TRACE_SIGNAL_ALREADY_PENDING;
1071 if (legacy_queue(pending, sig)) 1073 if (legacy_queue(pending, sig))
1072 return 0; 1074 goto ret;
1075
1076 result = TRACE_SIGNAL_DELIVERED;
1073 /* 1077 /*
1074 * fast-pathed signals for kernel-internal things like SIGSTOP 1078 * fast-pathed signals for kernel-internal things like SIGSTOP
1075 * or SIGKILL. 1079 * or SIGKILL.
@@ -1127,14 +1131,15 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1127 * signal was rt and sent by user using something 1131 * signal was rt and sent by user using something
1128 * other than kill(). 1132 * other than kill().
1129 */ 1133 */
1130 trace_signal_overflow_fail(sig, group, info); 1134 result = TRACE_SIGNAL_OVERFLOW_FAIL;
1131 return -EAGAIN; 1135 ret = -EAGAIN;
1136 goto ret;
1132 } else { 1137 } else {
1133 /* 1138 /*
1134 * This is a silent loss of information. We still 1139 * This is a silent loss of information. We still
1135 * send the signal, but the *info bits are lost. 1140 * send the signal, but the *info bits are lost.
1136 */ 1141 */
1137 trace_signal_lose_info(sig, group, info); 1142 result = TRACE_SIGNAL_LOSE_INFO;
1138 } 1143 }
1139 } 1144 }
1140 1145
@@ -1142,7 +1147,9 @@ out_set:
1142 signalfd_notify(t, sig); 1147 signalfd_notify(t, sig);
1143 sigaddset(&pending->signal, sig); 1148 sigaddset(&pending->signal, sig);
1144 complete_signal(sig, t, group); 1149 complete_signal(sig, t, group);
1145 return 0; 1150ret:
1151 trace_signal_generate(sig, info, t, group, result);
1152 return ret;
1146} 1153}
1147 1154
1148static int send_signal(int sig, struct siginfo *info, struct task_struct *t, 1155static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
@@ -1585,7 +1592,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1585 int sig = q->info.si_signo; 1592 int sig = q->info.si_signo;
1586 struct sigpending *pending; 1593 struct sigpending *pending;
1587 unsigned long flags; 1594 unsigned long flags;
1588 int ret; 1595 int ret, result;
1589 1596
1590 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); 1597 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1591 1598
@@ -1594,7 +1601,8 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1594 goto ret; 1601 goto ret;
1595 1602
1596 ret = 1; /* the signal is ignored */ 1603 ret = 1; /* the signal is ignored */
1597 if (!prepare_signal(sig, t, 0)) 1604 result = TRACE_SIGNAL_IGNORED;
1605 if (!prepare_signal(sig, t, false))
1598 goto out; 1606 goto out;
1599 1607
1600 ret = 0; 1608 ret = 0;
@@ -1605,6 +1613,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1605 */ 1613 */
1606 BUG_ON(q->info.si_code != SI_TIMER); 1614 BUG_ON(q->info.si_code != SI_TIMER);
1607 q->info.si_overrun++; 1615 q->info.si_overrun++;
1616 result = TRACE_SIGNAL_ALREADY_PENDING;
1608 goto out; 1617 goto out;
1609 } 1618 }
1610 q->info.si_overrun = 0; 1619 q->info.si_overrun = 0;
@@ -1614,7 +1623,9 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1614 list_add_tail(&q->list, &pending->list); 1623 list_add_tail(&q->list, &pending->list);
1615 sigaddset(&pending->signal, sig); 1624 sigaddset(&pending->signal, sig);
1616 complete_signal(sig, t, group); 1625 complete_signal(sig, t, group);
1626 result = TRACE_SIGNAL_DELIVERED;
1617out: 1627out:
1628 trace_signal_generate(sig, &q->info, t, group, result);
1618 unlock_task_sighand(t, &flags); 1629 unlock_task_sighand(t, &flags);
1619ret: 1630ret:
1620 return ret; 1631 return ret;
@@ -1642,6 +1653,15 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1642 BUG_ON(!tsk->ptrace && 1653 BUG_ON(!tsk->ptrace &&
1643 (tsk->group_leader != tsk || !thread_group_empty(tsk))); 1654 (tsk->group_leader != tsk || !thread_group_empty(tsk)));
1644 1655
1656 if (sig != SIGCHLD) {
1657 /*
1658 * This is only possible if parent == real_parent.
1659 * Check if it has changed security domain.
1660 */
1661 if (tsk->parent_exec_id != tsk->parent->self_exec_id)
1662 sig = SIGCHLD;
1663 }
1664
1645 info.si_signo = sig; 1665 info.si_signo = sig;
1646 info.si_errno = 0; 1666 info.si_errno = 0;
1647 /* 1667 /*
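
The hunk above makes do_notify_parent() fall back to SIGCHLD whenever the recorded parent_exec_id no longer matches the parent's self_exec_id, i.e. the parent has exec'd since the child was created. A hypothetical userspace illustration of the case being guarded against (everything here except the kernel-side check is an assumption made for demonstration):

	#define _GNU_SOURCE
	#include <sched.h>
	#include <signal.h>
	#include <sys/types.h>
	#include <unistd.h>

	static char child_stack[64 * 1024];

	static int child_fn(void *arg)
	{
		sleep(2);		/* outlive the parent's exec() */
		return 0;
	}

	int main(void)
	{
		/*
		 * Ask for SIGUSR1 (default action: terminate) instead of
		 * SIGCHLD as this child's exit notification.
		 */
		pid_t pid = clone(child_fn, child_stack + sizeof(child_stack),
				  SIGUSR1, NULL);
		if (pid < 0)
			return 1;

		/*
		 * exec() bumps self_exec_id, so it no longer matches the
		 * child's parent_exec_id.  With the check above, the child's
		 * later exit reaches the exec'd program as plain SIGCHLD
		 * rather than the lethal SIGUSR1 chosen before the exec.
		 */
		execlp("sleep", "sleep", "5", (char *)NULL);
		return 1;	/* only reached if exec failed */
	}
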
diff --git a/kernel/smp.c b/kernel/smp.c
index db197d60489b..2f8b10ecf759 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -701,3 +701,93 @@ int on_each_cpu(void (*func) (void *info), void *info, int wait)
701 return ret; 701 return ret;
702} 702}
703EXPORT_SYMBOL(on_each_cpu); 703EXPORT_SYMBOL(on_each_cpu);
704
705/**
706 * on_each_cpu_mask(): Run a function on processors specified by
707 * cpumask, which may include the local processor.
708 * @mask: The set of cpus to run on (only runs on online subset).
709 * @func: The function to run. This must be fast and non-blocking.
710 * @info: An arbitrary pointer to pass to the function.
711 * @wait: If true, wait (atomically) until function has completed
712 * on other CPUs.
713 *
714 * If @wait is true, then returns once @func has returned.
715 *
716 * You must not call this function with disabled interrupts or
717 * from a hardware interrupt handler or from a bottom half handler.
718 */
719void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
720 void *info, bool wait)
721{
722 int cpu = get_cpu();
723
724 smp_call_function_many(mask, func, info, wait);
725 if (cpumask_test_cpu(cpu, mask)) {
726 local_irq_disable();
727 func(info);
728 local_irq_enable();
729 }
730 put_cpu();
731}
732EXPORT_SYMBOL(on_each_cpu_mask);
733
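
A minimal usage sketch for on_each_cpu_mask() as documented above; the per-CPU variable and the drain helpers are made up for illustration and are not part of the patch:

	#include <linux/smp.h>
	#include <linux/percpu.h>
	#include <linux/cpumask.h>

	static DEFINE_PER_CPU(unsigned long, pending_bytes);

	/* Runs on each selected CPU, with interrupts disabled. */
	static void drain_local_counter(void *info)
	{
		this_cpu_write(pending_bytes, 0);
	}

	/*
	 * Process context, IRQs enabled: IPI every online CPU in @dirty
	 * (possibly including this one) and wait for drain_local_counter()
	 * to finish everywhere.
	 */
	static void drain_cpus(const struct cpumask *dirty)
	{
		on_each_cpu_mask(dirty, drain_local_counter, NULL, true);
	}
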
734/*
735 * on_each_cpu_cond(): Call a function on each processor for which
736 * the supplied function cond_func returns true, optionally waiting
737 * for all the required CPUs to finish. This may include the local
738 * processor.
739 * @cond_func: A callback function that is passed a cpu id and
 740 * the info parameter. The function is called
 741 * with preemption disabled. The function should
 742 * return a boolean value indicating whether to IPI
743 * the specified CPU.
744 * @func: The function to run on all applicable CPUs.
745 * This must be fast and non-blocking.
746 * @info: An arbitrary pointer to pass to both functions.
747 * @wait: If true, wait (atomically) until function has
748 * completed on other CPUs.
749 * @gfp_flags: GFP flags to use when allocating the cpumask
750 * used internally by the function.
751 *
 752 * The function might sleep if the GFP flags indicate a non-
753 * atomic allocation is allowed.
754 *
 755 * Preemption is disabled to protect against CPUs going offline during the call,
 756 * but not against CPUs coming online: such CPUs will not be seen or sent an IPI.
757 *
758 * You must not call this function with disabled interrupts or
759 * from a hardware interrupt handler or from a bottom half handler.
760 */
761void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
762 smp_call_func_t func, void *info, bool wait,
763 gfp_t gfp_flags)
764{
765 cpumask_var_t cpus;
766 int cpu, ret;
767
768 might_sleep_if(gfp_flags & __GFP_WAIT);
769
770 if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) {
771 preempt_disable();
772 for_each_online_cpu(cpu)
773 if (cond_func(cpu, info))
774 cpumask_set_cpu(cpu, cpus);
775 on_each_cpu_mask(cpus, func, info, wait);
776 preempt_enable();
777 free_cpumask_var(cpus);
778 } else {
779 /*
780 * No free cpumask, bother. No matter, we'll
781 * just have to IPI them one by one.
782 */
783 preempt_disable();
784 for_each_online_cpu(cpu)
785 if (cond_func(cpu, info)) {
786 ret = smp_call_function_single(cpu, func,
787 info, wait);
788 WARN_ON_ONCE(!ret);
789 }
790 preempt_enable();
791 }
792}
793EXPORT_SYMBOL(on_each_cpu_cond);
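
A corresponding sketch for on_each_cpu_cond(): the predicate runs with preemption disabled and must not sleep, while the call itself may sleep for the internal cpumask allocation when the GFP flags allow it. The names below are hypothetical:

	#include <linux/smp.h>
	#include <linux/percpu.h>
	#include <linux/gfp.h>

	static DEFINE_PER_CPU(bool, cache_dirty);

	/* Called with preemption disabled; decides whether @cpu gets an IPI. */
	static bool cpu_is_dirty(int cpu, void *info)
	{
		return per_cpu(cache_dirty, cpu);
	}

	static void flush_local_cache(void *info)
	{
		this_cpu_write(cache_dirty, false);
	}

	static void flush_dirty_cpus(void)
	{
		/* May sleep (GFP_KERNEL) while allocating the temporary mask. */
		on_each_cpu_cond(cpu_is_dirty, flush_local_cache, NULL,
				 true, GFP_KERNEL);
	}
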
diff --git a/kernel/softirq.c b/kernel/softirq.c
index f268369ebe1f..671f9594e368 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -297,7 +297,7 @@ void irq_enter(void)
297 int cpu = smp_processor_id(); 297 int cpu = smp_processor_id();
298 298
299 rcu_irq_enter(); 299 rcu_irq_enter();
300 if (idle_cpu(cpu) && !in_interrupt()) { 300 if (is_idle_task(current) && !in_interrupt()) {
301 /* 301 /*
302 * Prevent raise_softirq from needlessly waking up ksoftirqd 302 * Prevent raise_softirq from needlessly waking up ksoftirqd
303 * here, as softirq will be serviced on return from interrupt. 303 * here, as softirq will be serviced on return from interrupt.
@@ -310,31 +310,21 @@ void irq_enter(void)
310 __irq_enter(); 310 __irq_enter();
311} 311}
312 312
313#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
314static inline void invoke_softirq(void) 313static inline void invoke_softirq(void)
315{ 314{
316 if (!force_irqthreads) 315 if (!force_irqthreads) {
316#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
317 __do_softirq(); 317 __do_softirq();
318 else {
319 __local_bh_disable((unsigned long)__builtin_return_address(0),
320 SOFTIRQ_OFFSET);
321 wakeup_softirqd();
322 __local_bh_enable(SOFTIRQ_OFFSET);
323 }
324}
325#else 318#else
326static inline void invoke_softirq(void)
327{
328 if (!force_irqthreads)
329 do_softirq(); 319 do_softirq();
330 else { 320#endif
321 } else {
331 __local_bh_disable((unsigned long)__builtin_return_address(0), 322 __local_bh_disable((unsigned long)__builtin_return_address(0),
332 SOFTIRQ_OFFSET); 323 SOFTIRQ_OFFSET);
333 wakeup_softirqd(); 324 wakeup_softirqd();
334 __local_bh_enable(SOFTIRQ_OFFSET); 325 __local_bh_enable(SOFTIRQ_OFFSET);
335 } 326 }
336} 327}
337#endif
338 328
339/* 329/*
340 * Exit an interrupt context. Process softirqs if needed and possible: 330 * Exit an interrupt context. Process softirqs if needed and possible:
@@ -385,6 +375,12 @@ void raise_softirq(unsigned int nr)
385 local_irq_restore(flags); 375 local_irq_restore(flags);
386} 376}
387 377
378void __raise_softirq_irqoff(unsigned int nr)
379{
380 trace_softirq_raise(nr);
381 or_softirq_pending(1UL << nr);
382}
383
388void open_softirq(int nr, void (*action)(struct softirq_action *)) 384void open_softirq(int nr, void (*action)(struct softirq_action *))
389{ 385{
390 softirq_vec[nr].action = action; 386 softirq_vec[nr].action = action;
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 84c7d96918bf..5cdd8065a3ce 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -163,7 +163,7 @@ void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
163EXPORT_SYMBOL(_raw_spin_lock_bh); 163EXPORT_SYMBOL(_raw_spin_lock_bh);
164#endif 164#endif
165 165
166#ifndef CONFIG_INLINE_SPIN_UNLOCK 166#ifdef CONFIG_UNINLINE_SPIN_UNLOCK
167void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) 167void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
168{ 168{
169 __raw_spin_unlock(lock); 169 __raw_spin_unlock(lock);
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 0febf61e1aa3..ba35f3a4a1f4 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -172,6 +172,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
172{ 172{
173 int idx; 173 int idx;
174 174
175 rcu_lockdep_assert(!lock_is_held(&sp->dep_map) &&
176 !lock_is_held(&rcu_bh_lock_map) &&
177 !lock_is_held(&rcu_lock_map) &&
178 !lock_is_held(&rcu_sched_lock_map),
179 "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
180
175 idx = sp->completed; 181 idx = sp->completed;
176 mutex_lock(&sp->mutex); 182 mutex_lock(&sp->mutex);
177 183
@@ -280,19 +286,26 @@ void synchronize_srcu(struct srcu_struct *sp)
280EXPORT_SYMBOL_GPL(synchronize_srcu); 286EXPORT_SYMBOL_GPL(synchronize_srcu);
281 287
282/** 288/**
283 * synchronize_srcu_expedited - like synchronize_srcu, but less patient 289 * synchronize_srcu_expedited - Brute-force SRCU grace period
284 * @sp: srcu_struct with which to synchronize. 290 * @sp: srcu_struct with which to synchronize.
285 * 291 *
286 * Flip the completed counter, and wait for the old count to drain to zero. 292 * Wait for an SRCU grace period to elapse, but use a "big hammer"
287 * As with classic RCU, the updater must use some separate means of 293 * approach to force the grace period to end quickly. This consumes
288 * synchronizing concurrent updates. Can block; must be called from 294 * significant time on all CPUs and is unfriendly to real-time workloads,
 289 * process context. 295 * and is thus not recommended for any sort of common-case code. In fact,
296 * if you are using synchronize_srcu_expedited() in a loop, please
297 * restructure your code to batch your updates, and then use a single
298 * synchronize_srcu() instead.
290 * 299 *
291 * Note that it is illegal to call synchronize_srcu_expedited() 300 * Note that it is illegal to call this function while holding any lock
292 * from the corresponding SRCU read-side critical section; doing so 301 * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
293 * will result in deadlock. However, it is perfectly legal to call 302 * to call this function from a CPU-hotplug notifier. Failing to observe
 294 * synchronize_srcu_expedited() on one srcu_struct from some other 303 * these restrictions will result in deadlock. It is also illegal to call
295 * srcu_struct's read-side critical section. 304 * synchronize_srcu_expedited() from the corresponding SRCU read-side
305 * critical section; doing so will result in deadlock. However, it is
306 * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct
307 * from some other srcu_struct's read-side critical section, as long as
308 * the resulting graph of srcu_structs is acyclic.
296 */ 309 */
297void synchronize_srcu_expedited(struct srcu_struct *sp) 310void synchronize_srcu_expedited(struct srcu_struct *sp)
298{ 311{
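
The updated comment's advice about batching can be illustrated with a short sketch; the srcu_struct, lock and slot arrays below are hypothetical:

	/* Assumes: struct foo *slots[], *retired[]; spinlock_t slots_lock;
	 * struct srcu_struct my_srcu; all updates done under slots_lock. */

	/* Costly: one expedited grace period per retired slot. */
	for (i = 0; i < nr_slots; i++) {
		old = rcu_dereference_protected(slots[i],
						lockdep_is_held(&slots_lock));
		rcu_assign_pointer(slots[i], NULL);
		synchronize_srcu_expedited(&my_srcu);
		kfree(old);
	}

	/* Preferred: retire every slot first, wait once, then free. */
	for (i = 0; i < nr_slots; i++) {
		retired[i] = rcu_dereference_protected(slots[i],
						lockdep_is_held(&slots_lock));
		rcu_assign_pointer(slots[i], NULL);
	}
	synchronize_srcu(&my_srcu);
	for (i = 0; i < nr_slots; i++)
		kfree(retired[i]);
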
diff --git a/kernel/sys.c b/kernel/sys.c
index 40701538fbd1..e7006eb6c1e4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -444,6 +444,15 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
444 magic2 != LINUX_REBOOT_MAGIC2C)) 444 magic2 != LINUX_REBOOT_MAGIC2C))
445 return -EINVAL; 445 return -EINVAL;
446 446
447 /*
448 * If pid namespaces are enabled and the current task is in a child
449 * pid_namespace, the command is handled by reboot_pid_ns() which will
450 * call do_exit().
451 */
452 ret = reboot_pid_ns(task_active_pid_ns(current), cmd);
453 if (ret)
454 return ret;
455
447 /* Instead of trying to make the power_off code look like 456 /* Instead of trying to make the power_off code look like
448 * halt when pm_power_off is not set do it the easy way. 457 * halt when pm_power_off is not set do it the easy way.
449 */ 458 */
@@ -1706,7 +1715,7 @@ static int prctl_set_mm(int opt, unsigned long addr,
1706 if (arg4 | arg5) 1715 if (arg4 | arg5)
1707 return -EINVAL; 1716 return -EINVAL;
1708 1717
1709 if (!capable(CAP_SYS_ADMIN)) 1718 if (!capable(CAP_SYS_RESOURCE))
1710 return -EPERM; 1719 return -EPERM;
1711 1720
1712 if (addr >= TASK_SIZE) 1721 if (addr >= TASK_SIZE)
@@ -1962,6 +1971,14 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1962 case PR_SET_MM: 1971 case PR_SET_MM:
1963 error = prctl_set_mm(arg2, arg3, arg4, arg5); 1972 error = prctl_set_mm(arg2, arg3, arg4, arg5);
1964 break; 1973 break;
1974 case PR_SET_CHILD_SUBREAPER:
1975 me->signal->is_child_subreaper = !!arg2;
1976 error = 0;
1977 break;
1978 case PR_GET_CHILD_SUBREAPER:
1979 error = put_user(me->signal->is_child_subreaper,
1980 (int __user *) arg2);
1981 break;
1965 default: 1982 default:
1966 error = -EINVAL; 1983 error = -EINVAL;
1967 break; 1984 break;
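
A small userspace sketch of the new child-subreaper prctl pair added above; the numeric fallbacks assume the values defined by this series' prctl.h and are only there for older headers:

	#include <stdio.h>
	#include <sys/prctl.h>

	#ifndef PR_SET_CHILD_SUBREAPER
	#define PR_SET_CHILD_SUBREAPER	36	/* assumed value */
	#define PR_GET_CHILD_SUBREAPER	37	/* assumed value */
	#endif

	int main(void)
	{
		int reaper = 0;

		/*
		 * Mark this process as a subreaper: orphaned descendants are
		 * reparented to it rather than to init, so a service manager
		 * can still collect their exit status.
		 */
		if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0))
			perror("PR_SET_CHILD_SUBREAPER");

		if (prctl(PR_GET_CHILD_SUBREAPER, (unsigned long)&reaper,
			  0, 0, 0))
			perror("PR_GET_CHILD_SUBREAPER");
		printf("is_child_subreaper = %d\n", reaper);
		return 0;
	}
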
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f487f257e05e..52b3a06a02f8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -23,6 +23,7 @@
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/sysctl.h> 25#include <linux/sysctl.h>
26#include <linux/bitmap.h>
26#include <linux/signal.h> 27#include <linux/signal.h>
27#include <linux/printk.h> 28#include <linux/printk.h>
28#include <linux/proc_fs.h> 29#include <linux/proc_fs.h>
@@ -58,6 +59,7 @@
58#include <linux/oom.h> 59#include <linux/oom.h>
59#include <linux/kmod.h> 60#include <linux/kmod.h>
60#include <linux/capability.h> 61#include <linux/capability.h>
62#include <linux/binfmts.h>
61 63
62#include <asm/uaccess.h> 64#include <asm/uaccess.h>
63#include <asm/processor.h> 65#include <asm/processor.h>
@@ -67,6 +69,9 @@
67#include <asm/stacktrace.h> 69#include <asm/stacktrace.h>
68#include <asm/io.h> 70#include <asm/io.h>
69#endif 71#endif
72#ifdef CONFIG_SPARC
73#include <asm/setup.h>
74#endif
70#ifdef CONFIG_BSD_PROCESS_ACCT 75#ifdef CONFIG_BSD_PROCESS_ACCT
71#include <linux/acct.h> 76#include <linux/acct.h>
72#endif 77#endif
@@ -141,7 +146,6 @@ static const int cap_last_cap = CAP_LAST_CAP;
141#include <linux/inotify.h> 146#include <linux/inotify.h>
142#endif 147#endif
143#ifdef CONFIG_SPARC 148#ifdef CONFIG_SPARC
144#include <asm/system.h>
145#endif 149#endif
146 150
147#ifdef CONFIG_SPARC64 151#ifdef CONFIG_SPARC64
@@ -192,20 +196,6 @@ static int sysrq_sysctl_handler(ctl_table *table, int write,
192 196
193#endif 197#endif
194 198
195static struct ctl_table root_table[];
196static struct ctl_table_root sysctl_table_root;
197static struct ctl_table_header root_table_header = {
198 {{.count = 1,
199 .ctl_table = root_table,
200 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}},
201 .root = &sysctl_table_root,
202 .set = &sysctl_table_root.default_set,
203};
204static struct ctl_table_root sysctl_table_root = {
205 .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list),
206 .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry),
207};
208
209static struct ctl_table kern_table[]; 199static struct ctl_table kern_table[];
210static struct ctl_table vm_table[]; 200static struct ctl_table vm_table[];
211static struct ctl_table fs_table[]; 201static struct ctl_table fs_table[];
@@ -222,7 +212,7 @@ int sysctl_legacy_va_layout;
222 212
223/* The default sysctl tables: */ 213/* The default sysctl tables: */
224 214
225static struct ctl_table root_table[] = { 215static struct ctl_table sysctl_base_table[] = {
226 { 216 {
227 .procname = "kernel", 217 .procname = "kernel",
228 .mode = 0555, 218 .mode = 0555,
@@ -1559,490 +1549,12 @@ static struct ctl_table dev_table[] = {
1559 { } 1549 { }
1560}; 1550};
1561 1551
1562static DEFINE_SPINLOCK(sysctl_lock); 1552int __init sysctl_init(void)
1563
1564/* called under sysctl_lock */
1565static int use_table(struct ctl_table_header *p)
1566{ 1553{
1567 if (unlikely(p->unregistering)) 1554 register_sysctl_table(sysctl_base_table);
1568 return 0;
1569 p->used++;
1570 return 1;
1571}
1572
1573/* called under sysctl_lock */
1574static void unuse_table(struct ctl_table_header *p)
1575{
1576 if (!--p->used)
1577 if (unlikely(p->unregistering))
1578 complete(p->unregistering);
1579}
1580
1581/* called under sysctl_lock, will reacquire if has to wait */
1582static void start_unregistering(struct ctl_table_header *p)
1583{
1584 /*
1585 * if p->used is 0, nobody will ever touch that entry again;
1586 * we'll eliminate all paths to it before dropping sysctl_lock
1587 */
1588 if (unlikely(p->used)) {
1589 struct completion wait;
1590 init_completion(&wait);
1591 p->unregistering = &wait;
1592 spin_unlock(&sysctl_lock);
1593 wait_for_completion(&wait);
1594 spin_lock(&sysctl_lock);
1595 } else {
1596 /* anything non-NULL; we'll never dereference it */
1597 p->unregistering = ERR_PTR(-EINVAL);
1598 }
1599 /*
1600 * do not remove from the list until nobody holds it; walking the
1601 * list in do_sysctl() relies on that.
1602 */
1603 list_del_init(&p->ctl_entry);
1604}
1605
1606void sysctl_head_get(struct ctl_table_header *head)
1607{
1608 spin_lock(&sysctl_lock);
1609 head->count++;
1610 spin_unlock(&sysctl_lock);
1611}
1612
1613void sysctl_head_put(struct ctl_table_header *head)
1614{
1615 spin_lock(&sysctl_lock);
1616 if (!--head->count)
1617 kfree_rcu(head, rcu);
1618 spin_unlock(&sysctl_lock);
1619}
1620
1621struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
1622{
1623 if (!head)
1624 BUG();
1625 spin_lock(&sysctl_lock);
1626 if (!use_table(head))
1627 head = ERR_PTR(-ENOENT);
1628 spin_unlock(&sysctl_lock);
1629 return head;
1630}
1631
1632void sysctl_head_finish(struct ctl_table_header *head)
1633{
1634 if (!head)
1635 return;
1636 spin_lock(&sysctl_lock);
1637 unuse_table(head);
1638 spin_unlock(&sysctl_lock);
1639}
1640
1641static struct ctl_table_set *
1642lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
1643{
1644 struct ctl_table_set *set = &root->default_set;
1645 if (root->lookup)
1646 set = root->lookup(root, namespaces);
1647 return set;
1648}
1649
1650static struct list_head *
1651lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces)
1652{
1653 struct ctl_table_set *set = lookup_header_set(root, namespaces);
1654 return &set->list;
1655}
1656
1657struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces,
1658 struct ctl_table_header *prev)
1659{
1660 struct ctl_table_root *root;
1661 struct list_head *header_list;
1662 struct ctl_table_header *head;
1663 struct list_head *tmp;
1664
1665 spin_lock(&sysctl_lock);
1666 if (prev) {
1667 head = prev;
1668 tmp = &prev->ctl_entry;
1669 unuse_table(prev);
1670 goto next;
1671 }
1672 tmp = &root_table_header.ctl_entry;
1673 for (;;) {
1674 head = list_entry(tmp, struct ctl_table_header, ctl_entry);
1675
1676 if (!use_table(head))
1677 goto next;
1678 spin_unlock(&sysctl_lock);
1679 return head;
1680 next:
1681 root = head->root;
1682 tmp = tmp->next;
1683 header_list = lookup_header_list(root, namespaces);
1684 if (tmp != header_list)
1685 continue;
1686
1687 do {
1688 root = list_entry(root->root_list.next,
1689 struct ctl_table_root, root_list);
1690 if (root == &sysctl_table_root)
1691 goto out;
1692 header_list = lookup_header_list(root, namespaces);
1693 } while (list_empty(header_list));
1694 tmp = header_list->next;
1695 }
1696out:
1697 spin_unlock(&sysctl_lock);
1698 return NULL;
1699}
1700
1701struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev)
1702{
1703 return __sysctl_head_next(current->nsproxy, prev);
1704}
1705
1706void register_sysctl_root(struct ctl_table_root *root)
1707{
1708 spin_lock(&sysctl_lock);
1709 list_add_tail(&root->root_list, &sysctl_table_root.root_list);
1710 spin_unlock(&sysctl_lock);
1711}
1712
1713/*
1714 * sysctl_perm does NOT grant the superuser all rights automatically, because
1715 * some sysctl variables are readonly even to root.
1716 */
1717
1718static int test_perm(int mode, int op)
1719{
1720 if (!current_euid())
1721 mode >>= 6;
1722 else if (in_egroup_p(0))
1723 mode >>= 3;
1724 if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0)
1725 return 0;
1726 return -EACCES;
1727}
1728
1729int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
1730{
1731 int mode;
1732
1733 if (root->permissions)
1734 mode = root->permissions(root, current->nsproxy, table);
1735 else
1736 mode = table->mode;
1737
1738 return test_perm(mode, op);
1739}
1740
1741static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
1742{
1743 for (; table->procname; table++) {
1744 table->parent = parent;
1745 if (table->child)
1746 sysctl_set_parent(table, table->child);
1747 }
1748}
1749
1750static __init int sysctl_init(void)
1751{
1752 sysctl_set_parent(NULL, root_table);
1753#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1754 sysctl_check_table(current->nsproxy, root_table);
1755#endif
1756 return 0; 1555 return 0;
1757} 1556}
1758 1557
1759core_initcall(sysctl_init);
1760
1761static struct ctl_table *is_branch_in(struct ctl_table *branch,
1762 struct ctl_table *table)
1763{
1764 struct ctl_table *p;
1765 const char *s = branch->procname;
1766
1767 /* branch should have named subdirectory as its first element */
1768 if (!s || !branch->child)
1769 return NULL;
1770
1771 /* ... and nothing else */
1772 if (branch[1].procname)
1773 return NULL;
1774
1775 /* table should contain subdirectory with the same name */
1776 for (p = table; p->procname; p++) {
1777 if (!p->child)
1778 continue;
1779 if (p->procname && strcmp(p->procname, s) == 0)
1780 return p;
1781 }
1782 return NULL;
1783}
1784
1785/* see if attaching q to p would be an improvement */
1786static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
1787{
1788 struct ctl_table *to = p->ctl_table, *by = q->ctl_table;
1789 struct ctl_table *next;
1790 int is_better = 0;
1791 int not_in_parent = !p->attached_by;
1792
1793 while ((next = is_branch_in(by, to)) != NULL) {
1794 if (by == q->attached_by)
1795 is_better = 1;
1796 if (to == p->attached_by)
1797 not_in_parent = 1;
1798 by = by->child;
1799 to = next->child;
1800 }
1801
1802 if (is_better && not_in_parent) {
1803 q->attached_by = by;
1804 q->attached_to = to;
1805 q->parent = p;
1806 }
1807}
1808
1809/**
1810 * __register_sysctl_paths - register a sysctl hierarchy
1811 * @root: List of sysctl headers to register on
1812 * @namespaces: Data to compute which lists of sysctl entries are visible
1813 * @path: The path to the directory the sysctl table is in.
1814 * @table: the top-level table structure
1815 *
1816 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1817 * array. A completely 0 filled entry terminates the table.
1818 *
1819 * The members of the &struct ctl_table structure are used as follows:
1820 *
1821 * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
1822 * enter a sysctl file
1823 *
1824 * data - a pointer to data for use by proc_handler
1825 *
1826 * maxlen - the maximum size in bytes of the data
1827 *
1828 * mode - the file permissions for the /proc/sys file, and for sysctl(2)
1829 *
1830 * child - a pointer to the child sysctl table if this entry is a directory, or
1831 * %NULL.
1832 *
1833 * proc_handler - the text handler routine (described below)
1834 *
1835 * de - for internal use by the sysctl routines
1836 *
1837 * extra1, extra2 - extra pointers usable by the proc handler routines
1838 *
1839 * Leaf nodes in the sysctl tree will be represented by a single file
1840 * under /proc; non-leaf nodes will be represented by directories.
1841 *
1842 * sysctl(2) can automatically manage read and write requests through
1843 * the sysctl table. The data and maxlen fields of the ctl_table
1844 * struct enable minimal validation of the values being written to be
1845 * performed, and the mode field allows minimal authentication.
1846 *
1847 * There must be a proc_handler routine for any terminal nodes
1848 * mirrored under /proc/sys (non-terminals are handled by a built-in
1849 * directory handler). Several default handlers are available to
1850 * cover common cases -
1851 *
1852 * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(),
1853 * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(),
1854 * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax()
1855 *
1856 * It is the handler's job to read the input buffer from user memory
1857 * and process it. The handler should return 0 on success.
1858 *
1859 * This routine returns %NULL on a failure to register, and a pointer
1860 * to the table header on success.
1861 */
1862struct ctl_table_header *__register_sysctl_paths(
1863 struct ctl_table_root *root,
1864 struct nsproxy *namespaces,
1865 const struct ctl_path *path, struct ctl_table *table)
1866{
1867 struct ctl_table_header *header;
1868 struct ctl_table *new, **prevp;
1869 unsigned int n, npath;
1870 struct ctl_table_set *set;
1871
1872 /* Count the path components */
1873 for (npath = 0; path[npath].procname; ++npath)
1874 ;
1875
1876 /*
1877 * For each path component, allocate a 2-element ctl_table array.
1878 * The first array element will be filled with the sysctl entry
1879 * for this, the second will be the sentinel (procname == 0).
1880 *
1881 * We allocate everything in one go so that we don't have to
1882 * worry about freeing additional memory in unregister_sysctl_table.
1883 */
1884 header = kzalloc(sizeof(struct ctl_table_header) +
1885 (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL);
1886 if (!header)
1887 return NULL;
1888
1889 new = (struct ctl_table *) (header + 1);
1890
1891 /* Now connect the dots */
1892 prevp = &header->ctl_table;
1893 for (n = 0; n < npath; ++n, ++path) {
1894 /* Copy the procname */
1895 new->procname = path->procname;
1896 new->mode = 0555;
1897
1898 *prevp = new;
1899 prevp = &new->child;
1900
1901 new += 2;
1902 }
1903 *prevp = table;
1904 header->ctl_table_arg = table;
1905
1906 INIT_LIST_HEAD(&header->ctl_entry);
1907 header->used = 0;
1908 header->unregistering = NULL;
1909 header->root = root;
1910 sysctl_set_parent(NULL, header->ctl_table);
1911 header->count = 1;
1912#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1913 if (sysctl_check_table(namespaces, header->ctl_table)) {
1914 kfree(header);
1915 return NULL;
1916 }
1917#endif
1918 spin_lock(&sysctl_lock);
1919 header->set = lookup_header_set(root, namespaces);
1920 header->attached_by = header->ctl_table;
1921 header->attached_to = root_table;
1922 header->parent = &root_table_header;
1923 for (set = header->set; set; set = set->parent) {
1924 struct ctl_table_header *p;
1925 list_for_each_entry(p, &set->list, ctl_entry) {
1926 if (p->unregistering)
1927 continue;
1928 try_attach(p, header);
1929 }
1930 }
1931 header->parent->count++;
1932 list_add_tail(&header->ctl_entry, &header->set->list);
1933 spin_unlock(&sysctl_lock);
1934
1935 return header;
1936}
1937
1938/**
1939 * register_sysctl_table_path - register a sysctl table hierarchy
1940 * @path: The path to the directory the sysctl table is in.
1941 * @table: the top-level table structure
1942 *
1943 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1944 * array. A completely 0 filled entry terminates the table.
1945 *
1946 * See __register_sysctl_paths for more details.
1947 */
1948struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
1949 struct ctl_table *table)
1950{
1951 return __register_sysctl_paths(&sysctl_table_root, current->nsproxy,
1952 path, table);
1953}
1954
1955/**
1956 * register_sysctl_table - register a sysctl table hierarchy
1957 * @table: the top-level table structure
1958 *
1959 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1960 * array. A completely 0 filled entry terminates the table.
1961 *
1962 * See register_sysctl_paths for more details.
1963 */
1964struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
1965{
1966 static const struct ctl_path null_path[] = { {} };
1967
1968 return register_sysctl_paths(null_path, table);
1969}
1970
1971/**
1972 * unregister_sysctl_table - unregister a sysctl table hierarchy
1973 * @header: the header returned from register_sysctl_table
1974 *
1975 * Unregisters the sysctl table and all children. proc entries may not
1976 * actually be removed until they are no longer used by anyone.
1977 */
1978void unregister_sysctl_table(struct ctl_table_header * header)
1979{
1980 might_sleep();
1981
1982 if (header == NULL)
1983 return;
1984
1985 spin_lock(&sysctl_lock);
1986 start_unregistering(header);
1987 if (!--header->parent->count) {
1988 WARN_ON(1);
1989 kfree_rcu(header->parent, rcu);
1990 }
1991 if (!--header->count)
1992 kfree_rcu(header, rcu);
1993 spin_unlock(&sysctl_lock);
1994}
1995
1996int sysctl_is_seen(struct ctl_table_header *p)
1997{
1998 struct ctl_table_set *set = p->set;
1999 int res;
2000 spin_lock(&sysctl_lock);
2001 if (p->unregistering)
2002 res = 0;
2003 else if (!set->is_seen)
2004 res = 1;
2005 else
2006 res = set->is_seen(set);
2007 spin_unlock(&sysctl_lock);
2008 return res;
2009}
2010
2011void setup_sysctl_set(struct ctl_table_set *p,
2012 struct ctl_table_set *parent,
2013 int (*is_seen)(struct ctl_table_set *))
2014{
2015 INIT_LIST_HEAD(&p->list);
2016 p->parent = parent ? parent : &sysctl_table_root.default_set;
2017 p->is_seen = is_seen;
2018}
2019
2020#else /* !CONFIG_SYSCTL */
2021struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
2022{
2023 return NULL;
2024}
2025
2026struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
2027 struct ctl_table *table)
2028{
2029 return NULL;
2030}
2031
2032void unregister_sysctl_table(struct ctl_table_header * table)
2033{
2034}
2035
2036void setup_sysctl_set(struct ctl_table_set *p,
2037 struct ctl_table_set *parent,
2038 int (*is_seen)(struct ctl_table_set *))
2039{
2040}
2041
2042void sysctl_head_put(struct ctl_table_header *head)
2043{
2044}
2045
2046#endif /* CONFIG_SYSCTL */ 1558#endif /* CONFIG_SYSCTL */
2047 1559
2048/* 1560/*
@@ -2884,9 +2396,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
2884 } 2396 }
2885 } 2397 }
2886 2398
2887 while (val_a <= val_b) 2399 bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1);
2888 set_bit(val_a++, tmp_bitmap);
2889
2890 first = 0; 2400 first = 0;
2891 proc_skip_char(&kbuf, &left, '\n'); 2401 proc_skip_char(&kbuf, &left, '\n');
2892 } 2402 }
@@ -2929,8 +2439,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
2929 if (*ppos) 2439 if (*ppos)
2930 bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len); 2440 bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len);
2931 else 2441 else
2932 memcpy(bitmap, tmp_bitmap, 2442 bitmap_copy(bitmap, tmp_bitmap, bitmap_len);
2933 BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long));
2934 } 2443 }
2935 kfree(tmp_bitmap); 2444 kfree(tmp_bitmap);
2936 *lenp -= left; 2445 *lenp -= left;
@@ -3008,6 +2517,3 @@ EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
3008EXPORT_SYMBOL(proc_dostring); 2517EXPORT_SYMBOL(proc_dostring);
3009EXPORT_SYMBOL(proc_doulongvec_minmax); 2518EXPORT_SYMBOL(proc_doulongvec_minmax);
3010EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); 2519EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
3011EXPORT_SYMBOL(register_sysctl_table);
3012EXPORT_SYMBOL(register_sysctl_paths);
3013EXPORT_SYMBOL(unregister_sysctl_table);
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
deleted file mode 100644
index 362da653813d..000000000000
--- a/kernel/sysctl_check.c
+++ /dev/null
@@ -1,160 +0,0 @@
1#include <linux/stat.h>
2#include <linux/sysctl.h>
3#include "../fs/xfs/xfs_sysctl.h"
4#include <linux/sunrpc/debug.h>
5#include <linux/string.h>
6#include <net/ip_vs.h>
7
8
9static int sysctl_depth(struct ctl_table *table)
10{
11 struct ctl_table *tmp;
12 int depth;
13
14 depth = 0;
15 for (tmp = table; tmp->parent; tmp = tmp->parent)
16 depth++;
17
18 return depth;
19}
20
21static struct ctl_table *sysctl_parent(struct ctl_table *table, int n)
22{
23 int i;
24
25 for (i = 0; table && i < n; i++)
26 table = table->parent;
27
28 return table;
29}
30
31
32static void sysctl_print_path(struct ctl_table *table)
33{
34 struct ctl_table *tmp;
35 int depth, i;
36 depth = sysctl_depth(table);
37 if (table->procname) {
38 for (i = depth; i >= 0; i--) {
39 tmp = sysctl_parent(table, i);
40 printk("/%s", tmp->procname?tmp->procname:"");
41 }
42 }
43 printk(" ");
44}
45
46static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
47 struct ctl_table *table)
48{
49 struct ctl_table_header *head;
50 struct ctl_table *ref, *test;
51 int depth, cur_depth;
52
53 depth = sysctl_depth(table);
54
55 for (head = __sysctl_head_next(namespaces, NULL); head;
56 head = __sysctl_head_next(namespaces, head)) {
57 cur_depth = depth;
58 ref = head->ctl_table;
59repeat:
60 test = sysctl_parent(table, cur_depth);
61 for (; ref->procname; ref++) {
62 int match = 0;
63 if (cur_depth && !ref->child)
64 continue;
65
66 if (test->procname && ref->procname &&
67 (strcmp(test->procname, ref->procname) == 0))
68 match++;
69
70 if (match) {
71 if (cur_depth != 0) {
72 cur_depth--;
73 ref = ref->child;
74 goto repeat;
75 }
76 goto out;
77 }
78 }
79 }
80 ref = NULL;
81out:
82 sysctl_head_finish(head);
83 return ref;
84}
85
86static void set_fail(const char **fail, struct ctl_table *table, const char *str)
87{
88 if (*fail) {
89 printk(KERN_ERR "sysctl table check failed: ");
90 sysctl_print_path(table);
91 printk(" %s\n", *fail);
92 dump_stack();
93 }
94 *fail = str;
95}
96
97static void sysctl_check_leaf(struct nsproxy *namespaces,
98 struct ctl_table *table, const char **fail)
99{
100 struct ctl_table *ref;
101
102 ref = sysctl_check_lookup(namespaces, table);
103 if (ref && (ref != table))
104 set_fail(fail, table, "Sysctl already exists");
105}
106
107int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
108{
109 int error = 0;
110 for (; table->procname; table++) {
111 const char *fail = NULL;
112
113 if (table->parent) {
114 if (!table->parent->procname)
115 set_fail(&fail, table, "Parent without procname");
116 }
117 if (table->child) {
118 if (table->data)
119 set_fail(&fail, table, "Directory with data?");
120 if (table->maxlen)
121 set_fail(&fail, table, "Directory with maxlen?");
122 if ((table->mode & (S_IRUGO|S_IXUGO)) != table->mode)
123 set_fail(&fail, table, "Writable sysctl directory");
124 if (table->proc_handler)
125 set_fail(&fail, table, "Directory with proc_handler");
126 if (table->extra1)
127 set_fail(&fail, table, "Directory with extra1");
128 if (table->extra2)
129 set_fail(&fail, table, "Directory with extra2");
130 } else {
131 if ((table->proc_handler == proc_dostring) ||
132 (table->proc_handler == proc_dointvec) ||
133 (table->proc_handler == proc_dointvec_minmax) ||
134 (table->proc_handler == proc_dointvec_jiffies) ||
135 (table->proc_handler == proc_dointvec_userhz_jiffies) ||
136 (table->proc_handler == proc_dointvec_ms_jiffies) ||
137 (table->proc_handler == proc_doulongvec_minmax) ||
138 (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
139 if (!table->data)
140 set_fail(&fail, table, "No data");
141 if (!table->maxlen)
142 set_fail(&fail, table, "No maxlen");
143 }
144#ifdef CONFIG_PROC_SYSCTL
145 if (!table->proc_handler)
146 set_fail(&fail, table, "No proc_handler");
147#endif
148 sysctl_check_leaf(namespaces, table, &fail);
149 }
150 if (table->mode > 0777)
151 set_fail(&fail, table, "bogus .mode");
152 if (fail) {
153 set_fail(&fail, table, NULL);
154 error = -EINVAL;
155 }
156 if (table->child)
157 error |= sysctl_check_table(namespaces, table->child);
158 }
159 return error;
160}
diff --git a/kernel/time.c b/kernel/time.c
index 73e416db0a1e..ba744cf80696 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -163,7 +163,6 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
163 return error; 163 return error;
164 164
165 if (tz) { 165 if (tz) {
166 /* SMP safe, global irq locking makes it work. */
167 sys_tz = *tz; 166 sys_tz = *tz;
168 update_vsyscall_tz(); 167 update_vsyscall_tz();
169 if (firsttime) { 168 if (firsttime) {
@@ -173,12 +172,7 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
173 } 172 }
174 } 173 }
175 if (tv) 174 if (tv)
176 {
177 /* SMP safe, again the code in arch/foo/time.c should
178 * globally block out interrupts when it runs.
179 */
180 return do_settimeofday(tv); 175 return do_settimeofday(tv);
181 }
182 return 0; 176 return 0;
183} 177}
184 178
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 8a46f5d64504..8a538c55fc7b 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -96,6 +96,11 @@ static int alarmtimer_rtc_add_device(struct device *dev,
96 return 0; 96 return 0;
97} 97}
98 98
99static inline void alarmtimer_rtc_timer_init(void)
100{
101 rtc_timer_init(&rtctimer, NULL, NULL);
102}
103
99static struct class_interface alarmtimer_rtc_interface = { 104static struct class_interface alarmtimer_rtc_interface = {
100 .add_dev = &alarmtimer_rtc_add_device, 105 .add_dev = &alarmtimer_rtc_add_device,
101}; 106};
@@ -117,6 +122,7 @@ static inline struct rtc_device *alarmtimer_get_rtcdev(void)
117#define rtcdev (NULL) 122#define rtcdev (NULL)
118static inline int alarmtimer_rtc_interface_setup(void) { return 0; } 123static inline int alarmtimer_rtc_interface_setup(void) { return 0; }
119static inline void alarmtimer_rtc_interface_remove(void) { } 124static inline void alarmtimer_rtc_interface_remove(void) { }
125static inline void alarmtimer_rtc_timer_init(void) { }
120#endif 126#endif
121 127
122/** 128/**
@@ -783,6 +789,8 @@ static int __init alarmtimer_init(void)
783 .nsleep = alarm_timer_nsleep, 789 .nsleep = alarm_timer_nsleep,
784 }; 790 };
785 791
792 alarmtimer_rtc_timer_init();
793
786 posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); 794 posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock);
787 posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); 795 posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock);
788 796
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index a45ca167ab24..c9583382141a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -500,7 +500,7 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
500{ 500{
501 u64 ret; 501 u64 ret;
502 /* 502 /*
503 * We won't try to correct for more then 11% adjustments (110,000 ppm), 503 * We won't try to correct for more than 11% adjustments (110,000 ppm),
504 */ 504 */
505 ret = (u64)cs->mult * 11; 505 ret = (u64)cs->mult * 11;
506 do_div(ret,100); 506 do_div(ret,100);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index f6117a4c7cb8..f03fd83b170b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -22,17 +22,18 @@
22 * NTP timekeeping variables: 22 * NTP timekeeping variables:
23 */ 23 */
24 24
25DEFINE_SPINLOCK(ntp_lock);
26
27
25/* USER_HZ period (usecs): */ 28/* USER_HZ period (usecs): */
26unsigned long tick_usec = TICK_USEC; 29unsigned long tick_usec = TICK_USEC;
27 30
28/* ACTHZ period (nsecs): */ 31/* ACTHZ period (nsecs): */
29unsigned long tick_nsec; 32unsigned long tick_nsec;
30 33
31u64 tick_length; 34static u64 tick_length;
32static u64 tick_length_base; 35static u64 tick_length_base;
33 36
34static struct hrtimer leap_timer;
35
36#define MAX_TICKADJ 500LL /* usecs */ 37#define MAX_TICKADJ 500LL /* usecs */
37#define MAX_TICKADJ_SCALED \ 38#define MAX_TICKADJ_SCALED \
38 (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) 39 (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
@@ -49,7 +50,7 @@ static struct hrtimer leap_timer;
49static int time_state = TIME_OK; 50static int time_state = TIME_OK;
50 51
51/* clock status bits: */ 52/* clock status bits: */
52int time_status = STA_UNSYNC; 53static int time_status = STA_UNSYNC;
53 54
54/* TAI offset (secs): */ 55/* TAI offset (secs): */
55static long time_tai; 56static long time_tai;
@@ -133,7 +134,7 @@ static inline void pps_reset_freq_interval(void)
133/** 134/**
134 * pps_clear - Clears the PPS state variables 135 * pps_clear - Clears the PPS state variables
135 * 136 *
136 * Must be called while holding a write on the xtime_lock 137 * Must be called while holding a write on the ntp_lock
137 */ 138 */
138static inline void pps_clear(void) 139static inline void pps_clear(void)
139{ 140{
@@ -149,7 +150,7 @@ static inline void pps_clear(void)
149 * the last PPS signal. When it reaches 0, indicate that PPS signal is 150 * the last PPS signal. When it reaches 0, indicate that PPS signal is
150 * missing. 151 * missing.
151 * 152 *
152 * Must be called while holding a write on the xtime_lock 153 * Must be called while holding a write on the ntp_lock
153 */ 154 */
154static inline void pps_dec_valid(void) 155static inline void pps_dec_valid(void)
155{ 156{
@@ -233,6 +234,17 @@ static inline void pps_fill_timex(struct timex *txc)
233 234
234#endif /* CONFIG_NTP_PPS */ 235#endif /* CONFIG_NTP_PPS */
235 236
237
238/**
239 * ntp_synced - Returns 1 if the NTP status is not UNSYNC
240 *
241 */
242static inline int ntp_synced(void)
243{
244 return !(time_status & STA_UNSYNC);
245}
246
247
236/* 248/*
237 * NTP methods: 249 * NTP methods:
238 */ 250 */
@@ -275,7 +287,7 @@ static inline s64 ntp_update_offset_fll(s64 offset64, long secs)
275 287
276 time_status |= STA_MODE; 288 time_status |= STA_MODE;
277 289
278 return div_s64(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); 290 return div64_long(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs);
279} 291}
280 292
281static void ntp_update_offset(long offset) 293static void ntp_update_offset(long offset)
@@ -330,11 +342,13 @@ static void ntp_update_offset(long offset)
330 342
331/** 343/**
332 * ntp_clear - Clears the NTP state variables 344 * ntp_clear - Clears the NTP state variables
333 *
334 * Must be called while holding a write on the xtime_lock
335 */ 345 */
336void ntp_clear(void) 346void ntp_clear(void)
337{ 347{
348 unsigned long flags;
349
350 spin_lock_irqsave(&ntp_lock, flags);
351
338 time_adjust = 0; /* stop active adjtime() */ 352 time_adjust = 0; /* stop active adjtime() */
339 time_status |= STA_UNSYNC; 353 time_status |= STA_UNSYNC;
340 time_maxerror = NTP_PHASE_LIMIT; 354 time_maxerror = NTP_PHASE_LIMIT;
@@ -347,63 +361,81 @@ void ntp_clear(void)
347 361
348 /* Clear PPS state variables */ 362 /* Clear PPS state variables */
349 pps_clear(); 363 pps_clear();
364 spin_unlock_irqrestore(&ntp_lock, flags);
365
366}
367
368
369u64 ntp_tick_length(void)
370{
371 unsigned long flags;
372 s64 ret;
373
374 spin_lock_irqsave(&ntp_lock, flags);
375 ret = tick_length;
376 spin_unlock_irqrestore(&ntp_lock, flags);
377 return ret;
350} 378}
351 379
380
352/* 381/*
353 * Leap second processing. If in leap-insert state at the end of the 382 * this routine handles the overflow of the microsecond field
354 * day, the system clock is set back one second; if in leap-delete 383 *
355 * state, the system clock is set ahead one second. 384 * The tricky bits of code to handle the accurate clock support
385 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
386 * They were originally developed for SUN and DEC kernels.
387 * All the kudos should go to Dave for this stuff.
388 *
389 * Also handles leap second processing, and returns leap offset
356 */ 390 */
357static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) 391int second_overflow(unsigned long secs)
358{ 392{
359 enum hrtimer_restart res = HRTIMER_NORESTART; 393 s64 delta;
394 int leap = 0;
395 unsigned long flags;
360 396
361 write_seqlock(&xtime_lock); 397 spin_lock_irqsave(&ntp_lock, flags);
362 398
399 /*
400 * Leap second processing. If in leap-insert state at the end of the
401 * day, the system clock is set back one second; if in leap-delete
402 * state, the system clock is set ahead one second.
403 */
363 switch (time_state) { 404 switch (time_state) {
364 case TIME_OK: 405 case TIME_OK:
406 if (time_status & STA_INS)
407 time_state = TIME_INS;
408 else if (time_status & STA_DEL)
409 time_state = TIME_DEL;
365 break; 410 break;
366 case TIME_INS: 411 case TIME_INS:
367 timekeeping_leap_insert(-1); 412 if (secs % 86400 == 0) {
368 time_state = TIME_OOP; 413 leap = -1;
369 printk(KERN_NOTICE 414 time_state = TIME_OOP;
370 "Clock: inserting leap second 23:59:60 UTC\n"); 415 printk(KERN_NOTICE
371 hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC); 416 "Clock: inserting leap second 23:59:60 UTC\n");
372 res = HRTIMER_RESTART; 417 }
373 break; 418 break;
374 case TIME_DEL: 419 case TIME_DEL:
375 timekeeping_leap_insert(1); 420 if ((secs + 1) % 86400 == 0) {
376 time_tai--; 421 leap = 1;
377 time_state = TIME_WAIT; 422 time_tai--;
378 printk(KERN_NOTICE 423 time_state = TIME_WAIT;
379 "Clock: deleting leap second 23:59:59 UTC\n"); 424 printk(KERN_NOTICE
425 "Clock: deleting leap second 23:59:59 UTC\n");
426 }
380 break; 427 break;
381 case TIME_OOP: 428 case TIME_OOP:
382 time_tai++; 429 time_tai++;
383 time_state = TIME_WAIT; 430 time_state = TIME_WAIT;
384 /* fall through */ 431 break;
432
385 case TIME_WAIT: 433 case TIME_WAIT:
386 if (!(time_status & (STA_INS | STA_DEL))) 434 if (!(time_status & (STA_INS | STA_DEL)))
387 time_state = TIME_OK; 435 time_state = TIME_OK;
388 break; 436 break;
389 } 437 }
390 438
391 write_sequnlock(&xtime_lock);
392
393 return res;
394}
395
396/*
397 * this routine handles the overflow of the microsecond field
398 *
399 * The tricky bits of code to handle the accurate clock support
400 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
401 * They were originally developed for SUN and DEC kernels.
402 * All the kudos should go to Dave for this stuff.
403 */
404void second_overflow(void)
405{
406 s64 delta;
407 439
408 /* Bump the maxerror field */ 440 /* Bump the maxerror field */
409 time_maxerror += MAXFREQ / NSEC_PER_USEC; 441 time_maxerror += MAXFREQ / NSEC_PER_USEC;
@@ -423,30 +455,34 @@ void second_overflow(void)
423 pps_dec_valid(); 455 pps_dec_valid();
424 456
425 if (!time_adjust) 457 if (!time_adjust)
426 return; 458 goto out;
427 459
428 if (time_adjust > MAX_TICKADJ) { 460 if (time_adjust > MAX_TICKADJ) {
429 time_adjust -= MAX_TICKADJ; 461 time_adjust -= MAX_TICKADJ;
430 tick_length += MAX_TICKADJ_SCALED; 462 tick_length += MAX_TICKADJ_SCALED;
431 return; 463 goto out;
432 } 464 }
433 465
434 if (time_adjust < -MAX_TICKADJ) { 466 if (time_adjust < -MAX_TICKADJ) {
435 time_adjust += MAX_TICKADJ; 467 time_adjust += MAX_TICKADJ;
436 tick_length -= MAX_TICKADJ_SCALED; 468 tick_length -= MAX_TICKADJ_SCALED;
437 return; 469 goto out;
438 } 470 }
439 471
440 tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) 472 tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ)
441 << NTP_SCALE_SHIFT; 473 << NTP_SCALE_SHIFT;
442 time_adjust = 0; 474 time_adjust = 0;
475
476
477
478out:
479 spin_unlock_irqrestore(&ntp_lock, flags);
480
481 return leap;
443} 482}
444 483
445#ifdef CONFIG_GENERIC_CMOS_UPDATE 484#ifdef CONFIG_GENERIC_CMOS_UPDATE
446 485
447/* Disable the cmos update - used by virtualization and embedded */
448int no_sync_cmos_clock __read_mostly;
449
450static void sync_cmos_clock(struct work_struct *work); 486static void sync_cmos_clock(struct work_struct *work);
451 487
452static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); 488static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
@@ -493,35 +529,13 @@ static void sync_cmos_clock(struct work_struct *work)
493 529
494static void notify_cmos_timer(void) 530static void notify_cmos_timer(void)
495{ 531{
496 if (!no_sync_cmos_clock) 532 schedule_delayed_work(&sync_cmos_work, 0);
497 schedule_delayed_work(&sync_cmos_work, 0);
498} 533}
499 534
500#else 535#else
501static inline void notify_cmos_timer(void) { } 536static inline void notify_cmos_timer(void) { }
502#endif 537#endif
503 538
504/*
505 * Start the leap seconds timer:
506 */
507static inline void ntp_start_leap_timer(struct timespec *ts)
508{
509 long now = ts->tv_sec;
510
511 if (time_status & STA_INS) {
512 time_state = TIME_INS;
513 now += 86400 - now % 86400;
514 hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
515
516 return;
517 }
518
519 if (time_status & STA_DEL) {
520 time_state = TIME_DEL;
521 now += 86400 - (now + 1) % 86400;
522 hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
523 }
524}
525 539
526/* 540/*
527 * Propagate a new txc->status value into the NTP state: 541 * Propagate a new txc->status value into the NTP state:
@@ -546,22 +560,6 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
546 time_status &= STA_RONLY; 560 time_status &= STA_RONLY;
547 time_status |= txc->status & ~STA_RONLY; 561 time_status |= txc->status & ~STA_RONLY;
548 562
549 switch (time_state) {
550 case TIME_OK:
551 ntp_start_leap_timer(ts);
552 break;
553 case TIME_INS:
554 case TIME_DEL:
555 time_state = TIME_OK;
556 ntp_start_leap_timer(ts);
557 case TIME_WAIT:
558 if (!(time_status & (STA_INS | STA_DEL)))
559 time_state = TIME_OK;
560 break;
561 case TIME_OOP:
562 hrtimer_restart(&leap_timer);
563 break;
564 }
565} 563}
566/* 564/*
567 * Called with the xtime lock held, so we can access and modify 565 * Called with the xtime lock held, so we can access and modify
@@ -643,9 +641,6 @@ int do_adjtimex(struct timex *txc)
643 (txc->tick < 900000/USER_HZ || 641 (txc->tick < 900000/USER_HZ ||
644 txc->tick > 1100000/USER_HZ)) 642 txc->tick > 1100000/USER_HZ))
645 return -EINVAL; 643 return -EINVAL;
646
647 if (txc->modes & ADJ_STATUS && time_state != TIME_OK)
648 hrtimer_cancel(&leap_timer);
649 } 644 }
650 645
651 if (txc->modes & ADJ_SETOFFSET) { 646 if (txc->modes & ADJ_SETOFFSET) {
@@ -663,7 +658,7 @@ int do_adjtimex(struct timex *txc)
663 658
664 getnstimeofday(&ts); 659 getnstimeofday(&ts);
665 660
666 write_seqlock_irq(&xtime_lock); 661 spin_lock_irq(&ntp_lock);
667 662
668 if (txc->modes & ADJ_ADJTIME) { 663 if (txc->modes & ADJ_ADJTIME) {
669 long save_adjust = time_adjust; 664 long save_adjust = time_adjust;
@@ -705,7 +700,7 @@ int do_adjtimex(struct timex *txc)
705 /* fill PPS status fields */ 700 /* fill PPS status fields */
706 pps_fill_timex(txc); 701 pps_fill_timex(txc);
707 702
708 write_sequnlock_irq(&xtime_lock); 703 spin_unlock_irq(&ntp_lock);
709 704
710 txc->time.tv_sec = ts.tv_sec; 705 txc->time.tv_sec = ts.tv_sec;
711 txc->time.tv_usec = ts.tv_nsec; 706 txc->time.tv_usec = ts.tv_nsec;
@@ -903,7 +898,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
903 898
904 pts_norm = pps_normalize_ts(*phase_ts); 899 pts_norm = pps_normalize_ts(*phase_ts);
905 900
906 write_seqlock_irqsave(&xtime_lock, flags); 901 spin_lock_irqsave(&ntp_lock, flags);
907 902
908 /* clear the error bits, they will be set again if needed */ 903 /* clear the error bits, they will be set again if needed */
909 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); 904 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
@@ -916,7 +911,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
916 * just start the frequency interval */ 911 * just start the frequency interval */
917 if (unlikely(pps_fbase.tv_sec == 0)) { 912 if (unlikely(pps_fbase.tv_sec == 0)) {
918 pps_fbase = *raw_ts; 913 pps_fbase = *raw_ts;
919 write_sequnlock_irqrestore(&xtime_lock, flags); 914 spin_unlock_irqrestore(&ntp_lock, flags);
920 return; 915 return;
921 } 916 }
922 917
@@ -931,7 +926,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
931 time_status |= STA_PPSJITTER; 926 time_status |= STA_PPSJITTER;
932 /* restart the frequency calibration interval */ 927 /* restart the frequency calibration interval */
933 pps_fbase = *raw_ts; 928 pps_fbase = *raw_ts;
934 write_sequnlock_irqrestore(&xtime_lock, flags); 929 spin_unlock_irqrestore(&ntp_lock, flags);
935 pr_err("hardpps: PPSJITTER: bad pulse\n"); 930 pr_err("hardpps: PPSJITTER: bad pulse\n");
936 return; 931 return;
937 } 932 }
@@ -948,7 +943,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
948 943
949 hardpps_update_phase(pts_norm.nsec); 944 hardpps_update_phase(pts_norm.nsec);
950 945
951 write_sequnlock_irqrestore(&xtime_lock, flags); 946 spin_unlock_irqrestore(&ntp_lock, flags);
952} 947}
953EXPORT_SYMBOL(hardpps); 948EXPORT_SYMBOL(hardpps);
954 949
@@ -967,6 +962,4 @@ __setup("ntp_tick_adj=", ntp_tick_adj_setup);
967void __init ntp_init(void) 962void __init ntp_init(void)
968{ 963{
969 ntp_clear(); 964 ntp_clear();
970 hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
971 leap_timer.function = ntp_leap_second;
972} 965}
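
With the leap-second hrtimer gone, second_overflow() now takes the current wall-clock seconds and returns the leap offset to apply, and ntp_tick_length() is the locked accessor for the now-static tick_length. A hedged sketch of how a timekeeping caller might consume the new interface (illustrative only, not the actual kernel/time/timekeeping.c code):

	/*
	 * Called once per accumulated wall-clock second, under the
	 * timekeeper's own lock rather than the old xtime_lock.
	 */
	static void accumulate_second(struct timespec *xtime)
	{
		int leap;

		xtime->tv_sec++;
		leap = second_overflow(xtime->tv_sec);	/* -1, 0 or +1 */
		xtime->tv_sec += leap;			/* insert/delete leap second */
	}
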
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index fd4a7b1625a2..e883f57a3cd3 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -575,11 +575,15 @@ void tick_broadcast_switch_to_oneshot(void)
575 unsigned long flags; 575 unsigned long flags;
576 576
577 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 577 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
578 if (cpumask_empty(tick_get_broadcast_mask()))
579 goto end;
578 580
579 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; 581 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;
580 bc = tick_broadcast_device.evtdev; 582 bc = tick_broadcast_device.evtdev;
581 if (bc) 583 if (bc)
582 tick_broadcast_setup_oneshot(bc); 584 tick_broadcast_setup_oneshot(bc);
585
586end:
583 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 587 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
584} 588}
585 589
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 7656642e4b8e..3526038f2836 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -182,11 +182,7 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now)
182 182
183static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) 183static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
184{ 184{
185 ktime_t now; 185 ktime_t now = ktime_get();
186
187 now = ktime_get();
188
189 update_ts_time_stats(cpu, ts, now, NULL);
190 186
191 ts->idle_entrytime = now; 187 ts->idle_entrytime = now;
192 ts->idle_active = 1; 188 ts->idle_active = 1;
@@ -562,20 +558,21 @@ void tick_nohz_idle_exit(void)
562 558
563 local_irq_disable(); 559 local_irq_disable();
564 560
565 if (ts->idle_active || (ts->inidle && ts->tick_stopped)) 561 WARN_ON_ONCE(!ts->inidle);
562
563 ts->inidle = 0;
564
565 if (ts->idle_active || ts->tick_stopped)
566 now = ktime_get(); 566 now = ktime_get();
567 567
568 if (ts->idle_active) 568 if (ts->idle_active)
569 tick_nohz_stop_idle(cpu, now); 569 tick_nohz_stop_idle(cpu, now);
570 570
571 if (!ts->inidle || !ts->tick_stopped) { 571 if (!ts->tick_stopped) {
572 ts->inidle = 0;
573 local_irq_enable(); 572 local_irq_enable();
574 return; 573 return;
575 } 574 }
576 575
577 ts->inidle = 0;
578
579 /* Update jiffies first */ 576 /* Update jiffies first */
580 select_nohz_load_balancer(0); 577 select_nohz_load_balancer(0);
581 tick_do_update_jiffies64(now); 578 tick_do_update_jiffies64(now);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 0c6358186401..d66b21308f7c 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -25,6 +25,8 @@
25struct timekeeper { 25struct timekeeper {
26 /* Current clocksource used for timekeeping. */ 26 /* Current clocksource used for timekeeping. */
27 struct clocksource *clock; 27 struct clocksource *clock;
28 /* NTP adjusted clock multiplier */
29 u32 mult;
28 /* The shift value of the current clocksource. */ 30 /* The shift value of the current clocksource. */
29 int shift; 31 int shift;
30 32
@@ -45,12 +47,47 @@ struct timekeeper {
45 /* Shift conversion between clock shifted nano seconds and 47 /* Shift conversion between clock shifted nano seconds and
46 * ntp shifted nano seconds. */ 48 * ntp shifted nano seconds. */
47 int ntp_error_shift; 49 int ntp_error_shift;
48 /* NTP adjusted clock multiplier */ 50
49 u32 mult; 51 /* The current time */
52 struct timespec xtime;
53 /*
54 * wall_to_monotonic is what we need to add to xtime (or xtime corrected
55 * for sub jiffie times) to get to monotonic time. Monotonic is pegged
56 * at zero at system boot time, so wall_to_monotonic will be negative,
57 * however, we will ALWAYS keep the tv_nsec part positive so we can use
58 * the usual normalization.
59 *
60 * wall_to_monotonic is moved after resume from suspend for the
61 * monotonic time not to jump. We need to add total_sleep_time to
62 * wall_to_monotonic to get the real boot based time offset.
63 *
64 * - wall_to_monotonic is no longer the boot time, getboottime must be
65 * used instead.
66 */
67 struct timespec wall_to_monotonic;
68 /* time spent in suspend */
69 struct timespec total_sleep_time;
70 /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */
71 struct timespec raw_time;
72
73 /* Seqlock for all timekeeper values */
74 seqlock_t lock;
50}; 75};
51 76
52static struct timekeeper timekeeper; 77static struct timekeeper timekeeper;
53 78
79/*
80 * This read-write spinlock protects us from races in SMP while
81 * playing with xtime.
82 */
83__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
84
85
86/* flag for if timekeeping is suspended */
87int __read_mostly timekeeping_suspended;
88
89
90
54/** 91/**
55 * timekeeper_setup_internals - Set up internals to use clocksource clock. 92 * timekeeper_setup_internals - Set up internals to use clocksource clock.
56 * 93 *
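
With xtime, wall_to_monotonic, total_sleep_time and raw_time folded into struct timekeeper next to its own seqlock, every reader in the rest of this patch becomes a read_seqbegin()/read_seqretry() retry loop over timekeeper.lock. A minimal reader-side sketch of that pattern, using made-up demo_* names rather than the file-static timekeeper:

#include <linux/seqlock.h>
#include <linux/time.h>

struct demo_clock {
	seqlock_t		lock;
	struct timespec		xtime;
	struct timespec		wall_to_monotonic;
};

/* snapshot a consistent monotonic time without taking the write lock */
static struct timespec demo_get_monotonic(struct demo_clock *c)
{
	struct timespec ts, tomono;
	unsigned long seq;

	do {
		seq = read_seqbegin(&c->lock);
		ts = c->xtime;
		tomono = c->wall_to_monotonic;
	} while (read_seqretry(&c->lock, seq));

	set_normalized_timespec(&ts, ts.tv_sec + tomono.tv_sec,
				ts.tv_nsec + tomono.tv_nsec);
	return ts;
}
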
@@ -135,49 +172,18 @@ static inline s64 timekeeping_get_ns_raw(void)
135 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 172 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
136} 173}
137 174
138/* 175/* must hold write on timekeeper.lock */
139 * This read-write spinlock protects us from races in SMP while 176static void timekeeping_update(bool clearntp)
140 * playing with xtime.
141 */
142__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
143
144
145/*
146 * The current time
147 * wall_to_monotonic is what we need to add to xtime (or xtime corrected
148 * for sub jiffie times) to get to monotonic time. Monotonic is pegged
149 * at zero at system boot time, so wall_to_monotonic will be negative,
150 * however, we will ALWAYS keep the tv_nsec part positive so we can use
151 * the usual normalization.
152 *
153 * wall_to_monotonic is moved after resume from suspend for the monotonic
154 * time not to jump. We need to add total_sleep_time to wall_to_monotonic
155 * to get the real boot based time offset.
156 *
157 * - wall_to_monotonic is no longer the boot time, getboottime must be
158 * used instead.
159 */
160static struct timespec xtime __attribute__ ((aligned (16)));
161static struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
162static struct timespec total_sleep_time;
163
164/*
165 * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
166 */
167static struct timespec raw_time;
168
169/* flag for if timekeeping is suspended */
170int __read_mostly timekeeping_suspended;
171
172/* must hold xtime_lock */
173void timekeeping_leap_insert(int leapsecond)
174{ 177{
175 xtime.tv_sec += leapsecond; 178 if (clearntp) {
176 wall_to_monotonic.tv_sec -= leapsecond; 179 timekeeper.ntp_error = 0;
177 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, 180 ntp_clear();
178 timekeeper.mult); 181 }
182 update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic,
183 timekeeper.clock, timekeeper.mult);
179} 184}
180 185
186
181/** 187/**
182 * timekeeping_forward_now - update clock to the current time 188 * timekeeping_forward_now - update clock to the current time
183 * 189 *
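
timekeeping_update() above factors the "reset NTP error, ntp_clear(), refresh the vsyscall data" tail out of the setters, and its comment requires the caller to hold timekeeper.lock for writing. A sketch of the writer pattern the later hunks (do_settimeofday, timekeeping_inject_offset, timekeeping_inject_sleeptime) follow, written as if it lived inside timekeeping.c next to the other setters; demo_set_time is an illustrative name:

static int demo_set_time(const struct timespec *tv)
{
	unsigned long flags;

	write_seqlock_irqsave(&timekeeper.lock, flags);

	timekeeping_forward_now();	/* account elapsed cycles first */
	timekeeper.xtime = *tv;		/* then change the time */
	timekeeping_update(true);	/* clear NTP state, push vsyscall data */

	write_sequnlock_irqrestore(&timekeeper.lock, flags);

	clock_was_set();		/* tell hrtimers the clock moved */
	return 0;
}
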
@@ -202,10 +208,10 @@ static void timekeeping_forward_now(void)
202 /* If arch requires, add in gettimeoffset() */ 208 /* If arch requires, add in gettimeoffset() */
203 nsec += arch_gettimeoffset(); 209 nsec += arch_gettimeoffset();
204 210
205 timespec_add_ns(&xtime, nsec); 211 timespec_add_ns(&timekeeper.xtime, nsec);
206 212
207 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 213 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
208 timespec_add_ns(&raw_time, nsec); 214 timespec_add_ns(&timekeeper.raw_time, nsec);
209} 215}
210 216
211/** 217/**
@@ -222,15 +228,15 @@ void getnstimeofday(struct timespec *ts)
222 WARN_ON(timekeeping_suspended); 228 WARN_ON(timekeeping_suspended);
223 229
224 do { 230 do {
225 seq = read_seqbegin(&xtime_lock); 231 seq = read_seqbegin(&timekeeper.lock);
226 232
227 *ts = xtime; 233 *ts = timekeeper.xtime;
228 nsecs = timekeeping_get_ns(); 234 nsecs = timekeeping_get_ns();
229 235
230 /* If arch requires, add in gettimeoffset() */ 236 /* If arch requires, add in gettimeoffset() */
231 nsecs += arch_gettimeoffset(); 237 nsecs += arch_gettimeoffset();
232 238
233 } while (read_seqretry(&xtime_lock, seq)); 239 } while (read_seqretry(&timekeeper.lock, seq));
234 240
235 timespec_add_ns(ts, nsecs); 241 timespec_add_ns(ts, nsecs);
236} 242}
@@ -245,14 +251,16 @@ ktime_t ktime_get(void)
245 WARN_ON(timekeeping_suspended); 251 WARN_ON(timekeeping_suspended);
246 252
247 do { 253 do {
248 seq = read_seqbegin(&xtime_lock); 254 seq = read_seqbegin(&timekeeper.lock);
249 secs = xtime.tv_sec + wall_to_monotonic.tv_sec; 255 secs = timekeeper.xtime.tv_sec +
250 nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; 256 timekeeper.wall_to_monotonic.tv_sec;
257 nsecs = timekeeper.xtime.tv_nsec +
258 timekeeper.wall_to_monotonic.tv_nsec;
251 nsecs += timekeeping_get_ns(); 259 nsecs += timekeeping_get_ns();
252 /* If arch requires, add in gettimeoffset() */ 260 /* If arch requires, add in gettimeoffset() */
253 nsecs += arch_gettimeoffset(); 261 nsecs += arch_gettimeoffset();
254 262
255 } while (read_seqretry(&xtime_lock, seq)); 263 } while (read_seqretry(&timekeeper.lock, seq));
256 /* 264 /*
257 * Use ktime_set/ktime_add_ns to create a proper ktime on 265 * Use ktime_set/ktime_add_ns to create a proper ktime on
258 * 32-bit architectures without CONFIG_KTIME_SCALAR. 266 * 32-bit architectures without CONFIG_KTIME_SCALAR.
@@ -278,14 +286,14 @@ void ktime_get_ts(struct timespec *ts)
278 WARN_ON(timekeeping_suspended); 286 WARN_ON(timekeeping_suspended);
279 287
280 do { 288 do {
281 seq = read_seqbegin(&xtime_lock); 289 seq = read_seqbegin(&timekeeper.lock);
282 *ts = xtime; 290 *ts = timekeeper.xtime;
283 tomono = wall_to_monotonic; 291 tomono = timekeeper.wall_to_monotonic;
284 nsecs = timekeeping_get_ns(); 292 nsecs = timekeeping_get_ns();
285 /* If arch requires, add in gettimeoffset() */ 293 /* If arch requires, add in gettimeoffset() */
286 nsecs += arch_gettimeoffset(); 294 nsecs += arch_gettimeoffset();
287 295
288 } while (read_seqretry(&xtime_lock, seq)); 296 } while (read_seqretry(&timekeeper.lock, seq));
289 297
290 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, 298 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
291 ts->tv_nsec + tomono.tv_nsec + nsecs); 299 ts->tv_nsec + tomono.tv_nsec + nsecs);
@@ -313,10 +321,10 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
313 do { 321 do {
314 u32 arch_offset; 322 u32 arch_offset;
315 323
316 seq = read_seqbegin(&xtime_lock); 324 seq = read_seqbegin(&timekeeper.lock);
317 325
318 *ts_raw = raw_time; 326 *ts_raw = timekeeper.raw_time;
319 *ts_real = xtime; 327 *ts_real = timekeeper.xtime;
320 328
321 nsecs_raw = timekeeping_get_ns_raw(); 329 nsecs_raw = timekeeping_get_ns_raw();
322 nsecs_real = timekeeping_get_ns(); 330 nsecs_real = timekeeping_get_ns();
@@ -326,7 +334,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
326 nsecs_raw += arch_offset; 334 nsecs_raw += arch_offset;
327 nsecs_real += arch_offset; 335 nsecs_real += arch_offset;
328 336
329 } while (read_seqretry(&xtime_lock, seq)); 337 } while (read_seqretry(&timekeeper.lock, seq));
330 338
331 timespec_add_ns(ts_raw, nsecs_raw); 339 timespec_add_ns(ts_raw, nsecs_raw);
332 timespec_add_ns(ts_real, nsecs_real); 340 timespec_add_ns(ts_real, nsecs_real);
@@ -365,23 +373,19 @@ int do_settimeofday(const struct timespec *tv)
365 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) 373 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
366 return -EINVAL; 374 return -EINVAL;
367 375
368 write_seqlock_irqsave(&xtime_lock, flags); 376 write_seqlock_irqsave(&timekeeper.lock, flags);
369 377
370 timekeeping_forward_now(); 378 timekeeping_forward_now();
371 379
372 ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; 380 ts_delta.tv_sec = tv->tv_sec - timekeeper.xtime.tv_sec;
373 ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; 381 ts_delta.tv_nsec = tv->tv_nsec - timekeeper.xtime.tv_nsec;
374 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta); 382 timekeeper.wall_to_monotonic =
383 timespec_sub(timekeeper.wall_to_monotonic, ts_delta);
375 384
376 xtime = *tv; 385 timekeeper.xtime = *tv;
377 386 timekeeping_update(true);
378 timekeeper.ntp_error = 0;
379 ntp_clear();
380 387
381 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, 388 write_sequnlock_irqrestore(&timekeeper.lock, flags);
382 timekeeper.mult);
383
384 write_sequnlock_irqrestore(&xtime_lock, flags);
385 389
386 /* signal hrtimers about time change */ 390 /* signal hrtimers about time change */
387 clock_was_set(); 391 clock_was_set();
@@ -405,20 +409,17 @@ int timekeeping_inject_offset(struct timespec *ts)
405 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) 409 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
406 return -EINVAL; 410 return -EINVAL;
407 411
408 write_seqlock_irqsave(&xtime_lock, flags); 412 write_seqlock_irqsave(&timekeeper.lock, flags);
409 413
410 timekeeping_forward_now(); 414 timekeeping_forward_now();
411 415
412 xtime = timespec_add(xtime, *ts); 416 timekeeper.xtime = timespec_add(timekeeper.xtime, *ts);
413 wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts); 417 timekeeper.wall_to_monotonic =
414 418 timespec_sub(timekeeper.wall_to_monotonic, *ts);
415 timekeeper.ntp_error = 0;
416 ntp_clear();
417 419
418 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, 420 timekeeping_update(true);
419 timekeeper.mult);
420 421
421 write_sequnlock_irqrestore(&xtime_lock, flags); 422 write_sequnlock_irqrestore(&timekeeper.lock, flags);
422 423
423 /* signal hrtimers about time change */ 424 /* signal hrtimers about time change */
424 clock_was_set(); 425 clock_was_set();
@@ -435,9 +436,12 @@ EXPORT_SYMBOL(timekeeping_inject_offset);
435static int change_clocksource(void *data) 436static int change_clocksource(void *data)
436{ 437{
437 struct clocksource *new, *old; 438 struct clocksource *new, *old;
439 unsigned long flags;
438 440
439 new = (struct clocksource *) data; 441 new = (struct clocksource *) data;
440 442
443 write_seqlock_irqsave(&timekeeper.lock, flags);
444
441 timekeeping_forward_now(); 445 timekeeping_forward_now();
442 if (!new->enable || new->enable(new) == 0) { 446 if (!new->enable || new->enable(new) == 0) {
443 old = timekeeper.clock; 447 old = timekeeper.clock;
@@ -445,6 +449,10 @@ static int change_clocksource(void *data)
445 if (old->disable) 449 if (old->disable)
446 old->disable(old); 450 old->disable(old);
447 } 451 }
452 timekeeping_update(true);
453
454 write_sequnlock_irqrestore(&timekeeper.lock, flags);
455
448 return 0; 456 return 0;
449} 457}
450 458
@@ -490,11 +498,11 @@ void getrawmonotonic(struct timespec *ts)
490 s64 nsecs; 498 s64 nsecs;
491 499
492 do { 500 do {
493 seq = read_seqbegin(&xtime_lock); 501 seq = read_seqbegin(&timekeeper.lock);
494 nsecs = timekeeping_get_ns_raw(); 502 nsecs = timekeeping_get_ns_raw();
495 *ts = raw_time; 503 *ts = timekeeper.raw_time;
496 504
497 } while (read_seqretry(&xtime_lock, seq)); 505 } while (read_seqretry(&timekeeper.lock, seq));
498 506
499 timespec_add_ns(ts, nsecs); 507 timespec_add_ns(ts, nsecs);
500} 508}
@@ -510,24 +518,30 @@ int timekeeping_valid_for_hres(void)
510 int ret; 518 int ret;
511 519
512 do { 520 do {
513 seq = read_seqbegin(&xtime_lock); 521 seq = read_seqbegin(&timekeeper.lock);
514 522
515 ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; 523 ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
516 524
517 } while (read_seqretry(&xtime_lock, seq)); 525 } while (read_seqretry(&timekeeper.lock, seq));
518 526
519 return ret; 527 return ret;
520} 528}
521 529
522/** 530/**
523 * timekeeping_max_deferment - Returns max time the clocksource can be deferred 531 * timekeeping_max_deferment - Returns max time the clocksource can be deferred
524 *
525 * Caller must observe xtime_lock via read_seqbegin/read_seqretry to
526 * ensure that the clocksource does not change!
527 */ 532 */
528u64 timekeeping_max_deferment(void) 533u64 timekeeping_max_deferment(void)
529{ 534{
530 return timekeeper.clock->max_idle_ns; 535 unsigned long seq;
536 u64 ret;
537 do {
538 seq = read_seqbegin(&timekeeper.lock);
539
540 ret = timekeeper.clock->max_idle_ns;
541
542 } while (read_seqretry(&timekeeper.lock, seq));
543
544 return ret;
531} 545}
532 546
533/** 547/**
@@ -572,28 +586,29 @@ void __init timekeeping_init(void)
572 read_persistent_clock(&now); 586 read_persistent_clock(&now);
573 read_boot_clock(&boot); 587 read_boot_clock(&boot);
574 588
575 write_seqlock_irqsave(&xtime_lock, flags); 589 seqlock_init(&timekeeper.lock);
576 590
577 ntp_init(); 591 ntp_init();
578 592
593 write_seqlock_irqsave(&timekeeper.lock, flags);
579 clock = clocksource_default_clock(); 594 clock = clocksource_default_clock();
580 if (clock->enable) 595 if (clock->enable)
581 clock->enable(clock); 596 clock->enable(clock);
582 timekeeper_setup_internals(clock); 597 timekeeper_setup_internals(clock);
583 598
584 xtime.tv_sec = now.tv_sec; 599 timekeeper.xtime.tv_sec = now.tv_sec;
585 xtime.tv_nsec = now.tv_nsec; 600 timekeeper.xtime.tv_nsec = now.tv_nsec;
586 raw_time.tv_sec = 0; 601 timekeeper.raw_time.tv_sec = 0;
587 raw_time.tv_nsec = 0; 602 timekeeper.raw_time.tv_nsec = 0;
588 if (boot.tv_sec == 0 && boot.tv_nsec == 0) { 603 if (boot.tv_sec == 0 && boot.tv_nsec == 0) {
589 boot.tv_sec = xtime.tv_sec; 604 boot.tv_sec = timekeeper.xtime.tv_sec;
590 boot.tv_nsec = xtime.tv_nsec; 605 boot.tv_nsec = timekeeper.xtime.tv_nsec;
591 } 606 }
592 set_normalized_timespec(&wall_to_monotonic, 607 set_normalized_timespec(&timekeeper.wall_to_monotonic,
593 -boot.tv_sec, -boot.tv_nsec); 608 -boot.tv_sec, -boot.tv_nsec);
594 total_sleep_time.tv_sec = 0; 609 timekeeper.total_sleep_time.tv_sec = 0;
595 total_sleep_time.tv_nsec = 0; 610 timekeeper.total_sleep_time.tv_nsec = 0;
596 write_sequnlock_irqrestore(&xtime_lock, flags); 611 write_sequnlock_irqrestore(&timekeeper.lock, flags);
597} 612}
598 613
599/* time in seconds when suspend began */ 614/* time in seconds when suspend began */
@@ -614,9 +629,11 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta)
614 return; 629 return;
615 } 630 }
616 631
617 xtime = timespec_add(xtime, *delta); 632 timekeeper.xtime = timespec_add(timekeeper.xtime, *delta);
618 wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta); 633 timekeeper.wall_to_monotonic =
619 total_sleep_time = timespec_add(total_sleep_time, *delta); 634 timespec_sub(timekeeper.wall_to_monotonic, *delta);
635 timekeeper.total_sleep_time = timespec_add(
636 timekeeper.total_sleep_time, *delta);
620} 637}
621 638
622 639
@@ -640,17 +657,15 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
640 if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) 657 if (!(ts.tv_sec == 0 && ts.tv_nsec == 0))
641 return; 658 return;
642 659
643 write_seqlock_irqsave(&xtime_lock, flags); 660 write_seqlock_irqsave(&timekeeper.lock, flags);
661
644 timekeeping_forward_now(); 662 timekeeping_forward_now();
645 663
646 __timekeeping_inject_sleeptime(delta); 664 __timekeeping_inject_sleeptime(delta);
647 665
648 timekeeper.ntp_error = 0; 666 timekeeping_update(true);
649 ntp_clear();
650 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
651 timekeeper.mult);
652 667
653 write_sequnlock_irqrestore(&xtime_lock, flags); 668 write_sequnlock_irqrestore(&timekeeper.lock, flags);
654 669
655 /* signal hrtimers about time change */ 670 /* signal hrtimers about time change */
656 clock_was_set(); 671 clock_was_set();
@@ -673,7 +688,7 @@ static void timekeeping_resume(void)
673 688
674 clocksource_resume(); 689 clocksource_resume();
675 690
676 write_seqlock_irqsave(&xtime_lock, flags); 691 write_seqlock_irqsave(&timekeeper.lock, flags);
677 692
678 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { 693 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
679 ts = timespec_sub(ts, timekeeping_suspend_time); 694 ts = timespec_sub(ts, timekeeping_suspend_time);
@@ -683,7 +698,7 @@ static void timekeeping_resume(void)
683 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 698 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
684 timekeeper.ntp_error = 0; 699 timekeeper.ntp_error = 0;
685 timekeeping_suspended = 0; 700 timekeeping_suspended = 0;
686 write_sequnlock_irqrestore(&xtime_lock, flags); 701 write_sequnlock_irqrestore(&timekeeper.lock, flags);
687 702
688 touch_softlockup_watchdog(); 703 touch_softlockup_watchdog();
689 704
@@ -701,7 +716,7 @@ static int timekeeping_suspend(void)
701 716
702 read_persistent_clock(&timekeeping_suspend_time); 717 read_persistent_clock(&timekeeping_suspend_time);
703 718
704 write_seqlock_irqsave(&xtime_lock, flags); 719 write_seqlock_irqsave(&timekeeper.lock, flags);
705 timekeeping_forward_now(); 720 timekeeping_forward_now();
706 timekeeping_suspended = 1; 721 timekeeping_suspended = 1;
707 722
@@ -711,7 +726,7 @@ static int timekeeping_suspend(void)
711 * try to compensate so the difference in system time 726 * try to compensate so the difference in system time
712 * and persistent_clock time stays close to constant. 727 * and persistent_clock time stays close to constant.
713 */ 728 */
714 delta = timespec_sub(xtime, timekeeping_suspend_time); 729 delta = timespec_sub(timekeeper.xtime, timekeeping_suspend_time);
715 delta_delta = timespec_sub(delta, old_delta); 730 delta_delta = timespec_sub(delta, old_delta);
716 if (abs(delta_delta.tv_sec) >= 2) { 731 if (abs(delta_delta.tv_sec) >= 2) {
717 /* 732 /*
@@ -724,7 +739,7 @@ static int timekeeping_suspend(void)
724 timekeeping_suspend_time = 739 timekeeping_suspend_time =
725 timespec_add(timekeeping_suspend_time, delta_delta); 740 timespec_add(timekeeping_suspend_time, delta_delta);
726 } 741 }
727 write_sequnlock_irqrestore(&xtime_lock, flags); 742 write_sequnlock_irqrestore(&timekeeper.lock, flags);
728 743
729 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 744 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
730 clocksource_suspend(); 745 clocksource_suspend();
@@ -775,7 +790,7 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
775 * Now calculate the error in (1 << look_ahead) ticks, but first 790 * Now calculate the error in (1 << look_ahead) ticks, but first
776 * remove the single look ahead already included in the error. 791 * remove the single look ahead already included in the error.
777 */ 792 */
778 tick_error = tick_length >> (timekeeper.ntp_error_shift + 1); 793 tick_error = ntp_tick_length() >> (timekeeper.ntp_error_shift + 1);
779 tick_error -= timekeeper.xtime_interval >> 1; 794 tick_error -= timekeeper.xtime_interval >> 1;
780 error = ((error - tick_error) >> look_ahead) + tick_error; 795 error = ((error - tick_error) >> look_ahead) + tick_error;
781 796
@@ -807,7 +822,7 @@ static void timekeeping_adjust(s64 offset)
807 int adj; 822 int adj;
808 823
809 /* 824 /*
810 * The point of this is to check if the error is greater then half 825 * The point of this is to check if the error is greater than half
811 * an interval. 826 * an interval.
812 * 827 *
813 * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. 828 * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs.
@@ -815,7 +830,7 @@ static void timekeeping_adjust(s64 offset)
815 * Note we subtract one in the shift, so that error is really error*2. 830 * Note we subtract one in the shift, so that error is really error*2.
816 * This "saves" dividing(shifting) interval twice, but keeps the 831 * This "saves" dividing(shifting) interval twice, but keeps the
817 * (error > interval) comparison as still measuring if error is 832 * (error > interval) comparison as still measuring if error is
818 * larger then half an interval. 833 * larger than half an interval.
819 * 834 *
820 * Note: It does not "save" on aggravation when reading the code. 835 * Note: It does not "save" on aggravation when reading the code.
821 */ 836 */
@@ -823,7 +838,7 @@ static void timekeeping_adjust(s64 offset)
823 if (error > interval) { 838 if (error > interval) {
824 /* 839 /*
825 * We now divide error by 4(via shift), which checks if 840 * We now divide error by 4(via shift), which checks if
826 * the error is greater then twice the interval. 841 * the error is greater than twice the interval.
827 * If it is greater, we need a bigadjust, if its smaller, 842 * If it is greater, we need a bigadjust, if its smaller,
828 * we can adjust by 1. 843 * we can adjust by 1.
829 */ 844 */
@@ -854,13 +869,15 @@ static void timekeeping_adjust(s64 offset)
854 } else /* No adjustment needed */ 869 } else /* No adjustment needed */
855 return; 870 return;
856 871
857 WARN_ONCE(timekeeper.clock->maxadj && 872 if (unlikely(timekeeper.clock->maxadj &&
858 (timekeeper.mult + adj > timekeeper.clock->mult + 873 (timekeeper.mult + adj >
859 timekeeper.clock->maxadj), 874 timekeeper.clock->mult + timekeeper.clock->maxadj))) {
860 "Adjusting %s more then 11%% (%ld vs %ld)\n", 875 printk_once(KERN_WARNING
876 "Adjusting %s more than 11%% (%ld vs %ld)\n",
861 timekeeper.clock->name, (long)timekeeper.mult + adj, 877 timekeeper.clock->name, (long)timekeeper.mult + adj,
862 (long)timekeeper.clock->mult + 878 (long)timekeeper.clock->mult +
863 timekeeper.clock->maxadj); 879 timekeeper.clock->maxadj);
880 }
864 /* 881 /*
865 * So the following can be confusing. 882 * So the following can be confusing.
866 * 883 *
@@ -932,7 +949,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
932 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; 949 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
933 u64 raw_nsecs; 950 u64 raw_nsecs;
934 951
935 /* If the offset is smaller then a shifted interval, do nothing */ 952 /* If the offset is smaller than a shifted interval, do nothing */
936 if (offset < timekeeper.cycle_interval<<shift) 953 if (offset < timekeeper.cycle_interval<<shift)
937 return offset; 954 return offset;
938 955
@@ -942,23 +959,25 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
942 959
943 timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; 960 timekeeper.xtime_nsec += timekeeper.xtime_interval << shift;
944 while (timekeeper.xtime_nsec >= nsecps) { 961 while (timekeeper.xtime_nsec >= nsecps) {
962 int leap;
945 timekeeper.xtime_nsec -= nsecps; 963 timekeeper.xtime_nsec -= nsecps;
946 xtime.tv_sec++; 964 timekeeper.xtime.tv_sec++;
947 second_overflow(); 965 leap = second_overflow(timekeeper.xtime.tv_sec);
966 timekeeper.xtime.tv_sec += leap;
948 } 967 }
949 968
950 /* Accumulate raw time */ 969 /* Accumulate raw time */
951 raw_nsecs = timekeeper.raw_interval << shift; 970 raw_nsecs = timekeeper.raw_interval << shift;
952 raw_nsecs += raw_time.tv_nsec; 971 raw_nsecs += timekeeper.raw_time.tv_nsec;
953 if (raw_nsecs >= NSEC_PER_SEC) { 972 if (raw_nsecs >= NSEC_PER_SEC) {
954 u64 raw_secs = raw_nsecs; 973 u64 raw_secs = raw_nsecs;
955 raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); 974 raw_nsecs = do_div(raw_secs, NSEC_PER_SEC);
956 raw_time.tv_sec += raw_secs; 975 timekeeper.raw_time.tv_sec += raw_secs;
957 } 976 }
958 raw_time.tv_nsec = raw_nsecs; 977 timekeeper.raw_time.tv_nsec = raw_nsecs;
959 978
960 /* Accumulate error between NTP and clock interval */ 979 /* Accumulate error between NTP and clock interval */
961 timekeeper.ntp_error += tick_length << shift; 980 timekeeper.ntp_error += ntp_tick_length() << shift;
962 timekeeper.ntp_error -= 981 timekeeper.ntp_error -=
963 (timekeeper.xtime_interval + timekeeper.xtime_remainder) << 982 (timekeeper.xtime_interval + timekeeper.xtime_remainder) <<
964 (timekeeper.ntp_error_shift + shift); 983 (timekeeper.ntp_error_shift + shift);
@@ -970,17 +989,19 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
970/** 989/**
971 * update_wall_time - Uses the current clocksource to increment the wall time 990 * update_wall_time - Uses the current clocksource to increment the wall time
972 * 991 *
973 * Called from the timer interrupt, must hold a write on xtime_lock.
974 */ 992 */
975static void update_wall_time(void) 993static void update_wall_time(void)
976{ 994{
977 struct clocksource *clock; 995 struct clocksource *clock;
978 cycle_t offset; 996 cycle_t offset;
979 int shift = 0, maxshift; 997 int shift = 0, maxshift;
998 unsigned long flags;
999
1000 write_seqlock_irqsave(&timekeeper.lock, flags);
980 1001
981 /* Make sure we're fully resumed: */ 1002 /* Make sure we're fully resumed: */
982 if (unlikely(timekeeping_suspended)) 1003 if (unlikely(timekeeping_suspended))
983 return; 1004 goto out;
984 1005
985 clock = timekeeper.clock; 1006 clock = timekeeper.clock;
986 1007
@@ -989,20 +1010,21 @@ static void update_wall_time(void)
989#else 1010#else
990 offset = (clock->read(clock) - clock->cycle_last) & clock->mask; 1011 offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
991#endif 1012#endif
992 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; 1013 timekeeper.xtime_nsec = (s64)timekeeper.xtime.tv_nsec <<
1014 timekeeper.shift;
993 1015
994 /* 1016 /*
995 * With NO_HZ we may have to accumulate many cycle_intervals 1017 * With NO_HZ we may have to accumulate many cycle_intervals
996 * (think "ticks") worth of time at once. To do this efficiently, 1018 * (think "ticks") worth of time at once. To do this efficiently,
997 * we calculate the largest doubling multiple of cycle_intervals 1019 * we calculate the largest doubling multiple of cycle_intervals
998 * that is smaller then the offset. We then accumulate that 1020 * that is smaller than the offset. We then accumulate that
999 * chunk in one go, and then try to consume the next smaller 1021 * chunk in one go, and then try to consume the next smaller
1000 * doubled multiple. 1022 * doubled multiple.
1001 */ 1023 */
1002 shift = ilog2(offset) - ilog2(timekeeper.cycle_interval); 1024 shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);
1003 shift = max(0, shift); 1025 shift = max(0, shift);
1004 /* Bound shift to one less then what overflows tick_length */ 1026 /* Bound shift to one less than what overflows tick_length */
1005 maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1; 1027 maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
1006 shift = min(shift, maxshift); 1028 shift = min(shift, maxshift);
1007 while (offset >= timekeeper.cycle_interval) { 1029 while (offset >= timekeeper.cycle_interval) {
1008 offset = logarithmic_accumulation(offset, shift); 1030 offset = logarithmic_accumulation(offset, shift);
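
The comment above describes the NO_HZ accumulation strategy: pick the largest power-of-two multiple of cycle_interval that still fits in the pending offset, consume it in one call, then fall back to smaller chunks, with shift capped so ntp_tick_length() << shift cannot overflow 64 bits. A standalone sketch of that walk with made-up names and no timekeeper state (the real loop also applies the maxshift cap computed above):

#include <linux/kernel.h>
#include <linux/log2.h>

/* consume a large offset in power-of-two chunks of cycle_interval */
static void demo_accumulate(u64 offset, u64 cycle_interval)
{
	int shift = ilog2(offset) - ilog2(cycle_interval);

	shift = max(0, shift);
	while (offset >= cycle_interval) {
		if (offset >= (cycle_interval << shift)) {
			offset -= cycle_interval << shift; /* 2^shift intervals at once */
			continue;
		}
		shift--;	/* chunk no longer fits, halve it and retry */
	}
}
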
@@ -1040,24 +1062,30 @@ static void update_wall_time(void)
1040 * Store full nanoseconds into xtime after rounding it up and 1062 * Store full nanoseconds into xtime after rounding it up and
1041 * add the remainder to the error difference. 1063 * add the remainder to the error difference.
1042 */ 1064 */
1043 xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1; 1065 timekeeper.xtime.tv_nsec = ((s64)timekeeper.xtime_nsec >>
1044 timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift; 1066 timekeeper.shift) + 1;
1067 timekeeper.xtime_nsec -= (s64)timekeeper.xtime.tv_nsec <<
1068 timekeeper.shift;
1045 timekeeper.ntp_error += timekeeper.xtime_nsec << 1069 timekeeper.ntp_error += timekeeper.xtime_nsec <<
1046 timekeeper.ntp_error_shift; 1070 timekeeper.ntp_error_shift;
1047 1071
1048 /* 1072 /*
1049 * Finally, make sure that after the rounding 1073 * Finally, make sure that after the rounding
1050 * xtime.tv_nsec isn't larger then NSEC_PER_SEC 1074 * xtime.tv_nsec isn't larger than NSEC_PER_SEC
1051 */ 1075 */
1052 if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) { 1076 if (unlikely(timekeeper.xtime.tv_nsec >= NSEC_PER_SEC)) {
1053 xtime.tv_nsec -= NSEC_PER_SEC; 1077 int leap;
1054 xtime.tv_sec++; 1078 timekeeper.xtime.tv_nsec -= NSEC_PER_SEC;
1055 second_overflow(); 1079 timekeeper.xtime.tv_sec++;
1080 leap = second_overflow(timekeeper.xtime.tv_sec);
1081 timekeeper.xtime.tv_sec += leap;
1056 } 1082 }
1057 1083
1058 /* check to see if there is a new clocksource to use */ 1084 timekeeping_update(false);
1059 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, 1085
1060 timekeeper.mult); 1086out:
1087 write_sequnlock_irqrestore(&timekeeper.lock, flags);
1088
1061} 1089}
1062 1090
1063/** 1091/**
@@ -1074,8 +1102,10 @@ static void update_wall_time(void)
1074void getboottime(struct timespec *ts) 1102void getboottime(struct timespec *ts)
1075{ 1103{
1076 struct timespec boottime = { 1104 struct timespec boottime = {
1077 .tv_sec = wall_to_monotonic.tv_sec + total_sleep_time.tv_sec, 1105 .tv_sec = timekeeper.wall_to_monotonic.tv_sec +
1078 .tv_nsec = wall_to_monotonic.tv_nsec + total_sleep_time.tv_nsec 1106 timekeeper.total_sleep_time.tv_sec,
1107 .tv_nsec = timekeeper.wall_to_monotonic.tv_nsec +
1108 timekeeper.total_sleep_time.tv_nsec
1079 }; 1109 };
1080 1110
1081 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); 1111 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
@@ -1101,13 +1131,13 @@ void get_monotonic_boottime(struct timespec *ts)
1101 WARN_ON(timekeeping_suspended); 1131 WARN_ON(timekeeping_suspended);
1102 1132
1103 do { 1133 do {
1104 seq = read_seqbegin(&xtime_lock); 1134 seq = read_seqbegin(&timekeeper.lock);
1105 *ts = xtime; 1135 *ts = timekeeper.xtime;
1106 tomono = wall_to_monotonic; 1136 tomono = timekeeper.wall_to_monotonic;
1107 sleep = total_sleep_time; 1137 sleep = timekeeper.total_sleep_time;
1108 nsecs = timekeeping_get_ns(); 1138 nsecs = timekeeping_get_ns();
1109 1139
1110 } while (read_seqretry(&xtime_lock, seq)); 1140 } while (read_seqretry(&timekeeper.lock, seq));
1111 1141
1112 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, 1142 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec,
1113 ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs); 1143 ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs);
@@ -1137,19 +1167,19 @@ EXPORT_SYMBOL_GPL(ktime_get_boottime);
1137 */ 1167 */
1138void monotonic_to_bootbased(struct timespec *ts) 1168void monotonic_to_bootbased(struct timespec *ts)
1139{ 1169{
1140 *ts = timespec_add(*ts, total_sleep_time); 1170 *ts = timespec_add(*ts, timekeeper.total_sleep_time);
1141} 1171}
1142EXPORT_SYMBOL_GPL(monotonic_to_bootbased); 1172EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
1143 1173
1144unsigned long get_seconds(void) 1174unsigned long get_seconds(void)
1145{ 1175{
1146 return xtime.tv_sec; 1176 return timekeeper.xtime.tv_sec;
1147} 1177}
1148EXPORT_SYMBOL(get_seconds); 1178EXPORT_SYMBOL(get_seconds);
1149 1179
1150struct timespec __current_kernel_time(void) 1180struct timespec __current_kernel_time(void)
1151{ 1181{
1152 return xtime; 1182 return timekeeper.xtime;
1153} 1183}
1154 1184
1155struct timespec current_kernel_time(void) 1185struct timespec current_kernel_time(void)
@@ -1158,10 +1188,10 @@ struct timespec current_kernel_time(void)
1158 unsigned long seq; 1188 unsigned long seq;
1159 1189
1160 do { 1190 do {
1161 seq = read_seqbegin(&xtime_lock); 1191 seq = read_seqbegin(&timekeeper.lock);
1162 1192
1163 now = xtime; 1193 now = timekeeper.xtime;
1164 } while (read_seqretry(&xtime_lock, seq)); 1194 } while (read_seqretry(&timekeeper.lock, seq));
1165 1195
1166 return now; 1196 return now;
1167} 1197}
@@ -1173,11 +1203,11 @@ struct timespec get_monotonic_coarse(void)
1173 unsigned long seq; 1203 unsigned long seq;
1174 1204
1175 do { 1205 do {
1176 seq = read_seqbegin(&xtime_lock); 1206 seq = read_seqbegin(&timekeeper.lock);
1177 1207
1178 now = xtime; 1208 now = timekeeper.xtime;
1179 mono = wall_to_monotonic; 1209 mono = timekeeper.wall_to_monotonic;
1180 } while (read_seqretry(&xtime_lock, seq)); 1210 } while (read_seqretry(&timekeeper.lock, seq));
1181 1211
1182 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, 1212 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
1183 now.tv_nsec + mono.tv_nsec); 1213 now.tv_nsec + mono.tv_nsec);
@@ -1209,11 +1239,11 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1209 unsigned long seq; 1239 unsigned long seq;
1210 1240
1211 do { 1241 do {
1212 seq = read_seqbegin(&xtime_lock); 1242 seq = read_seqbegin(&timekeeper.lock);
1213 *xtim = xtime; 1243 *xtim = timekeeper.xtime;
1214 *wtom = wall_to_monotonic; 1244 *wtom = timekeeper.wall_to_monotonic;
1215 *sleep = total_sleep_time; 1245 *sleep = timekeeper.total_sleep_time;
1216 } while (read_seqretry(&xtime_lock, seq)); 1246 } while (read_seqretry(&timekeeper.lock, seq));
1217} 1247}
1218 1248
1219/** 1249/**
@@ -1225,11 +1255,14 @@ ktime_t ktime_get_monotonic_offset(void)
1225 struct timespec wtom; 1255 struct timespec wtom;
1226 1256
1227 do { 1257 do {
1228 seq = read_seqbegin(&xtime_lock); 1258 seq = read_seqbegin(&timekeeper.lock);
1229 wtom = wall_to_monotonic; 1259 wtom = timekeeper.wall_to_monotonic;
1230 } while (read_seqretry(&xtime_lock, seq)); 1260 } while (read_seqretry(&timekeeper.lock, seq));
1261
1231 return timespec_to_ktime(wtom); 1262 return timespec_to_ktime(wtom);
1232} 1263}
1264EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
1265
1233 1266
1234/** 1267/**
1235 * xtime_update() - advances the timekeeping infrastructure 1268 * xtime_update() - advances the timekeeping infrastructure
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 683d559a0eef..867bd1dd2dd0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -62,6 +62,8 @@
62#define FTRACE_HASH_DEFAULT_BITS 10 62#define FTRACE_HASH_DEFAULT_BITS 10
63#define FTRACE_HASH_MAX_BITS 12 63#define FTRACE_HASH_MAX_BITS 12
64 64
65#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL)
66
65/* ftrace_enabled is a method to turn ftrace on or off */ 67/* ftrace_enabled is a method to turn ftrace on or off */
66int ftrace_enabled __read_mostly; 68int ftrace_enabled __read_mostly;
67static int last_ftrace_enabled; 69static int last_ftrace_enabled;
@@ -89,12 +91,14 @@ static struct ftrace_ops ftrace_list_end __read_mostly = {
89}; 91};
90 92
91static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; 93static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;
94static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end;
92static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; 95static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
93ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 96ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
94static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub; 97static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub;
95ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; 98ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
96ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 99ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
97static struct ftrace_ops global_ops; 100static struct ftrace_ops global_ops;
101static struct ftrace_ops control_ops;
98 102
99static void 103static void
100ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); 104ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip);
@@ -168,6 +172,32 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
168} 172}
169#endif 173#endif
170 174
175static void control_ops_disable_all(struct ftrace_ops *ops)
176{
177 int cpu;
178
179 for_each_possible_cpu(cpu)
180 *per_cpu_ptr(ops->disabled, cpu) = 1;
181}
182
183static int control_ops_alloc(struct ftrace_ops *ops)
184{
185 int __percpu *disabled;
186
187 disabled = alloc_percpu(int);
188 if (!disabled)
189 return -ENOMEM;
190
191 ops->disabled = disabled;
192 control_ops_disable_all(ops);
193 return 0;
194}
195
196static void control_ops_free(struct ftrace_ops *ops)
197{
198 free_percpu(ops->disabled);
199}
200
171static void update_global_ops(void) 201static void update_global_ops(void)
172{ 202{
173 ftrace_func_t func; 203 ftrace_func_t func;
@@ -259,6 +289,26 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
259 return 0; 289 return 0;
260} 290}
261 291
292static void add_ftrace_list_ops(struct ftrace_ops **list,
293 struct ftrace_ops *main_ops,
294 struct ftrace_ops *ops)
295{
296 int first = *list == &ftrace_list_end;
297 add_ftrace_ops(list, ops);
298 if (first)
299 add_ftrace_ops(&ftrace_ops_list, main_ops);
300}
301
302static int remove_ftrace_list_ops(struct ftrace_ops **list,
303 struct ftrace_ops *main_ops,
304 struct ftrace_ops *ops)
305{
306 int ret = remove_ftrace_ops(list, ops);
307 if (!ret && *list == &ftrace_list_end)
308 ret = remove_ftrace_ops(&ftrace_ops_list, main_ops);
309 return ret;
310}
311
262static int __register_ftrace_function(struct ftrace_ops *ops) 312static int __register_ftrace_function(struct ftrace_ops *ops)
263{ 313{
264 if (ftrace_disabled) 314 if (ftrace_disabled)
@@ -270,15 +320,20 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
270 if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) 320 if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED))
271 return -EBUSY; 321 return -EBUSY;
272 322
323 /* We don't support both control and global flags set. */
324 if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK)
325 return -EINVAL;
326
273 if (!core_kernel_data((unsigned long)ops)) 327 if (!core_kernel_data((unsigned long)ops))
274 ops->flags |= FTRACE_OPS_FL_DYNAMIC; 328 ops->flags |= FTRACE_OPS_FL_DYNAMIC;
275 329
276 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 330 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
277 int first = ftrace_global_list == &ftrace_list_end; 331 add_ftrace_list_ops(&ftrace_global_list, &global_ops, ops);
278 add_ftrace_ops(&ftrace_global_list, ops);
279 ops->flags |= FTRACE_OPS_FL_ENABLED; 332 ops->flags |= FTRACE_OPS_FL_ENABLED;
280 if (first) 333 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) {
281 add_ftrace_ops(&ftrace_ops_list, &global_ops); 334 if (control_ops_alloc(ops))
335 return -ENOMEM;
336 add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops);
282 } else 337 } else
283 add_ftrace_ops(&ftrace_ops_list, ops); 338 add_ftrace_ops(&ftrace_ops_list, ops);
284 339
@@ -302,11 +357,23 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
302 return -EINVAL; 357 return -EINVAL;
303 358
304 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 359 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
305 ret = remove_ftrace_ops(&ftrace_global_list, ops); 360 ret = remove_ftrace_list_ops(&ftrace_global_list,
306 if (!ret && ftrace_global_list == &ftrace_list_end) 361 &global_ops, ops);
307 ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops);
308 if (!ret) 362 if (!ret)
309 ops->flags &= ~FTRACE_OPS_FL_ENABLED; 363 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
364 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) {
365 ret = remove_ftrace_list_ops(&ftrace_control_list,
366 &control_ops, ops);
367 if (!ret) {
368 /*
369 * The ftrace_ops is now removed from the list,
370 * so there'll be no new users. We must ensure
371 * all current users are done before we free
372 * the control data.
373 */
374 synchronize_sched();
375 control_ops_free(ops);
376 }
310 } else 377 } else
311 ret = remove_ftrace_ops(&ftrace_ops_list, ops); 378 ret = remove_ftrace_ops(&ftrace_ops_list, ops);
312 379
@@ -1119,6 +1186,12 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
1119 call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu); 1186 call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu);
1120} 1187}
1121 1188
1189void ftrace_free_filter(struct ftrace_ops *ops)
1190{
1191 free_ftrace_hash(ops->filter_hash);
1192 free_ftrace_hash(ops->notrace_hash);
1193}
1194
1122static struct ftrace_hash *alloc_ftrace_hash(int size_bits) 1195static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
1123{ 1196{
1124 struct ftrace_hash *hash; 1197 struct ftrace_hash *hash;
@@ -1129,7 +1202,7 @@ static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
1129 return NULL; 1202 return NULL;
1130 1203
1131 size = 1 << size_bits; 1204 size = 1 << size_bits;
1132 hash->buckets = kzalloc(sizeof(*hash->buckets) * size, GFP_KERNEL); 1205 hash->buckets = kcalloc(size, sizeof(*hash->buckets), GFP_KERNEL);
1133 1206
1134 if (!hash->buckets) { 1207 if (!hash->buckets) {
1135 kfree(hash); 1208 kfree(hash);
@@ -3146,8 +3219,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
3146 mutex_lock(&ftrace_regex_lock); 3219 mutex_lock(&ftrace_regex_lock);
3147 if (reset) 3220 if (reset)
3148 ftrace_filter_reset(hash); 3221 ftrace_filter_reset(hash);
3149 if (buf) 3222 if (buf && !ftrace_match_records(hash, buf, len)) {
3150 ftrace_match_records(hash, buf, len); 3223 ret = -EINVAL;
3224 goto out_regex_unlock;
3225 }
3151 3226
3152 mutex_lock(&ftrace_lock); 3227 mutex_lock(&ftrace_lock);
3153 ret = ftrace_hash_move(ops, enable, orig_hash, hash); 3228 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
@@ -3157,6 +3232,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
3157 3232
3158 mutex_unlock(&ftrace_lock); 3233 mutex_unlock(&ftrace_lock);
3159 3234
3235 out_regex_unlock:
3160 mutex_unlock(&ftrace_regex_lock); 3236 mutex_unlock(&ftrace_regex_lock);
3161 3237
3162 free_ftrace_hash(hash); 3238 free_ftrace_hash(hash);
@@ -3173,10 +3249,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
3173 * Filters denote which functions should be enabled when tracing is enabled. 3249 * Filters denote which functions should be enabled when tracing is enabled.
3174 * If @buf is NULL and reset is set, all functions will be enabled for tracing. 3250 * If @buf is NULL and reset is set, all functions will be enabled for tracing.
3175 */ 3251 */
3176void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, 3252int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
3177 int len, int reset) 3253 int len, int reset)
3178{ 3254{
3179 ftrace_set_regex(ops, buf, len, reset, 1); 3255 return ftrace_set_regex(ops, buf, len, reset, 1);
3180} 3256}
3181EXPORT_SYMBOL_GPL(ftrace_set_filter); 3257EXPORT_SYMBOL_GPL(ftrace_set_filter);
3182 3258
@@ -3191,10 +3267,10 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter);
3191 * is enabled. If @buf is NULL and reset is set, all functions will be enabled 3267 * is enabled. If @buf is NULL and reset is set, all functions will be enabled
3192 * for tracing. 3268 * for tracing.
3193 */ 3269 */
3194void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, 3270int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
3195 int len, int reset) 3271 int len, int reset)
3196{ 3272{
3197 ftrace_set_regex(ops, buf, len, reset, 0); 3273 return ftrace_set_regex(ops, buf, len, reset, 0);
3198} 3274}
3199EXPORT_SYMBOL_GPL(ftrace_set_notrace); 3275EXPORT_SYMBOL_GPL(ftrace_set_notrace);
3200/** 3276/**
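
ftrace_set_filter() and ftrace_set_notrace() now return int, and ftrace_set_regex() fails with -EINVAL when the pattern matches nothing, so callers can notice a filter that silently did not apply. A minimal caller sketch; the ops pointer and the "do_fork" pattern are only illustrative:

#include <linux/ftrace.h>

static int demo_install_filter(struct ftrace_ops *ops)
{
	int ret;

	/* non-zero now also covers "no function matched this pattern" */
	ret = ftrace_set_filter(ops, (unsigned char *)"do_fork", 7, 1);
	if (ret)
		return ret;

	return register_ftrace_function(ops);
}
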
@@ -3871,6 +3947,36 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
3871#endif /* CONFIG_DYNAMIC_FTRACE */ 3947#endif /* CONFIG_DYNAMIC_FTRACE */
3872 3948
3873static void 3949static void
3950ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip)
3951{
3952 struct ftrace_ops *op;
3953
3954 if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT)))
3955 return;
3956
3957 /*
3958 * Some of the ops may be dynamically allocated,
3959 * they must be freed after a synchronize_sched().
3960 */
3961 preempt_disable_notrace();
3962 trace_recursion_set(TRACE_CONTROL_BIT);
3963 op = rcu_dereference_raw(ftrace_control_list);
3964 while (op != &ftrace_list_end) {
3965 if (!ftrace_function_local_disabled(op) &&
3966 ftrace_ops_test(op, ip))
3967 op->func(ip, parent_ip);
3968
3969 op = rcu_dereference_raw(op->next);
3970 };
3971 trace_recursion_clear(TRACE_CONTROL_BIT);
3972 preempt_enable_notrace();
3973}
3974
3975static struct ftrace_ops control_ops = {
3976 .func = ftrace_ops_control_func,
3977};
3978
3979static void
3874ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) 3980ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
3875{ 3981{
3876 struct ftrace_ops *op; 3982 struct ftrace_ops *op;
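
The ftrace.c hunks above add a third class of ftrace_ops: FTRACE_OPS_FL_CONTROL ops get a per-CPU disabled counter (control_ops_alloc), are dispatched through ftrace_ops_control_func(), and may not be combined with FTRACE_OPS_FL_GLOBAL. A hedged sketch of what registering one could look like; my_ops and my_trace_func are made-up names, not taken from the patch:

#include <linux/ftrace.h>
#include <linux/init.h>

static void my_trace_func(unsigned long ip, unsigned long parent_ip)
{
	/* only runs when the ops is not locally disabled on the current CPU */
}

static struct ftrace_ops my_ops = {
	.func	= my_trace_func,
	.flags	= FTRACE_OPS_FL_CONTROL, /* must not be ORed with FTRACE_OPS_FL_GLOBAL */
};

static int __init my_tracer_init(void)
{
	/* __register_ftrace_function() allocates the per-cpu disabled state */
	return register_ftrace_function(&my_ops);
}
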
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a3f1bc5d2a00..10d5503f0d04 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2764,12 +2764,12 @@ static const char readme_msg[] =
2764 "tracing mini-HOWTO:\n\n" 2764 "tracing mini-HOWTO:\n\n"
2765 "# mount -t debugfs nodev /sys/kernel/debug\n\n" 2765 "# mount -t debugfs nodev /sys/kernel/debug\n\n"
2766 "# cat /sys/kernel/debug/tracing/available_tracers\n" 2766 "# cat /sys/kernel/debug/tracing/available_tracers\n"
2767 "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n" 2767 "wakeup wakeup_rt preemptirqsoff preemptoff irqsoff function nop\n\n"
2768 "# cat /sys/kernel/debug/tracing/current_tracer\n" 2768 "# cat /sys/kernel/debug/tracing/current_tracer\n"
2769 "nop\n" 2769 "nop\n"
2770 "# echo sched_switch > /sys/kernel/debug/tracing/current_tracer\n" 2770 "# echo wakeup > /sys/kernel/debug/tracing/current_tracer\n"
2771 "# cat /sys/kernel/debug/tracing/current_tracer\n" 2771 "# cat /sys/kernel/debug/tracing/current_tracer\n"
2772 "sched_switch\n" 2772 "wakeup\n"
2773 "# cat /sys/kernel/debug/tracing/trace_options\n" 2773 "# cat /sys/kernel/debug/tracing/trace_options\n"
2774 "noprint-parent nosym-offset nosym-addr noverbose\n" 2774 "noprint-parent nosym-offset nosym-addr noverbose\n"
2775 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" 2775 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n"
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b93ecbadad6d..54faec790bc1 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -56,17 +56,23 @@ enum trace_type {
56#define F_STRUCT(args...) args 56#define F_STRUCT(args...) args
57 57
58#undef FTRACE_ENTRY 58#undef FTRACE_ENTRY
59#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ 59#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
60 struct struct_name { \ 60 struct struct_name { \
61 struct trace_entry ent; \ 61 struct trace_entry ent; \
62 tstruct \ 62 tstruct \
63 } 63 }
64 64
65#undef TP_ARGS 65#undef TP_ARGS
66#define TP_ARGS(args...) args 66#define TP_ARGS(args...) args
67 67
68#undef FTRACE_ENTRY_DUP 68#undef FTRACE_ENTRY_DUP
69#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk) 69#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk, filter)
70
71#undef FTRACE_ENTRY_REG
72#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \
73 filter, regfn) \
74 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
75 filter)
70 76
71#include "trace_entries.h" 77#include "trace_entries.h"
72 78
@@ -288,6 +294,8 @@ struct tracer {
288/* for function tracing recursion */ 294/* for function tracing recursion */
289#define TRACE_INTERNAL_BIT (1<<11) 295#define TRACE_INTERNAL_BIT (1<<11)
290#define TRACE_GLOBAL_BIT (1<<12) 296#define TRACE_GLOBAL_BIT (1<<12)
297#define TRACE_CONTROL_BIT (1<<13)
298
291/* 299/*
292 * Abuse of the trace_recursion. 300 * Abuse of the trace_recursion.
293 * As we need a way to maintain state if we are tracing the function 301 * As we need a way to maintain state if we are tracing the function
@@ -589,6 +597,8 @@ static inline int ftrace_trace_task(struct task_struct *task)
589static inline int ftrace_is_dead(void) { return 0; } 597static inline int ftrace_is_dead(void) { return 0; }
590#endif 598#endif
591 599
600int ftrace_event_is_function(struct ftrace_event_call *call);
601
592/* 602/*
593 * struct trace_parser - servers for reading the user input separated by spaces 603 * struct trace_parser - servers for reading the user input separated by spaces
594 * @cont: set if the input is not complete - no final space char was found 604 * @cont: set if the input is not complete - no final space char was found
@@ -766,9 +776,7 @@ struct filter_pred {
766 u64 val; 776 u64 val;
767 struct regex regex; 777 struct regex regex;
768 unsigned short *ops; 778 unsigned short *ops;
769#ifdef CONFIG_FTRACE_STARTUP_TEST
770 struct ftrace_event_field *field; 779 struct ftrace_event_field *field;
771#endif
772 int offset; 780 int offset;
773 int not; 781 int not;
774 int op; 782 int op;
@@ -818,12 +826,22 @@ extern const char *__start___trace_bprintk_fmt[];
818extern const char *__stop___trace_bprintk_fmt[]; 826extern const char *__stop___trace_bprintk_fmt[];
819 827
820#undef FTRACE_ENTRY 828#undef FTRACE_ENTRY
821#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ 829#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \
822 extern struct ftrace_event_call \ 830 extern struct ftrace_event_call \
823 __attribute__((__aligned__(4))) event_##call; 831 __attribute__((__aligned__(4))) event_##call;
824#undef FTRACE_ENTRY_DUP 832#undef FTRACE_ENTRY_DUP
825#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ 833#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \
826 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) 834 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
835 filter)
827#include "trace_entries.h" 836#include "trace_entries.h"
828 837
838#ifdef CONFIG_PERF_EVENTS
839#ifdef CONFIG_FUNCTION_TRACER
840int perf_ftrace_event_register(struct ftrace_event_call *call,
841 enum trace_reg type, void *data);
842#else
843#define perf_ftrace_event_register NULL
844#endif /* CONFIG_FUNCTION_TRACER */
845#endif /* CONFIG_PERF_EVENTS */
846
829#endif /* _LINUX_KERNEL_TRACE_H */ 847#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 93365907f219..d91eb0541b3a 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -55,7 +55,7 @@
55/* 55/*
56 * Function trace entry - function address and parent function address: 56 * Function trace entry - function address and parent function address:
57 */ 57 */
58FTRACE_ENTRY(function, ftrace_entry, 58FTRACE_ENTRY_REG(function, ftrace_entry,
59 59
60 TRACE_FN, 60 TRACE_FN,
61 61
@@ -64,7 +64,11 @@ FTRACE_ENTRY(function, ftrace_entry,
64 __field( unsigned long, parent_ip ) 64 __field( unsigned long, parent_ip )
65 ), 65 ),
66 66
67 F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip) 67 F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip),
68
69 FILTER_TRACE_FN,
70
71 perf_ftrace_event_register
68); 72);
69 73
70/* Function call entry */ 74/* Function call entry */
@@ -78,7 +82,9 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
78 __field_desc( int, graph_ent, depth ) 82 __field_desc( int, graph_ent, depth )
79 ), 83 ),
80 84
81 F_printk("--> %lx (%d)", __entry->func, __entry->depth) 85 F_printk("--> %lx (%d)", __entry->func, __entry->depth),
86
87 FILTER_OTHER
82); 88);
83 89
84/* Function return entry */ 90/* Function return entry */
@@ -98,7 +104,9 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
98 F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d", 104 F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d",
99 __entry->func, __entry->depth, 105 __entry->func, __entry->depth,
100 __entry->calltime, __entry->rettime, 106 __entry->calltime, __entry->rettime,
101 __entry->depth) 107 __entry->depth),
108
109 FILTER_OTHER
102); 110);
103 111
104/* 112/*
@@ -127,8 +135,9 @@ FTRACE_ENTRY(context_switch, ctx_switch_entry,
127 F_printk("%u:%u:%u ==> %u:%u:%u [%03u]", 135 F_printk("%u:%u:%u ==> %u:%u:%u [%03u]",
128 __entry->prev_pid, __entry->prev_prio, __entry->prev_state, 136 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
129 __entry->next_pid, __entry->next_prio, __entry->next_state, 137 __entry->next_pid, __entry->next_prio, __entry->next_state,
130 __entry->next_cpu 138 __entry->next_cpu),
131 ) 139
140 FILTER_OTHER
132); 141);
133 142
134/* 143/*
@@ -146,8 +155,9 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
146 F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]", 155 F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]",
147 __entry->prev_pid, __entry->prev_prio, __entry->prev_state, 156 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
148 __entry->next_pid, __entry->next_prio, __entry->next_state, 157 __entry->next_pid, __entry->next_prio, __entry->next_state,
149 __entry->next_cpu 158 __entry->next_cpu),
150 ) 159
160 FILTER_OTHER
151); 161);
152 162
153/* 163/*
@@ -169,7 +179,9 @@ FTRACE_ENTRY(kernel_stack, stack_entry,
169 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", 179 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
170 __entry->caller[0], __entry->caller[1], __entry->caller[2], 180 __entry->caller[0], __entry->caller[1], __entry->caller[2],
171 __entry->caller[3], __entry->caller[4], __entry->caller[5], 181 __entry->caller[3], __entry->caller[4], __entry->caller[5],
172 __entry->caller[6], __entry->caller[7]) 182 __entry->caller[6], __entry->caller[7]),
183
184 FILTER_OTHER
173); 185);
174 186
175FTRACE_ENTRY(user_stack, userstack_entry, 187FTRACE_ENTRY(user_stack, userstack_entry,
@@ -185,7 +197,9 @@ FTRACE_ENTRY(user_stack, userstack_entry,
185 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", 197 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
186 __entry->caller[0], __entry->caller[1], __entry->caller[2], 198 __entry->caller[0], __entry->caller[1], __entry->caller[2],
187 __entry->caller[3], __entry->caller[4], __entry->caller[5], 199 __entry->caller[3], __entry->caller[4], __entry->caller[5],
188 __entry->caller[6], __entry->caller[7]) 200 __entry->caller[6], __entry->caller[7]),
201
202 FILTER_OTHER
189); 203);
190 204
191/* 205/*
@@ -202,7 +216,9 @@ FTRACE_ENTRY(bprint, bprint_entry,
202 ), 216 ),
203 217
204 F_printk("%08lx fmt:%p", 218 F_printk("%08lx fmt:%p",
205 __entry->ip, __entry->fmt) 219 __entry->ip, __entry->fmt),
220
221 FILTER_OTHER
206); 222);
207 223
208FTRACE_ENTRY(print, print_entry, 224FTRACE_ENTRY(print, print_entry,
@@ -215,7 +231,9 @@ FTRACE_ENTRY(print, print_entry,
215 ), 231 ),
216 232
217 F_printk("%08lx %s", 233 F_printk("%08lx %s",
218 __entry->ip, __entry->buf) 234 __entry->ip, __entry->buf),
235
236 FILTER_OTHER
219); 237);
220 238
221FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, 239FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
@@ -234,7 +252,9 @@ FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
234 252
235 F_printk("%lx %lx %lx %d %x %x", 253 F_printk("%lx %lx %lx %d %x %x",
236 (unsigned long)__entry->phys, __entry->value, __entry->pc, 254 (unsigned long)__entry->phys, __entry->value, __entry->pc,
237 __entry->map_id, __entry->opcode, __entry->width) 255 __entry->map_id, __entry->opcode, __entry->width),
256
257 FILTER_OTHER
238); 258);
239 259
240FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, 260FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
@@ -252,7 +272,9 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
252 272
253 F_printk("%lx %lx %lx %d %x", 273 F_printk("%lx %lx %lx %d %x",
254 (unsigned long)__entry->phys, __entry->virt, __entry->len, 274 (unsigned long)__entry->phys, __entry->virt, __entry->len,
255 __entry->map_id, __entry->opcode) 275 __entry->map_id, __entry->opcode),
276
277 FILTER_OTHER
256); 278);
257 279
258 280
@@ -272,6 +294,8 @@ FTRACE_ENTRY(branch, trace_branch,
272 294
273 F_printk("%u:%s:%s (%u)", 295 F_printk("%u:%s:%s (%u)",
274 __entry->line, 296 __entry->line,
275 __entry->func, __entry->file, __entry->correct) 297 __entry->func, __entry->file, __entry->correct),
298
299 FILTER_OTHER
276); 300);
277 301
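
The hunks above all make the same mechanical change: every FTRACE_ENTRY()/FTRACE_ENTRY_DUP() invocation in trace_entries.h gains a trailing filter argument (FILTER_OTHER here) so that the field-definition expansion in trace_export.c can use a per-entry filter type instead of hard-coding FILTER_OTHER. Below is a minimal user-space sketch of the underlying X-macro idea only; the entry names, format strings and FILTER_* values are illustrative stand-ins, not the kernel's definitions.

#include <stdio.h>

/* Stand-ins for the kernel's filter classes. */
enum { FILTER_OTHER, FILTER_TRACE_FN };

/*
 * The entry list: each entry now carries its filter class as the last
 * argument, mirroring how trace_entries.h passes FILTER_OTHER along.
 */
#define ENTRY_LIST(ENTRY)						\
	ENTRY(function, "ip=%lx parent=%lx", FILTER_TRACE_FN)		\
	ENTRY(bprint,   "ip=%lx fmt=%p",     FILTER_OTHER)		\
	ENTRY(print,    "ip=%lx buf=%s",     FILTER_OTHER)

/*
 * First expansion: generate one "define fields" function per entry that
 * receives the filter class, like ftrace_define_fields_##name() does.
 */
#define DEFINE_FIELDS(name, fmt, filter)				\
static int define_fields_##name(void)					\
{									\
	int filter_type = (filter);					\
	printf("%-10s filter_type=%d fmt=\"%s\"\n", #name, filter_type, fmt); \
	return 0;							\
}
ENTRY_LIST(DEFINE_FIELDS)
#undef DEFINE_FIELDS

/* Second expansion: call every generated function. */
#define CALL_FIELDS(name, fmt, filter) define_fields_##name();

int main(void)
{
	ENTRY_LIST(CALL_FIELDS)
	return 0;
}
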
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 19a359d5e6d5..fee3752ae8f6 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -24,6 +24,11 @@ static int total_ref_count;
24static int perf_trace_event_perm(struct ftrace_event_call *tp_event, 24static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event) 25 struct perf_event *p_event)
26{ 26{
27 /* The ftrace function trace is allowed only for root. */
28 if (ftrace_event_is_function(tp_event) &&
29 perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
30 return -EPERM;
31
27 /* No tracing, just counting, so no obvious leak */ 32 /* No tracing, just counting, so no obvious leak */
28 if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) 33 if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
29 return 0; 34 return 0;
@@ -44,23 +49,17 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
44 return 0; 49 return 0;
45} 50}
46 51
47static int perf_trace_event_init(struct ftrace_event_call *tp_event, 52static int perf_trace_event_reg(struct ftrace_event_call *tp_event,
48 struct perf_event *p_event) 53 struct perf_event *p_event)
49{ 54{
50 struct hlist_head __percpu *list; 55 struct hlist_head __percpu *list;
51 int ret; 56 int ret = -ENOMEM;
52 int cpu; 57 int cpu;
53 58
54 ret = perf_trace_event_perm(tp_event, p_event);
55 if (ret)
56 return ret;
57
58 p_event->tp_event = tp_event; 59 p_event->tp_event = tp_event;
59 if (tp_event->perf_refcount++ > 0) 60 if (tp_event->perf_refcount++ > 0)
60 return 0; 61 return 0;
61 62
62 ret = -ENOMEM;
63
64 list = alloc_percpu(struct hlist_head); 63 list = alloc_percpu(struct hlist_head);
65 if (!list) 64 if (!list)
66 goto fail; 65 goto fail;
@@ -83,7 +82,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
83 } 82 }
84 } 83 }
85 84
86 ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); 85 ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);
87 if (ret) 86 if (ret)
88 goto fail; 87 goto fail;
89 88
@@ -108,6 +107,69 @@ fail:
108 return ret; 107 return ret;
109} 108}
110 109
110static void perf_trace_event_unreg(struct perf_event *p_event)
111{
112 struct ftrace_event_call *tp_event = p_event->tp_event;
113 int i;
114
115 if (--tp_event->perf_refcount > 0)
116 goto out;
117
118 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL);
119
120 /*
121 * Ensure our callback won't be called anymore. The buffers
122 * will be freed after that.
123 */
124 tracepoint_synchronize_unregister();
125
126 free_percpu(tp_event->perf_events);
127 tp_event->perf_events = NULL;
128
129 if (!--total_ref_count) {
130 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
131 free_percpu(perf_trace_buf[i]);
132 perf_trace_buf[i] = NULL;
133 }
134 }
135out:
136 module_put(tp_event->mod);
137}
138
139static int perf_trace_event_open(struct perf_event *p_event)
140{
141 struct ftrace_event_call *tp_event = p_event->tp_event;
142 return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event);
143}
144
145static void perf_trace_event_close(struct perf_event *p_event)
146{
147 struct ftrace_event_call *tp_event = p_event->tp_event;
148 tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event);
149}
150
151static int perf_trace_event_init(struct ftrace_event_call *tp_event,
152 struct perf_event *p_event)
153{
154 int ret;
155
156 ret = perf_trace_event_perm(tp_event, p_event);
157 if (ret)
158 return ret;
159
160 ret = perf_trace_event_reg(tp_event, p_event);
161 if (ret)
162 return ret;
163
164 ret = perf_trace_event_open(p_event);
165 if (ret) {
166 perf_trace_event_unreg(p_event);
167 return ret;
168 }
169
170 return 0;
171}
172
111int perf_trace_init(struct perf_event *p_event) 173int perf_trace_init(struct perf_event *p_event)
112{ 174{
113 struct ftrace_event_call *tp_event; 175 struct ftrace_event_call *tp_event;
@@ -130,6 +192,14 @@ int perf_trace_init(struct perf_event *p_event)
130 return ret; 192 return ret;
131} 193}
132 194
195void perf_trace_destroy(struct perf_event *p_event)
196{
197 mutex_lock(&event_mutex);
198 perf_trace_event_close(p_event);
199 perf_trace_event_unreg(p_event);
200 mutex_unlock(&event_mutex);
201}
202
133int perf_trace_add(struct perf_event *p_event, int flags) 203int perf_trace_add(struct perf_event *p_event, int flags)
134{ 204{
135 struct ftrace_event_call *tp_event = p_event->tp_event; 205 struct ftrace_event_call *tp_event = p_event->tp_event;
@@ -146,43 +216,14 @@ int perf_trace_add(struct perf_event *p_event, int flags)
146 list = this_cpu_ptr(pcpu_list); 216 list = this_cpu_ptr(pcpu_list);
147 hlist_add_head_rcu(&p_event->hlist_entry, list); 217 hlist_add_head_rcu(&p_event->hlist_entry, list);
148 218
149 return 0; 219 return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
150} 220}
151 221
152void perf_trace_del(struct perf_event *p_event, int flags) 222void perf_trace_del(struct perf_event *p_event, int flags)
153{ 223{
154 hlist_del_rcu(&p_event->hlist_entry);
155}
156
157void perf_trace_destroy(struct perf_event *p_event)
158{
159 struct ftrace_event_call *tp_event = p_event->tp_event; 224 struct ftrace_event_call *tp_event = p_event->tp_event;
160 int i; 225 hlist_del_rcu(&p_event->hlist_entry);
161 226 tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
162 mutex_lock(&event_mutex);
163 if (--tp_event->perf_refcount > 0)
164 goto out;
165
166 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
167
168 /*
169 * Ensure our callback won't be called anymore. The buffers
170 * will be freed after that.
171 */
172 tracepoint_synchronize_unregister();
173
174 free_percpu(tp_event->perf_events);
175 tp_event->perf_events = NULL;
176
177 if (!--total_ref_count) {
178 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
179 free_percpu(perf_trace_buf[i]);
180 perf_trace_buf[i] = NULL;
181 }
182 }
183out:
184 module_put(tp_event->mod);
185 mutex_unlock(&event_mutex);
186} 227}
187 228
188__kprobes void *perf_trace_buf_prepare(int size, unsigned short type, 229__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
@@ -214,3 +255,86 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
214 return raw_data; 255 return raw_data;
215} 256}
216EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); 257EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
258
259#ifdef CONFIG_FUNCTION_TRACER
260static void
261perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip)
262{
263 struct ftrace_entry *entry;
264 struct hlist_head *head;
265 struct pt_regs regs;
266 int rctx;
267
268#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
269 sizeof(u64)) - sizeof(u32))
270
271 BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);
272
273 perf_fetch_caller_regs(&regs);
274
275 entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx);
276 if (!entry)
277 return;
278
279 entry->ip = ip;
280 entry->parent_ip = parent_ip;
281
282 head = this_cpu_ptr(event_function.perf_events);
283 perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
284 1, &regs, head);
285
286#undef ENTRY_SIZE
287}
288
289static int perf_ftrace_function_register(struct perf_event *event)
290{
291 struct ftrace_ops *ops = &event->ftrace_ops;
292
293 ops->flags |= FTRACE_OPS_FL_CONTROL;
294 ops->func = perf_ftrace_function_call;
295 return register_ftrace_function(ops);
296}
297
298static int perf_ftrace_function_unregister(struct perf_event *event)
299{
300 struct ftrace_ops *ops = &event->ftrace_ops;
301 int ret = unregister_ftrace_function(ops);
302 ftrace_free_filter(ops);
303 return ret;
304}
305
306static void perf_ftrace_function_enable(struct perf_event *event)
307{
308 ftrace_function_local_enable(&event->ftrace_ops);
309}
310
311static void perf_ftrace_function_disable(struct perf_event *event)
312{
313 ftrace_function_local_disable(&event->ftrace_ops);
314}
315
316int perf_ftrace_event_register(struct ftrace_event_call *call,
317 enum trace_reg type, void *data)
318{
319 switch (type) {
320 case TRACE_REG_REGISTER:
321 case TRACE_REG_UNREGISTER:
322 break;
323 case TRACE_REG_PERF_REGISTER:
324 case TRACE_REG_PERF_UNREGISTER:
325 return 0;
326 case TRACE_REG_PERF_OPEN:
327 return perf_ftrace_function_register(data);
328 case TRACE_REG_PERF_CLOSE:
329 return perf_ftrace_function_unregister(data);
330 case TRACE_REG_PERF_ADD:
331 perf_ftrace_function_enable(data);
332 return 0;
333 case TRACE_REG_PERF_DEL:
334 perf_ftrace_function_disable(data);
335 return 0;
336 }
337
338 return -EINVAL;
339}
340#endif /* CONFIG_FUNCTION_TRACER */
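
perf_trace_event_init() is split above into a permission check, a reference-counted register step (perf_trace_event_reg) and a per-event open step, with perf_trace_destroy() doing the mirror-image close and unregister. The following is a stand-alone user-space sketch of that lifecycle; the names (event_reg, event_open, ...) and the single global refcount are assumptions made for illustration, and the kernel's locking and per-CPU buffers are omitted.

#include <stdio.h>
#include <stdlib.h>

/* Shared state guarded by a refcount, like tp_event->perf_refcount. */
static int refcount;
static int *shared_buf;

static int event_reg(void)
{
	if (refcount++ > 0)
		return 0;		/* already set up by an earlier user */
	shared_buf = calloc(16, sizeof(*shared_buf));
	return shared_buf ? 0 : -1;
}

static void event_unreg(void)
{
	if (--refcount > 0)
		return;			/* other users remain */
	free(shared_buf);
	shared_buf = NULL;
}

static int event_open(void)   { printf("open: per-event setup\n");    return 0; }
static void event_close(void) { printf("close: per-event teardown\n"); }

/* init = reg + open; an open failure must undo the reg step. */
static int event_init(void)
{
	int ret = event_reg();

	if (ret)
		return ret;
	ret = event_open();
	if (ret)
		event_unreg();
	return ret;
}

static void event_destroy(void)
{
	event_close();
	event_unreg();
}

int main(void)
{
	if (event_init() == 0)
		event_destroy();
	return 0;
}
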
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index c212a7f934ec..079a93ae8a9d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -147,7 +147,8 @@ int trace_event_raw_init(struct ftrace_event_call *call)
147} 147}
148EXPORT_SYMBOL_GPL(trace_event_raw_init); 148EXPORT_SYMBOL_GPL(trace_event_raw_init);
149 149
150int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) 150int ftrace_event_reg(struct ftrace_event_call *call,
151 enum trace_reg type, void *data)
151{ 152{
152 switch (type) { 153 switch (type) {
153 case TRACE_REG_REGISTER: 154 case TRACE_REG_REGISTER:
@@ -170,6 +171,11 @@ int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type)
170 call->class->perf_probe, 171 call->class->perf_probe,
171 call); 172 call);
172 return 0; 173 return 0;
174 case TRACE_REG_PERF_OPEN:
175 case TRACE_REG_PERF_CLOSE:
176 case TRACE_REG_PERF_ADD:
177 case TRACE_REG_PERF_DEL:
178 return 0;
173#endif 179#endif
174 } 180 }
175 return 0; 181 return 0;
@@ -209,7 +215,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
209 tracing_stop_cmdline_record(); 215 tracing_stop_cmdline_record();
210 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; 216 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
211 } 217 }
212 call->class->reg(call, TRACE_REG_UNREGISTER); 218 call->class->reg(call, TRACE_REG_UNREGISTER, NULL);
213 } 219 }
214 break; 220 break;
215 case 1: 221 case 1:
@@ -218,7 +224,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
218 tracing_start_cmdline_record(); 224 tracing_start_cmdline_record();
219 call->flags |= TRACE_EVENT_FL_RECORDED_CMD; 225 call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
220 } 226 }
221 ret = call->class->reg(call, TRACE_REG_REGISTER); 227 ret = call->class->reg(call, TRACE_REG_REGISTER, NULL);
222 if (ret) { 228 if (ret) {
223 tracing_stop_cmdline_record(); 229 tracing_stop_cmdline_record();
224 pr_info("event trace: Could not enable event " 230 pr_info("event trace: Could not enable event "
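
The change to ftrace_event_reg() above is the signature half of the same series: every ->reg() callback now takes an opaque void *data, existing call sites keep passing NULL, and the new per-event OPEN/CLOSE/ADD/DEL operations pass the perf_event. A tiny sketch of extending a callback this way, with invented names and plain C:

#include <stdio.h>

enum reg_type { REG_REGISTER, REG_UNREGISTER, REG_PERF_OPEN, REG_PERF_CLOSE };

/*
 * The callback now takes an opaque pointer; callers that have nothing to
 * pass simply hand in NULL, as trace_events.c does for REGISTER/UNREGISTER.
 */
typedef int (*reg_fn)(enum reg_type type, void *data);

static int my_reg(enum reg_type type, void *data)
{
	switch (type) {
	case REG_REGISTER:
	case REG_UNREGISTER:
		printf("global %s, data=%p\n",
		       type == REG_REGISTER ? "register" : "unregister", data);
		return 0;
	case REG_PERF_OPEN:
	case REG_PERF_CLOSE:
		printf("per-event call, data=%p\n", data);
		return 0;
	}
	return -1;
}

int main(void)
{
	int event = 42;
	reg_fn reg = my_reg;

	reg(REG_REGISTER, NULL);	/* old-style call sites pass NULL    */
	reg(REG_PERF_OPEN, &event);	/* new per-event hooks pass the event */
	reg(REG_PERF_CLOSE, &event);
	reg(REG_UNREGISTER, NULL);
	return 0;
}
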
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 24aee7127451..431dba8b7542 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -81,6 +81,7 @@ enum {
81 FILT_ERR_TOO_MANY_PREDS, 81 FILT_ERR_TOO_MANY_PREDS,
82 FILT_ERR_MISSING_FIELD, 82 FILT_ERR_MISSING_FIELD,
83 FILT_ERR_INVALID_FILTER, 83 FILT_ERR_INVALID_FILTER,
84 FILT_ERR_IP_FIELD_ONLY,
84}; 85};
85 86
86static char *err_text[] = { 87static char *err_text[] = {
@@ -96,6 +97,7 @@ static char *err_text[] = {
96 "Too many terms in predicate expression", 97 "Too many terms in predicate expression",
97 "Missing field name and/or value", 98 "Missing field name and/or value",
98 "Meaningless filter expression", 99 "Meaningless filter expression",
100 "Only 'ip' field is supported for function trace",
99}; 101};
100 102
101struct opstack_op { 103struct opstack_op {
@@ -685,7 +687,7 @@ find_event_field(struct ftrace_event_call *call, char *name)
685 687
686static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) 688static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
687{ 689{
688 stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); 690 stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL);
689 if (!stack->preds) 691 if (!stack->preds)
690 return -ENOMEM; 692 return -ENOMEM;
691 stack->index = n_preds; 693 stack->index = n_preds;
@@ -826,8 +828,7 @@ static int __alloc_preds(struct event_filter *filter, int n_preds)
826 if (filter->preds) 828 if (filter->preds)
827 __free_preds(filter); 829 __free_preds(filter);
828 830
829 filter->preds = 831 filter->preds = kcalloc(n_preds, sizeof(*filter->preds), GFP_KERNEL);
830 kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL);
831 832
832 if (!filter->preds) 833 if (!filter->preds)
833 return -ENOMEM; 834 return -ENOMEM;
@@ -900,6 +901,11 @@ int filter_assign_type(const char *type)
900 return FILTER_OTHER; 901 return FILTER_OTHER;
901} 902}
902 903
904static bool is_function_field(struct ftrace_event_field *field)
905{
906 return field->filter_type == FILTER_TRACE_FN;
907}
908
903static bool is_string_field(struct ftrace_event_field *field) 909static bool is_string_field(struct ftrace_event_field *field)
904{ 910{
905 return field->filter_type == FILTER_DYN_STRING || 911 return field->filter_type == FILTER_DYN_STRING ||
@@ -987,6 +993,11 @@ static int init_pred(struct filter_parse_state *ps,
987 fn = filter_pred_strloc; 993 fn = filter_pred_strloc;
988 else 994 else
989 fn = filter_pred_pchar; 995 fn = filter_pred_pchar;
996 } else if (is_function_field(field)) {
997 if (strcmp(field->name, "ip")) {
998 parse_error(ps, FILT_ERR_IP_FIELD_ONLY, 0);
999 return -EINVAL;
1000 }
990 } else { 1001 } else {
991 if (field->is_signed) 1002 if (field->is_signed)
992 ret = strict_strtoll(pred->regex.pattern, 0, &val); 1003 ret = strict_strtoll(pred->regex.pattern, 0, &val);
@@ -1334,10 +1345,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps,
1334 1345
1335 strcpy(pred.regex.pattern, operand2); 1346 strcpy(pred.regex.pattern, operand2);
1336 pred.regex.len = strlen(pred.regex.pattern); 1347 pred.regex.len = strlen(pred.regex.pattern);
1337
1338#ifdef CONFIG_FTRACE_STARTUP_TEST
1339 pred.field = field; 1348 pred.field = field;
1340#endif
1341 return init_pred(ps, field, &pred) ? NULL : &pred; 1349 return init_pred(ps, field, &pred) ? NULL : &pred;
1342} 1350}
1343 1351
@@ -1486,7 +1494,7 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1486 children = count_leafs(preds, &preds[root->left]); 1494 children = count_leafs(preds, &preds[root->left]);
1487 children += count_leafs(preds, &preds[root->right]); 1495 children += count_leafs(preds, &preds[root->right]);
1488 1496
1489 root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL); 1497 root->ops = kcalloc(children, sizeof(*root->ops), GFP_KERNEL);
1490 if (!root->ops) 1498 if (!root->ops)
1491 return -ENOMEM; 1499 return -ENOMEM;
1492 1500
@@ -1950,6 +1958,148 @@ void ftrace_profile_free_filter(struct perf_event *event)
1950 __free_filter(filter); 1958 __free_filter(filter);
1951} 1959}
1952 1960
1961struct function_filter_data {
1962 struct ftrace_ops *ops;
1963 int first_filter;
1964 int first_notrace;
1965};
1966
1967#ifdef CONFIG_FUNCTION_TRACER
1968static char **
1969ftrace_function_filter_re(char *buf, int len, int *count)
1970{
1971 char *str, *sep, **re;
1972
1973 str = kstrndup(buf, len, GFP_KERNEL);
1974 if (!str)
1975 return NULL;
1976
1977 /*
1978 * The argv_split function takes white space
1979 * as a separator, so convert ',' into spaces.
1980 */
1981 while ((sep = strchr(str, ',')))
1982 *sep = ' ';
1983
1984 re = argv_split(GFP_KERNEL, str, count);
1985 kfree(str);
1986 return re;
1987}
1988
1989static int ftrace_function_set_regexp(struct ftrace_ops *ops, int filter,
1990 int reset, char *re, int len)
1991{
1992 int ret;
1993
1994 if (filter)
1995 ret = ftrace_set_filter(ops, re, len, reset);
1996 else
1997 ret = ftrace_set_notrace(ops, re, len, reset);
1998
1999 return ret;
2000}
2001
2002static int __ftrace_function_set_filter(int filter, char *buf, int len,
2003 struct function_filter_data *data)
2004{
2005 int i, re_cnt, ret;
2006 int *reset;
2007 char **re;
2008
2009 reset = filter ? &data->first_filter : &data->first_notrace;
2010
2011 /*
 2012	 * The 'ip' field could have multiple filters set, separated
 2013	 * either by space or comma. We first cut the filter and apply
 2014	 * all pieces separately.
2015 */
2016 re = ftrace_function_filter_re(buf, len, &re_cnt);
2017 if (!re)
2018 return -EINVAL;
2019
2020 for (i = 0; i < re_cnt; i++) {
2021 ret = ftrace_function_set_regexp(data->ops, filter, *reset,
2022 re[i], strlen(re[i]));
2023 if (ret)
2024 break;
2025
2026 if (*reset)
2027 *reset = 0;
2028 }
2029
2030 argv_free(re);
2031 return ret;
2032}
2033
2034static int ftrace_function_check_pred(struct filter_pred *pred, int leaf)
2035{
2036 struct ftrace_event_field *field = pred->field;
2037
2038 if (leaf) {
2039 /*
2040 * Check the leaf predicate for function trace, verify:
2041 * - only '==' and '!=' is used
2042 * - the 'ip' field is used
2043 */
2044 if ((pred->op != OP_EQ) && (pred->op != OP_NE))
2045 return -EINVAL;
2046
2047 if (strcmp(field->name, "ip"))
2048 return -EINVAL;
2049 } else {
2050 /*
 2051		 * Check the non-leaf predicate for function trace, verify:
2052 * - only '||' is used
2053 */
2054 if (pred->op != OP_OR)
2055 return -EINVAL;
2056 }
2057
2058 return 0;
2059}
2060
2061static int ftrace_function_set_filter_cb(enum move_type move,
2062 struct filter_pred *pred,
2063 int *err, void *data)
2064{
2065 /* Checking the node is valid for function trace. */
2066 if ((move != MOVE_DOWN) ||
2067 (pred->left != FILTER_PRED_INVALID)) {
2068 *err = ftrace_function_check_pred(pred, 0);
2069 } else {
2070 *err = ftrace_function_check_pred(pred, 1);
2071 if (*err)
2072 return WALK_PRED_ABORT;
2073
2074 *err = __ftrace_function_set_filter(pred->op == OP_EQ,
2075 pred->regex.pattern,
2076 pred->regex.len,
2077 data);
2078 }
2079
2080 return (*err) ? WALK_PRED_ABORT : WALK_PRED_DEFAULT;
2081}
2082
2083static int ftrace_function_set_filter(struct perf_event *event,
2084 struct event_filter *filter)
2085{
2086 struct function_filter_data data = {
2087 .first_filter = 1,
2088 .first_notrace = 1,
2089 .ops = &event->ftrace_ops,
2090 };
2091
2092 return walk_pred_tree(filter->preds, filter->root,
2093 ftrace_function_set_filter_cb, &data);
2094}
2095#else
2096static int ftrace_function_set_filter(struct perf_event *event,
2097 struct event_filter *filter)
2098{
2099 return -ENODEV;
2100}
2101#endif /* CONFIG_FUNCTION_TRACER */
2102
1953int ftrace_profile_set_filter(struct perf_event *event, int event_id, 2103int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1954 char *filter_str) 2104 char *filter_str)
1955{ 2105{
@@ -1970,9 +2120,16 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1970 goto out_unlock; 2120 goto out_unlock;
1971 2121
1972 err = create_filter(call, filter_str, false, &filter); 2122 err = create_filter(call, filter_str, false, &filter);
1973 if (!err) 2123 if (err)
1974 event->filter = filter; 2124 goto free_filter;
2125
2126 if (ftrace_event_is_function(call))
2127 err = ftrace_function_set_filter(event, filter);
1975 else 2128 else
2129 event->filter = filter;
2130
2131free_filter:
2132 if (err || ftrace_event_is_function(call))
1976 __free_filter(filter); 2133 __free_filter(filter);
1977 2134
1978out_unlock: 2135out_unlock:
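
ftrace_function_filter_re() above normalises commas to spaces and lets argv_split() tokenise the 'ip' filter value, after which each piece is applied with ftrace_set_filter() or ftrace_set_notrace(), resetting the existing list only for the first piece. A rough user-space approximation of the splitting step, using strtok_r() instead of argv_split() and an invented apply callback:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Split a filter expression on spaces and commas, mimicking the
 * "convert ',' to ' ' then argv_split()" step in the patch, and hand
 * each piece to the apply callback together with a reset flag.
 */
static int split_filter(const char *expr, void (*apply)(const char *, int))
{
	char *dup = strdup(expr);
	char *tok, *save = NULL;
	int first = 1;

	if (!dup)
		return -1;

	for (tok = strtok_r(dup, " ,", &save); tok;
	     tok = strtok_r(NULL, " ,", &save)) {
		apply(tok, first);	/* first piece resets the filter list */
		first = 0;
	}
	free(dup);
	return 0;
}

static void apply_piece(const char *pattern, int reset)
{
	printf("set filter \"%s\" (reset=%d)\n", pattern, reset);
}

int main(void)
{
	/* An example value with comma- and space-separated patterns. */
	return split_filter("do_sys_open,vfs_* sys_read", apply_piece);
}
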
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index bbeec31e0ae3..7b46c9bd22ae 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -18,6 +18,16 @@
18#undef TRACE_SYSTEM 18#undef TRACE_SYSTEM
19#define TRACE_SYSTEM ftrace 19#define TRACE_SYSTEM ftrace
20 20
21/*
 22 * The FTRACE_ENTRY_REG macro allows an ftrace entry to define a register
 23 * function and thus become accessible via perf.
24 */
25#undef FTRACE_ENTRY_REG
26#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \
27 filter, regfn) \
28 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
29 filter)
30
21/* not needed for this file */ 31/* not needed for this file */
22#undef __field_struct 32#undef __field_struct
23#define __field_struct(type, item) 33#define __field_struct(type, item)
@@ -44,21 +54,22 @@
44#define F_printk(fmt, args...) fmt, args 54#define F_printk(fmt, args...) fmt, args
45 55
46#undef FTRACE_ENTRY 56#undef FTRACE_ENTRY
47#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ 57#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
48struct ____ftrace_##name { \ 58struct ____ftrace_##name { \
49 tstruct \ 59 tstruct \
50}; \ 60}; \
51static void __always_unused ____ftrace_check_##name(void) \ 61static void __always_unused ____ftrace_check_##name(void) \
52{ \ 62{ \
53 struct ____ftrace_##name *__entry = NULL; \ 63 struct ____ftrace_##name *__entry = NULL; \
54 \ 64 \
55 /* force compile-time check on F_printk() */ \ 65 /* force compile-time check on F_printk() */ \
56 printk(print); \ 66 printk(print); \
57} 67}
58 68
59#undef FTRACE_ENTRY_DUP 69#undef FTRACE_ENTRY_DUP
60#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \ 70#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print, filter) \
61 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) 71 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
72 filter)
62 73
63#include "trace_entries.h" 74#include "trace_entries.h"
64 75
@@ -67,7 +78,7 @@ static void __always_unused ____ftrace_check_##name(void) \
67 ret = trace_define_field(event_call, #type, #item, \ 78 ret = trace_define_field(event_call, #type, #item, \
68 offsetof(typeof(field), item), \ 79 offsetof(typeof(field), item), \
69 sizeof(field.item), \ 80 sizeof(field.item), \
70 is_signed_type(type), FILTER_OTHER); \ 81 is_signed_type(type), filter_type); \
71 if (ret) \ 82 if (ret) \
72 return ret; 83 return ret;
73 84
@@ -77,7 +88,7 @@ static void __always_unused ____ftrace_check_##name(void) \
77 offsetof(typeof(field), \ 88 offsetof(typeof(field), \
78 container.item), \ 89 container.item), \
79 sizeof(field.container.item), \ 90 sizeof(field.container.item), \
80 is_signed_type(type), FILTER_OTHER); \ 91 is_signed_type(type), filter_type); \
81 if (ret) \ 92 if (ret) \
82 return ret; 93 return ret;
83 94
@@ -91,7 +102,7 @@ static void __always_unused ____ftrace_check_##name(void) \
91 ret = trace_define_field(event_call, event_storage, #item, \ 102 ret = trace_define_field(event_call, event_storage, #item, \
92 offsetof(typeof(field), item), \ 103 offsetof(typeof(field), item), \
93 sizeof(field.item), \ 104 sizeof(field.item), \
94 is_signed_type(type), FILTER_OTHER); \ 105 is_signed_type(type), filter_type); \
95 mutex_unlock(&event_storage_mutex); \ 106 mutex_unlock(&event_storage_mutex); \
96 if (ret) \ 107 if (ret) \
97 return ret; \ 108 return ret; \
@@ -104,7 +115,7 @@ static void __always_unused ____ftrace_check_##name(void) \
104 offsetof(typeof(field), \ 115 offsetof(typeof(field), \
105 container.item), \ 116 container.item), \
106 sizeof(field.container.item), \ 117 sizeof(field.container.item), \
107 is_signed_type(type), FILTER_OTHER); \ 118 is_signed_type(type), filter_type); \
108 if (ret) \ 119 if (ret) \
109 return ret; 120 return ret;
110 121
@@ -112,17 +123,18 @@ static void __always_unused ____ftrace_check_##name(void) \
112#define __dynamic_array(type, item) \ 123#define __dynamic_array(type, item) \
113 ret = trace_define_field(event_call, #type, #item, \ 124 ret = trace_define_field(event_call, #type, #item, \
114 offsetof(typeof(field), item), \ 125 offsetof(typeof(field), item), \
115 0, is_signed_type(type), FILTER_OTHER);\ 126 0, is_signed_type(type), filter_type);\
116 if (ret) \ 127 if (ret) \
117 return ret; 128 return ret;
118 129
119#undef FTRACE_ENTRY 130#undef FTRACE_ENTRY
120#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ 131#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
121int \ 132int \
122ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ 133ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
123{ \ 134{ \
124 struct struct_name field; \ 135 struct struct_name field; \
125 int ret; \ 136 int ret; \
137 int filter_type = filter; \
126 \ 138 \
127 tstruct; \ 139 tstruct; \
128 \ 140 \
@@ -152,13 +164,15 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
152#undef F_printk 164#undef F_printk
153#define F_printk(fmt, args...) #fmt ", " __stringify(args) 165#define F_printk(fmt, args...) #fmt ", " __stringify(args)
154 166
155#undef FTRACE_ENTRY 167#undef FTRACE_ENTRY_REG
156#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ 168#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\
169 regfn) \
157 \ 170 \
158struct ftrace_event_class event_class_ftrace_##call = { \ 171struct ftrace_event_class event_class_ftrace_##call = { \
159 .system = __stringify(TRACE_SYSTEM), \ 172 .system = __stringify(TRACE_SYSTEM), \
160 .define_fields = ftrace_define_fields_##call, \ 173 .define_fields = ftrace_define_fields_##call, \
161 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ 174 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
175 .reg = regfn, \
162}; \ 176}; \
163 \ 177 \
164struct ftrace_event_call __used event_##call = { \ 178struct ftrace_event_call __used event_##call = { \
@@ -170,4 +184,14 @@ struct ftrace_event_call __used event_##call = { \
170struct ftrace_event_call __used \ 184struct ftrace_event_call __used \
171__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; 185__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
172 186
187#undef FTRACE_ENTRY
188#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print, filter) \
189 FTRACE_ENTRY_REG(call, struct_name, etype, \
190 PARAMS(tstruct), PARAMS(print), filter, NULL)
191
192int ftrace_event_is_function(struct ftrace_event_call *call)
193{
194 return call == &event_function;
195}
196
173#include "trace_entries.h" 197#include "trace_entries.h"
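
trace_export.c now defines FTRACE_ENTRY() as FTRACE_ENTRY_REG() with a NULL reg function, so only entries that spell out FTRACE_ENTRY_REG (the function entry, wired to perf_ftrace_event_register) get a .reg callback. The same macro-defaulting trick in miniature, with invented structure and entry names:

#include <stdio.h>

typedef int (*reg_fn)(void);

struct event_class {
	const char *name;
	reg_fn reg;		/* NULL unless the entry supplies one */
};

static int function_reg(void) { printf("function entry registered\n"); return 0; }

/* The full form takes an explicit reg function... */
#define ENTRY_REG(name, regfn)						\
	static struct event_class class_##name = { #name, regfn };

/* ...and the plain form just forwards with regfn = NULL. */
#define ENTRY(name) ENTRY_REG(name, NULL)

ENTRY_REG(function, function_reg)
ENTRY(bprint)
ENTRY(print)

int main(void)
{
	struct event_class *classes[] = {
		&class_function, &class_bprint, &class_print
	};

	for (unsigned i = 0; i < sizeof(classes) / sizeof(classes[0]); i++)
		printf("%-10s reg=%s\n", classes[i]->name,
		       classes[i]->reg ? "yes" : "no");
	return 0;
}
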
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 00d527c945a4..580a05ec926b 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1892,7 +1892,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1892#endif /* CONFIG_PERF_EVENTS */ 1892#endif /* CONFIG_PERF_EVENTS */
1893 1893
1894static __kprobes 1894static __kprobes
1895int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) 1895int kprobe_register(struct ftrace_event_call *event,
1896 enum trace_reg type, void *data)
1896{ 1897{
1897 struct trace_probe *tp = (struct trace_probe *)event->data; 1898 struct trace_probe *tp = (struct trace_probe *)event->data;
1898 1899
@@ -1909,6 +1910,11 @@ int kprobe_register(struct ftrace_event_call *event, enum trace_reg type)
1909 case TRACE_REG_PERF_UNREGISTER: 1910 case TRACE_REG_PERF_UNREGISTER:
1910 disable_trace_probe(tp, TP_FLAG_PROFILE); 1911 disable_trace_probe(tp, TP_FLAG_PROFILE);
1911 return 0; 1912 return 0;
1913 case TRACE_REG_PERF_OPEN:
1914 case TRACE_REG_PERF_CLOSE:
1915 case TRACE_REG_PERF_ADD:
1916 case TRACE_REG_PERF_DEL:
1917 return 0;
1912#endif 1918#endif
1913 } 1919 }
1914 return 0; 1920 return 0;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 0d6ff3555942..859fae6b1825 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -264,7 +264,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
264 return ret; 264 return ret;
265} 265}
266 266
267int trace_seq_path(struct trace_seq *s, struct path *path) 267int trace_seq_path(struct trace_seq *s, const struct path *path)
268{ 268{
269 unsigned char *p; 269 unsigned char *p;
270 270
@@ -300,7 +300,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
300 unsigned long mask; 300 unsigned long mask;
301 const char *str; 301 const char *str;
302 const char *ret = p->buffer + p->len; 302 const char *ret = p->buffer + p->len;
303 int i; 303 int i, first = 1;
304 304
305 for (i = 0; flag_array[i].name && flags; i++) { 305 for (i = 0; flag_array[i].name && flags; i++) {
306 306
@@ -310,14 +310,16 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
310 310
311 str = flag_array[i].name; 311 str = flag_array[i].name;
312 flags &= ~mask; 312 flags &= ~mask;
313 if (p->len && delim) 313 if (!first && delim)
314 trace_seq_puts(p, delim); 314 trace_seq_puts(p, delim);
315 else
316 first = 0;
315 trace_seq_puts(p, str); 317 trace_seq_puts(p, str);
316 } 318 }
317 319
318 /* check for left over flags */ 320 /* check for left over flags */
319 if (flags) { 321 if (flags) {
320 if (p->len && delim) 322 if (!first && delim)
321 trace_seq_puts(p, delim); 323 trace_seq_puts(p, delim);
322 trace_seq_printf(p, "0x%lx", flags); 324 trace_seq_printf(p, "0x%lx", flags);
323 } 325 }
@@ -344,7 +346,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
344 break; 346 break;
345 } 347 }
346 348
347 if (!p->len) 349 if (ret == (const char *)(p->buffer + p->len))
348 trace_seq_printf(p, "0x%lx", val); 350 trace_seq_printf(p, "0x%lx", val);
349 351
350 trace_seq_putc(p, 0); 352 trace_seq_putc(p, 0);
@@ -370,7 +372,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
370 break; 372 break;
371 } 373 }
372 374
373 if (!p->len) 375 if (ret == (const char *)(p->buffer + p->len))
374 trace_seq_printf(p, "0x%llx", val); 376 trace_seq_printf(p, "0x%llx", val);
375 377
376 trace_seq_putc(p, 0); 378 trace_seq_putc(p, 0);
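
The ftrace_print_flags_seq() fix above stops using p->len as a "have we printed a flag yet?" test, because the trace_seq buffer may already contain earlier output, and tracks that state in a local first flag instead; the *_symbols_seq() hunks compare against the saved start position for the same reason. A small user-space rendition of the corrected flag-printing loop, with an invented flag table:

#include <stdio.h>

struct flag_name { unsigned long mask; const char *name; };

/*
 * Print the names of the bits set in "flags", separated by "delim".
 * The local "first" flag, not the length of any shared output buffer,
 * decides whether a delimiter is needed.
 */
static void print_flags(unsigned long flags, const char *delim,
			const struct flag_name *table)
{
	int first = 1;

	for (; table->name && flags; table++) {
		if (!(flags & table->mask))
			continue;
		flags &= ~table->mask;
		if (!first)
			fputs(delim, stdout);
		first = 0;
		fputs(table->name, stdout);
	}
	if (flags) {			/* leftover, unnamed bits */
		if (!first)
			fputs(delim, stdout);
		printf("0x%lx", flags);
	}
	putchar('\n');
}

int main(void)
{
	static const struct flag_name demo[] = {
		{ 0x1, "WAIT" }, { 0x2, "IO" }, { 0x4, "FS" }, { 0, NULL }
	};

	print_flags(0x7,  "|", demo);	/* WAIT|IO|FS   */
	print_flags(0x13, "|", demo);	/* WAIT|IO|0x10 */
	return 0;
}
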
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index cb654542c1a1..96fc73369099 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -17,9 +17,9 @@ static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
17static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); 17static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
18 18
19static int syscall_enter_register(struct ftrace_event_call *event, 19static int syscall_enter_register(struct ftrace_event_call *event,
20 enum trace_reg type); 20 enum trace_reg type, void *data);
21static int syscall_exit_register(struct ftrace_event_call *event, 21static int syscall_exit_register(struct ftrace_event_call *event,
22 enum trace_reg type); 22 enum trace_reg type, void *data);
23 23
24static int syscall_enter_define_fields(struct ftrace_event_call *call); 24static int syscall_enter_define_fields(struct ftrace_event_call *call);
25static int syscall_exit_define_fields(struct ftrace_event_call *call); 25static int syscall_exit_define_fields(struct ftrace_event_call *call);
@@ -468,8 +468,8 @@ int __init init_ftrace_syscalls(void)
468 unsigned long addr; 468 unsigned long addr;
469 int i; 469 int i;
470 470
471 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * 471 syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata),
472 NR_syscalls, GFP_KERNEL); 472 GFP_KERNEL);
473 if (!syscalls_metadata) { 473 if (!syscalls_metadata) {
474 WARN_ON(1); 474 WARN_ON(1);
475 return -ENOMEM; 475 return -ENOMEM;
@@ -649,7 +649,7 @@ void perf_sysexit_disable(struct ftrace_event_call *call)
649#endif /* CONFIG_PERF_EVENTS */ 649#endif /* CONFIG_PERF_EVENTS */
650 650
651static int syscall_enter_register(struct ftrace_event_call *event, 651static int syscall_enter_register(struct ftrace_event_call *event,
652 enum trace_reg type) 652 enum trace_reg type, void *data)
653{ 653{
654 switch (type) { 654 switch (type) {
655 case TRACE_REG_REGISTER: 655 case TRACE_REG_REGISTER:
@@ -664,13 +664,18 @@ static int syscall_enter_register(struct ftrace_event_call *event,
664 case TRACE_REG_PERF_UNREGISTER: 664 case TRACE_REG_PERF_UNREGISTER:
665 perf_sysenter_disable(event); 665 perf_sysenter_disable(event);
666 return 0; 666 return 0;
667 case TRACE_REG_PERF_OPEN:
668 case TRACE_REG_PERF_CLOSE:
669 case TRACE_REG_PERF_ADD:
670 case TRACE_REG_PERF_DEL:
671 return 0;
667#endif 672#endif
668 } 673 }
669 return 0; 674 return 0;
670} 675}
671 676
672static int syscall_exit_register(struct ftrace_event_call *event, 677static int syscall_exit_register(struct ftrace_event_call *event,
673 enum trace_reg type) 678 enum trace_reg type, void *data)
674{ 679{
675 switch (type) { 680 switch (type) {
676 case TRACE_REG_REGISTER: 681 case TRACE_REG_REGISTER:
@@ -685,6 +690,11 @@ static int syscall_exit_register(struct ftrace_event_call *event,
685 case TRACE_REG_PERF_UNREGISTER: 690 case TRACE_REG_PERF_UNREGISTER:
686 perf_sysexit_disable(event); 691 perf_sysexit_disable(event);
687 return 0; 692 return 0;
693 case TRACE_REG_PERF_OPEN:
694 case TRACE_REG_PERF_CLOSE:
695 case TRACE_REG_PERF_ADD:
696 case TRACE_REG_PERF_DEL:
697 return 0;
688#endif 698#endif
689 } 699 }
690 return 0; 700 return 0;
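
Several hunks in this patch set (trace_events_filter.c above, trace_syscalls.c here) replace kzalloc(sizeof(x) * n, ...) with kcalloc(n, sizeof(x), ...), which still zeroes the array but also rejects multiplications that would overflow. The user-space analogue is calloc() versus a bare malloc(n * size); a short demonstration with an invented element type:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct meta { long nb_args; void *types; };	/* stand-in element type */

int main(void)
{
	size_t huge = SIZE_MAX / sizeof(struct meta) + 2;

	/* calloc() checks n * size for overflow and fails cleanly... */
	struct meta *a = calloc(huge, sizeof(*a));

	printf("calloc(huge): %s\n",
	       a ? "unexpectedly succeeded" : "NULL (overflow caught)");

	/*
	 * ...whereas the naive multiplication silently wraps around, so a
	 * plain malloc() would hand back a buffer far smaller than intended.
	 */
	printf("huge * size wraps to %zu bytes\n", huge * sizeof(struct meta));

	free(a);
	return 0;
}
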
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index f1539decd99d..d96ba22dabfa 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -25,7 +25,7 @@
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/jump_label.h> 28#include <linux/static_key.h>
29 29
30extern struct tracepoint * const __start___tracepoints_ptrs[]; 30extern struct tracepoint * const __start___tracepoints_ptrs[];
31extern struct tracepoint * const __stop___tracepoints_ptrs[]; 31extern struct tracepoint * const __stop___tracepoints_ptrs[];
@@ -256,9 +256,9 @@ static void set_tracepoint(struct tracepoint_entry **entry,
256{ 256{
257 WARN_ON(strcmp((*entry)->name, elem->name) != 0); 257 WARN_ON(strcmp((*entry)->name, elem->name) != 0);
258 258
259 if (elem->regfunc && !jump_label_enabled(&elem->key) && active) 259 if (elem->regfunc && !static_key_enabled(&elem->key) && active)
260 elem->regfunc(); 260 elem->regfunc();
261 else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active) 261 else if (elem->unregfunc && static_key_enabled(&elem->key) && !active)
262 elem->unregfunc(); 262 elem->unregfunc();
263 263
264 /* 264 /*
@@ -269,10 +269,10 @@ static void set_tracepoint(struct tracepoint_entry **entry,
269 * is used. 269 * is used.
270 */ 270 */
271 rcu_assign_pointer(elem->funcs, (*entry)->funcs); 271 rcu_assign_pointer(elem->funcs, (*entry)->funcs);
272 if (active && !jump_label_enabled(&elem->key)) 272 if (active && !static_key_enabled(&elem->key))
273 jump_label_inc(&elem->key); 273 static_key_slow_inc(&elem->key);
274 else if (!active && jump_label_enabled(&elem->key)) 274 else if (!active && static_key_enabled(&elem->key))
275 jump_label_dec(&elem->key); 275 static_key_slow_dec(&elem->key);
276} 276}
277 277
278/* 278/*
@@ -283,11 +283,11 @@ static void set_tracepoint(struct tracepoint_entry **entry,
283 */ 283 */
284static void disable_tracepoint(struct tracepoint *elem) 284static void disable_tracepoint(struct tracepoint *elem)
285{ 285{
286 if (elem->unregfunc && jump_label_enabled(&elem->key)) 286 if (elem->unregfunc && static_key_enabled(&elem->key))
287 elem->unregfunc(); 287 elem->unregfunc();
288 288
289 if (jump_label_enabled(&elem->key)) 289 if (static_key_enabled(&elem->key))
290 jump_label_dec(&elem->key); 290 static_key_slow_dec(&elem->key);
291 rcu_assign_pointer(elem->funcs, NULL); 291 rcu_assign_pointer(elem->funcs, NULL);
292} 292}
293 293
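
The tracepoint.c hunk is a mechanical rename from the old jump_label_inc()/jump_label_dec()/jump_label_enabled() API to static_key_slow_inc()/static_key_slow_dec()/static_key_enabled(); the semantics, a reference-counted "is this branch enabled" key, do not change. Roughly, ignoring the actual code patching and using a plain counter as a stand-in:

#include <stdio.h>

/* A toy "static key": a reference count standing in for the patched branch. */
struct toy_key { int enabled; };

static void key_inc(struct toy_key *k)		{ k->enabled++; }
static void key_dec(struct toy_key *k)		{ if (k->enabled > 0) k->enabled--; }
static int  key_enabled(const struct toy_key *k) { return k->enabled > 0; }

static struct toy_key tracepoint_key;

static void set_tracepoint(int active)
{
	/*
	 * Mirror the patch: bump the key when activating an inactive
	 * tracepoint, drop it when deactivating an active one.
	 */
	if (active && !key_enabled(&tracepoint_key))
		key_inc(&tracepoint_key);
	else if (!active && key_enabled(&tracepoint_key))
		key_dec(&tracepoint_key);
}

int main(void)
{
	set_tracepoint(1);
	printf("enabled=%d\n", key_enabled(&tracepoint_key));	/* 1 */
	set_tracepoint(0);
	printf("enabled=%d\n", key_enabled(&tracepoint_key));	/* 0 */
	return 0;
}
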
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index d117262deba3..df30ee08bdd4 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -3,15 +3,14 @@
3 * 3 *
4 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. 4 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
5 * 5 *
6 * this code detects hard lockups: incidents in where on a CPU 6 * Note: Most of this code is borrowed heavily from the original softlockup
7 * the kernel does not respond to anything except NMI. 7 * detector, so thanks to Ingo for the initial implementation.
8 * 8 * Some chunks also taken from the old x86-specific nmi watchdog code, thanks
9 * Note: Most of this code is borrowed heavily from softlockup.c,
10 * so thanks to Ingo for the initial implementation.
11 * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks
12 * to those contributors as well. 9 * to those contributors as well.
13 */ 10 */
14 11
12#define pr_fmt(fmt) "NMI watchdog: " fmt
13
15#include <linux/mm.h> 14#include <linux/mm.h>
16#include <linux/cpu.h> 15#include <linux/cpu.h>
17#include <linux/nmi.h> 16#include <linux/nmi.h>
@@ -117,9 +116,10 @@ static unsigned long get_sample_period(void)
117{ 116{
118 /* 117 /*
119 * convert watchdog_thresh from seconds to ns 118 * convert watchdog_thresh from seconds to ns
120 * the divide by 5 is to give hrtimer 5 chances to 119 * the divide by 5 is to give hrtimer several chances (two
121 * increment before the hardlockup detector generates 120 * or three with the current relation between the soft
122 * a warning 121 * and hard thresholds) to increment before the
122 * hardlockup detector generates a warning
123 */ 123 */
124 return get_softlockup_thresh() * (NSEC_PER_SEC / 5); 124 return get_softlockup_thresh() * (NSEC_PER_SEC / 5);
125} 125}
@@ -321,11 +321,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
321 */ 321 */
322static int watchdog(void *unused) 322static int watchdog(void *unused)
323{ 323{
324 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 324 struct sched_param param = { .sched_priority = 0 };
325 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 325 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
326 326
327 sched_setscheduler(current, SCHED_FIFO, &param);
328
329 /* initialize timestamp */ 327 /* initialize timestamp */
330 __touch_watchdog(); 328 __touch_watchdog();
331 329
@@ -336,9 +334,11 @@ static int watchdog(void *unused)
336 334
337 set_current_state(TASK_INTERRUPTIBLE); 335 set_current_state(TASK_INTERRUPTIBLE);
338 /* 336 /*
339 * Run briefly once per second to reset the softlockup timestamp. 337 * Run briefly (kicked by the hrtimer callback function) once every
340 * If this gets delayed for more than 60 seconds then the 338 * get_sample_period() seconds (4 seconds by default) to reset the
341 * debug-printout triggers in watchdog_timer_fn(). 339 * softlockup timestamp. If this gets delayed for more than
340 * 2*watchdog_thresh seconds then the debug-printout triggers in
341 * watchdog_timer_fn().
342 */ 342 */
343 while (!kthread_should_stop()) { 343 while (!kthread_should_stop()) {
344 __touch_watchdog(); 344 __touch_watchdog();
@@ -349,8 +349,11 @@ static int watchdog(void *unused)
349 349
350 set_current_state(TASK_INTERRUPTIBLE); 350 set_current_state(TASK_INTERRUPTIBLE);
351 } 351 }
352 /*
353 * Drop the policy/priority elevation during thread exit to avoid a
354 * scheduling latency spike.
355 */
352 __set_current_state(TASK_RUNNING); 356 __set_current_state(TASK_RUNNING);
353 param.sched_priority = 0;
354 sched_setscheduler(current, SCHED_NORMAL, &param); 357 sched_setscheduler(current, SCHED_NORMAL, &param);
355 return 0; 358 return 0;
356} 359}
@@ -376,18 +379,20 @@ static int watchdog_nmi_enable(int cpu)
376 /* Try to register using hardware perf events */ 379 /* Try to register using hardware perf events */
377 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); 380 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
378 if (!IS_ERR(event)) { 381 if (!IS_ERR(event)) {
379 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); 382 pr_info("enabled, takes one hw-pmu counter.\n");
380 goto out_save; 383 goto out_save;
381 } 384 }
382 385
383 386
384 /* vary the KERN level based on the returned errno */ 387 /* vary the KERN level based on the returned errno */
385 if (PTR_ERR(event) == -EOPNOTSUPP) 388 if (PTR_ERR(event) == -EOPNOTSUPP)
386 printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu); 389 pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
387 else if (PTR_ERR(event) == -ENOENT) 390 else if (PTR_ERR(event) == -ENOENT)
388 printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu); 391 pr_warning("disabled (cpu%i): hardware events not enabled\n",
392 cpu);
389 else 393 else
390 printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event)); 394 pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
395 cpu, PTR_ERR(event));
391 return PTR_ERR(event); 396 return PTR_ERR(event);
392 397
393 /* success path */ 398 /* success path */
@@ -439,9 +444,10 @@ static int watchdog_enable(int cpu)
439 444
440 /* create the watchdog thread */ 445 /* create the watchdog thread */
441 if (!p) { 446 if (!p) {
447 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
442 p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu); 448 p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu);
443 if (IS_ERR(p)) { 449 if (IS_ERR(p)) {
444 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); 450 pr_err("softlockup watchdog for %i failed\n", cpu);
445 if (!err) { 451 if (!err) {
446 /* if hardlockup hasn't already set this */ 452 /* if hardlockup hasn't already set this */
447 err = PTR_ERR(p); 453 err = PTR_ERR(p);
@@ -450,6 +456,7 @@ static int watchdog_enable(int cpu)
450 } 456 }
451 goto out; 457 goto out;
452 } 458 }
459 sched_setscheduler(p, SCHED_FIFO, &param);
453 kthread_bind(p, cpu); 460 kthread_bind(p, cpu);
454 per_cpu(watchdog_touch_ts, cpu) = 0; 461 per_cpu(watchdog_touch_ts, cpu) = 0;
455 per_cpu(softlockup_watchdog, cpu) = p; 462 per_cpu(softlockup_watchdog, cpu) = p;
@@ -496,7 +503,7 @@ static void watchdog_enable_all_cpus(void)
496 watchdog_enabled = 1; 503 watchdog_enabled = 1;
497 504
498 if (!watchdog_enabled) 505 if (!watchdog_enabled)
499 printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); 506 pr_err("failed to be enabled on some cpus\n");
500 507
501} 508}
502 509
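
Defining pr_fmt() before the includes lets every pr_info()/pr_err() call in watchdog.c share the "NMI watchdog: " prefix instead of repeating it in each format string. The same convention sketched in user space (the printf-backed pr_* macros and the GNU-style ##__VA_ARGS__ are assumptions of this sketch, not the kernel implementation):

#include <stdio.h>

/*
 * Must be defined before the pr_* helpers below, exactly like the
 * kernel's pr_fmt convention of defining it above the #includes.
 */
#define pr_fmt(fmt) "NMI watchdog: " fmt

#define pr_info(fmt, ...) printf("INFO: "  pr_fmt(fmt), ##__VA_ARGS__)
#define pr_err(fmt, ...)  printf("ERROR: " pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
	pr_info("enabled, takes one hw-pmu counter.\n");
	pr_err("disabled (cpu%i): unable to create perf event: %ld\n", 0, -19L);
	return 0;
}
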
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index bec7b5b53e03..5abf42f63c08 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -253,11 +253,13 @@ struct workqueue_struct *system_long_wq __read_mostly;
253struct workqueue_struct *system_nrt_wq __read_mostly; 253struct workqueue_struct *system_nrt_wq __read_mostly;
254struct workqueue_struct *system_unbound_wq __read_mostly; 254struct workqueue_struct *system_unbound_wq __read_mostly;
255struct workqueue_struct *system_freezable_wq __read_mostly; 255struct workqueue_struct *system_freezable_wq __read_mostly;
256struct workqueue_struct *system_nrt_freezable_wq __read_mostly;
256EXPORT_SYMBOL_GPL(system_wq); 257EXPORT_SYMBOL_GPL(system_wq);
257EXPORT_SYMBOL_GPL(system_long_wq); 258EXPORT_SYMBOL_GPL(system_long_wq);
258EXPORT_SYMBOL_GPL(system_nrt_wq); 259EXPORT_SYMBOL_GPL(system_nrt_wq);
259EXPORT_SYMBOL_GPL(system_unbound_wq); 260EXPORT_SYMBOL_GPL(system_unbound_wq);
260EXPORT_SYMBOL_GPL(system_freezable_wq); 261EXPORT_SYMBOL_GPL(system_freezable_wq);
262EXPORT_SYMBOL_GPL(system_nrt_freezable_wq);
261 263
262#define CREATE_TRACE_POINTS 264#define CREATE_TRACE_POINTS
263#include <trace/events/workqueue.h> 265#include <trace/events/workqueue.h>
@@ -474,13 +476,8 @@ static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
474 struct workqueue_struct *wq) 476 struct workqueue_struct *wq)
475{ 477{
476 if (!(wq->flags & WQ_UNBOUND)) { 478 if (!(wq->flags & WQ_UNBOUND)) {
477 if (likely(cpu < nr_cpu_ids)) { 479 if (likely(cpu < nr_cpu_ids))
478#ifdef CONFIG_SMP
479 return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); 480 return per_cpu_ptr(wq->cpu_wq.pcpu, cpu);
480#else
481 return wq->cpu_wq.single;
482#endif
483 }
484 } else if (likely(cpu == WORK_CPU_UNBOUND)) 481 } else if (likely(cpu == WORK_CPU_UNBOUND))
485 return wq->cpu_wq.single; 482 return wq->cpu_wq.single;
486 return NULL; 483 return NULL;
@@ -2897,13 +2894,8 @@ static int alloc_cwqs(struct workqueue_struct *wq)
2897 const size_t size = sizeof(struct cpu_workqueue_struct); 2894 const size_t size = sizeof(struct cpu_workqueue_struct);
2898 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, 2895 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
2899 __alignof__(unsigned long long)); 2896 __alignof__(unsigned long long));
2900#ifdef CONFIG_SMP
2901 bool percpu = !(wq->flags & WQ_UNBOUND);
2902#else
2903 bool percpu = false;
2904#endif
2905 2897
2906 if (percpu) 2898 if (!(wq->flags & WQ_UNBOUND))
2907 wq->cpu_wq.pcpu = __alloc_percpu(size, align); 2899 wq->cpu_wq.pcpu = __alloc_percpu(size, align);
2908 else { 2900 else {
2909 void *ptr; 2901 void *ptr;
@@ -2927,13 +2919,7 @@ static int alloc_cwqs(struct workqueue_struct *wq)
2927 2919
2928static void free_cwqs(struct workqueue_struct *wq) 2920static void free_cwqs(struct workqueue_struct *wq)
2929{ 2921{
2930#ifdef CONFIG_SMP 2922 if (!(wq->flags & WQ_UNBOUND))
2931 bool percpu = !(wq->flags & WQ_UNBOUND);
2932#else
2933 bool percpu = false;
2934#endif
2935
2936 if (percpu)
2937 free_percpu(wq->cpu_wq.pcpu); 2923 free_percpu(wq->cpu_wq.pcpu);
2938 else if (wq->cpu_wq.single) { 2924 else if (wq->cpu_wq.single) {
2939 /* the pointer to free is stored right after the cwq */ 2925 /* the pointer to free is stored right after the cwq */
@@ -3833,8 +3819,11 @@ static int __init init_workqueues(void)
3833 WQ_UNBOUND_MAX_ACTIVE); 3819 WQ_UNBOUND_MAX_ACTIVE);
3834 system_freezable_wq = alloc_workqueue("events_freezable", 3820 system_freezable_wq = alloc_workqueue("events_freezable",
3835 WQ_FREEZABLE, 0); 3821 WQ_FREEZABLE, 0);
3822 system_nrt_freezable_wq = alloc_workqueue("events_nrt_freezable",
3823 WQ_NON_REENTRANT | WQ_FREEZABLE, 0);
3836 BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || 3824 BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq ||
3837 !system_unbound_wq || !system_freezable_wq); 3825 !system_unbound_wq || !system_freezable_wq ||
3826 !system_nrt_freezable_wq);
3838 return 0; 3827 return 0;
3839} 3828}
3840early_initcall(init_workqueues); 3829early_initcall(init_workqueues);
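
The get_cwq()/alloc_cwqs()/free_cwqs() cleanups drop the CONFIG_SMP #ifdefs: the per-cpu allocators and accessors already behave sensibly on UP builds, so the WQ_UNBOUND flag alone can pick between the per-CPU and the single backing store at run time. Schematically, with invented types and none of the real per-cpu machinery:

#include <stdio.h>
#include <stdlib.h>

#define WQ_UNBOUND	0x1
#define NR_CPUS		4

struct cwq { int cpu; };

struct wq {
	unsigned int flags;
	union {
		struct cwq *pcpu;	/* one slot per CPU when bound  */
		struct cwq *single;	/* one shared slot when unbound */
	} cpu_wq;
};

/* One branch on the flag replaces the old CONFIG_SMP #ifdef maze. */
static struct cwq *get_cwq(struct wq *wq, int cpu)
{
	if (!(wq->flags & WQ_UNBOUND))
		return &wq->cpu_wq.pcpu[cpu];
	return wq->cpu_wq.single;
}

static int alloc_cwqs(struct wq *wq)
{
	if (!(wq->flags & WQ_UNBOUND))
		wq->cpu_wq.pcpu = calloc(NR_CPUS, sizeof(struct cwq));
	else
		wq->cpu_wq.single = calloc(1, sizeof(struct cwq));
	return wq->cpu_wq.pcpu ? 0 : -1;
}

int main(void)
{
	struct wq bound = { 0 }, unbound = { WQ_UNBOUND };

	if (alloc_cwqs(&bound) || alloc_cwqs(&unbound))
		return 1;
	printf("bound cpu2 slot:  %p\n", (void *)get_cwq(&bound, 2));
	printf("unbound any slot: %p\n", (void *)get_cwq(&unbound, 0));
	free(bound.cpu_wq.pcpu);
	free(unbound.cpu_wq.single);
	return 0;
}
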