Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.locks | 4
-rw-r--r--  kernel/Kconfig.preempt | 1
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/audit.c | 2
-rw-r--r--  kernel/auditsc.c | 5
-rw-r--r--  kernel/cgroup.c | 354
-rw-r--r--  kernel/cgroup_freezer.c | 11
-rw-r--r--  kernel/cpuset.c | 80
-rw-r--r--  kernel/cred.c | 1
-rw-r--r--  kernel/debug/debug_core.c | 34
-rw-r--r--  kernel/debug/gdbstub.c | 10
-rw-r--r--  kernel/debug/kdb/kdb_bp.c | 7
-rw-r--r--  kernel/debug/kdb/kdb_bt.c | 1
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 2
-rw-r--r--  kernel/debug/kdb/kdb_keyboard.c | 95
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 3
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 7
-rw-r--r--  kernel/debug/kdb/kdb_support.c | 4
-rw-r--r--  kernel/dma.c | 1
-rw-r--r--  kernel/events/callchain.c | 2
-rw-r--r--  kernel/events/core.c | 384
-rw-r--r--  kernel/events/hw_breakpoint.c | 17
-rw-r--r--  kernel/exit.c | 84
-rw-r--r--  kernel/fork.c | 123
-rw-r--r--  kernel/freezer.c | 6
-rw-r--r--  kernel/futex.c | 51
-rw-r--r--  kernel/hung_task.c | 11
-rw-r--r--  kernel/irq/Kconfig | 10
-rw-r--r--  kernel/irq/autoprobe.c | 4
-rw-r--r--  kernel/irq/chip.c | 47
-rw-r--r--  kernel/irq/handle.c | 14
-rw-r--r--  kernel/irq/internals.h | 4
-rw-r--r--  kernel/irq/irqdomain.c | 828
-rw-r--r--  kernel/irq/manage.c | 132
-rw-r--r--  kernel/jump_label.c | 135
-rw-r--r--  kernel/kexec.c | 15
-rw-r--r--  kernel/kmod.c | 84
-rw-r--r--  kernel/kprobes.c | 20
-rw-r--r--  kernel/lockdep.c | 8
-rw-r--r--  kernel/module.c | 37
-rw-r--r--  kernel/mutex.c | 4
-rw-r--r--  kernel/padata.c | 44
-rw-r--r--  kernel/params.c | 43
-rw-r--r--  kernel/pid.c | 4
-rw-r--r--  kernel/pid_namespace.c | 41
-rw-r--r--  kernel/power/Makefile | 3
-rw-r--r--  kernel/power/hibernate.c | 47
-rw-r--r--  kernel/power/main.c | 20
-rw-r--r--  kernel/power/power.h | 23
-rw-r--r--  kernel/power/process.c | 50
-rw-r--r--  kernel/power/qos.c | 23
-rw-r--r--  kernel/power/snapshot.c | 38
-rw-r--r--  kernel/power/suspend.c | 84
-rw-r--r--  kernel/power/user.c | 17
-rw-r--r--  kernel/printk.c | 51
-rw-r--r--  kernel/ptrace.c | 66
-rw-r--r--  kernel/rcu.h | 26
-rw-r--r--  kernel/rcupdate.c | 5
-rw-r--r--  kernel/rcutiny.c | 26
-rw-r--r--  kernel/rcutiny_plugin.h | 77
-rw-r--r--  kernel/rcutorture.c | 99
-rw-r--r--  kernel/rcutree.c | 507
-rw-r--r--  kernel/rcutree.h | 27
-rw-r--r--  kernel/rcutree_plugin.h | 450
-rw-r--r--  kernel/rcutree_trace.c | 12
-rw-r--r--  kernel/relay.c | 10
-rw-r--r--  kernel/res_counter.c | 25
-rw-r--r--  kernel/resource.c | 3
-rw-r--r--  kernel/rwsem.c | 1
-rw-r--r--  kernel/sched/auto_group.c | 12
-rw-r--r--  kernel/sched/core.c | 249
-rw-r--r--  kernel/sched/cpupri.c | 3
-rw-r--r--  kernel/sched/debug.c | 1
-rw-r--r--  kernel/sched/fair.c | 452
-rw-r--r--  kernel/sched/rt.c | 50
-rw-r--r--  kernel/sched/sched.h | 29
-rw-r--r--  kernel/sched/stats.c | 4
-rw-r--r--  kernel/signal.c | 56
-rw-r--r--  kernel/smp.c | 90
-rw-r--r--  kernel/softirq.c | 34
-rw-r--r--  kernel/spinlock.c | 2
-rw-r--r--  kernel/srcu.c | 33
-rw-r--r--  kernel/sys.c | 19
-rw-r--r--  kernel/sysctl.c | 514
-rw-r--r--  kernel/sysctl_check.c | 160
-rw-r--r--  kernel/time.c | 6
-rw-r--r--  kernel/time/alarmtimer.c | 8
-rw-r--r--  kernel/time/clocksource.c | 2
-rw-r--r--  kernel/time/ntp.c | 191
-rw-r--r--  kernel/time/tick-broadcast.c | 4
-rw-r--r--  kernel/time/tick-sched.c | 17
-rw-r--r--  kernel/time/timekeeping.c | 373
-rw-r--r--  kernel/trace/ftrace.c | 134
-rw-r--r--  kernel/trace/trace.c | 6
-rw-r--r--  kernel/trace/trace.h | 38
-rw-r--r--  kernel/trace/trace_entries.h | 54
-rw-r--r--  kernel/trace/trace_event_perf.c | 208
-rw-r--r--  kernel/trace/trace_events.c | 12
-rw-r--r--  kernel/trace/trace_events_filter.c | 175
-rw-r--r--  kernel/trace/trace_export.c | 64
-rw-r--r--  kernel/trace/trace_kprobe.c | 8
-rw-r--r--  kernel/trace/trace_output.c | 14
-rw-r--r--  kernel/trace/trace_syscalls.c | 22
-rw-r--r--  kernel/tracepoint.c | 20
-rw-r--r--  kernel/watchdog.c | 53
-rw-r--r--  kernel/workqueue.c | 29
106 files changed, 4698 insertions, 2913 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 5068e2a4e75f..2251882daf53 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -124,8 +124,8 @@ config INLINE_SPIN_LOCK_IRQSAVE
124 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ 124 def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
125 ARCH_INLINE_SPIN_LOCK_IRQSAVE 125 ARCH_INLINE_SPIN_LOCK_IRQSAVE
126 126
127config INLINE_SPIN_UNLOCK 127config UNINLINE_SPIN_UNLOCK
128 def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK) 128 bool
129 129
130config INLINE_SPIN_UNLOCK_BH 130config INLINE_SPIN_UNLOCK_BH
131 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH 131 def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 24e7cb0ba26a..3f9c97419f02 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -36,6 +36,7 @@ config PREEMPT_VOLUNTARY
36config PREEMPT 36config PREEMPT
37 bool "Preemptible Kernel (Low-Latency Desktop)" 37 bool "Preemptible Kernel (Low-Latency Desktop)"
38 select PREEMPT_COUNT 38 select PREEMPT_COUNT
39 select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
39 help 40 help
40 This option reduces the latency of the kernel by making 41 This option reduces the latency of the kernel by making
41 all kernel code (that is not executing in a critical section) 42 all kernel code (that is not executing in a critical section)
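
The two Kconfig hunks above invert the spin-unlock inlining knob: the computed default INLINE_SPIN_UNLOCK becomes a plain bool UNINLINE_SPIN_UNLOCK that PREEMPT selects whenever the architecture does not inline its unlock path. A minimal, compilable sketch of the pattern such a symbol typically drives (hypothetical demo_* names, not the real spinlock headers), built with or without -DCONFIG_UNINLINE_SPIN_UNLOCK:

/* demo_unlock.c - illustration only of inline vs. out-of-line selection */
#include <stdio.h>

#ifdef CONFIG_UNINLINE_SPIN_UNLOCK
/* one shared out-of-line copy: smaller code, one extra call per unlock */
static void demo_unlock(int *lock) { *lock = 0; }
#else
/* body inlined at every call site: larger code, no call overhead */
static inline void demo_unlock(int *lock) { *lock = 0; }
#endif

int main(void)
{
	int lock = 1;

	demo_unlock(&lock);
	printf("lock=%d\n", lock);
	return 0;
}
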
diff --git a/kernel/Makefile b/kernel/Makefile
index 2d9de86b7e76..cb41b9547c9f 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -27,7 +27,6 @@ obj-y += power/
27 27
28obj-$(CONFIG_FREEZER) += freezer.o 28obj-$(CONFIG_FREEZER) += freezer.o
29obj-$(CONFIG_PROFILING) += profile.o 29obj-$(CONFIG_PROFILING) += profile.o
30obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
31obj-$(CONFIG_STACKTRACE) += stacktrace.o 30obj-$(CONFIG_STACKTRACE) += stacktrace.o
32obj-y += time/ 31obj-y += time/
33obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o 32obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
diff --git a/kernel/audit.c b/kernel/audit.c
index bb0eb5bb9a0a..1c7f2c61416b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1418,7 +1418,7 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
1418 1418
1419/* This is a helper-function to print the escaped d_path */ 1419/* This is a helper-function to print the escaped d_path */
1420void audit_log_d_path(struct audit_buffer *ab, const char *prefix, 1420void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
1421 struct path *path) 1421 const struct path *path)
1422{ 1422{
1423 char *p, *pathname; 1423 char *p, *pathname;
1424 1424
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index caaea6e944f8..af1de0f34eae 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1863,11 +1863,12 @@ void __audit_syscall_entry(int arch, int major,
1863 1863
1864/** 1864/**
1865 * audit_syscall_exit - deallocate audit context after a system call 1865 * audit_syscall_exit - deallocate audit context after a system call
1866 * @pt_regs: syscall registers 1866 * @success: success value of the syscall
1867 * @return_code: return value of the syscall
1867 * 1868 *
1868 * Tear down after system call. If the audit context has been marked as 1869 * Tear down after system call. If the audit context has been marked as
1869 * auditable (either because of the AUDIT_RECORD_CONTEXT state from 1870 * auditable (either because of the AUDIT_RECORD_CONTEXT state from
1870 * filtering, or because some other part of the kernel write an audit 1871 * filtering, or because some other part of the kernel wrote an audit
1871 * message), then write out the syscall information. In call cases, 1872 * message), then write out the syscall information. In call cases,
1872 * free the names stored from getname(). 1873 * free the names stored from getname().
1873 */ 1874 */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a5d3b5325f77..f4ea4b6f3cf1 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -818,7 +818,7 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
818 818
819 for_each_subsys(cgrp->root, ss) 819 for_each_subsys(cgrp->root, ss)
820 if (ss->pre_destroy) { 820 if (ss->pre_destroy) {
821 ret = ss->pre_destroy(ss, cgrp); 821 ret = ss->pre_destroy(cgrp);
822 if (ret) 822 if (ret)
823 break; 823 break;
824 } 824 }
@@ -846,7 +846,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
846 * Release the subsystem state objects. 846 * Release the subsystem state objects.
847 */ 847 */
848 for_each_subsys(cgrp->root, ss) 848 for_each_subsys(cgrp->root, ss)
849 ss->destroy(ss, cgrp); 849 ss->destroy(cgrp);
850 850
851 cgrp->root->number_of_cgroups--; 851 cgrp->root->number_of_cgroups--;
852 mutex_unlock(&cgroup_mutex); 852 mutex_unlock(&cgroup_mutex);
@@ -1015,7 +1015,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1015 list_move(&ss->sibling, &root->subsys_list); 1015 list_move(&ss->sibling, &root->subsys_list);
1016 ss->root = root; 1016 ss->root = root;
1017 if (ss->bind) 1017 if (ss->bind)
1018 ss->bind(ss, cgrp); 1018 ss->bind(cgrp);
1019 mutex_unlock(&ss->hierarchy_mutex); 1019 mutex_unlock(&ss->hierarchy_mutex);
1020 /* refcount was already taken, and we're keeping it */ 1020 /* refcount was already taken, and we're keeping it */
1021 } else if (bit & removed_bits) { 1021 } else if (bit & removed_bits) {
@@ -1025,7 +1025,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
1025 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 1025 BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
1026 mutex_lock(&ss->hierarchy_mutex); 1026 mutex_lock(&ss->hierarchy_mutex);
1027 if (ss->bind) 1027 if (ss->bind)
1028 ss->bind(ss, dummytop); 1028 ss->bind(dummytop);
1029 dummytop->subsys[i]->cgroup = dummytop; 1029 dummytop->subsys[i]->cgroup = dummytop;
1030 cgrp->subsys[i] = NULL; 1030 cgrp->subsys[i] = NULL;
1031 subsys[i]->root = &rootnode; 1031 subsys[i]->root = &rootnode;
@@ -1472,7 +1472,6 @@ static int cgroup_get_rootdir(struct super_block *sb)
1472 1472
1473 struct inode *inode = 1473 struct inode *inode =
1474 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); 1474 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
1475 struct dentry *dentry;
1476 1475
1477 if (!inode) 1476 if (!inode)
1478 return -ENOMEM; 1477 return -ENOMEM;
@@ -1481,12 +1480,9 @@ static int cgroup_get_rootdir(struct super_block *sb)
1481 inode->i_op = &cgroup_dir_inode_operations; 1480 inode->i_op = &cgroup_dir_inode_operations;
1482 /* directories start off with i_nlink == 2 (for "." entry) */ 1481 /* directories start off with i_nlink == 2 (for "." entry) */
1483 inc_nlink(inode); 1482 inc_nlink(inode);
1484 dentry = d_alloc_root(inode); 1483 sb->s_root = d_make_root(inode);
1485 if (!dentry) { 1484 if (!sb->s_root)
1486 iput(inode);
1487 return -ENOMEM; 1485 return -ENOMEM;
1488 }
1489 sb->s_root = dentry;
1490 /* for everything else we want ->d_op set */ 1486 /* for everything else we want ->d_op set */
1491 sb->s_d_op = &cgroup_dops; 1487 sb->s_d_op = &cgroup_dops;
1492 return 0; 1488 return 0;
@@ -1763,6 +1759,7 @@ EXPORT_SYMBOL_GPL(cgroup_path);
1763struct task_and_cgroup { 1759struct task_and_cgroup {
1764 struct task_struct *task; 1760 struct task_struct *task;
1765 struct cgroup *cgrp; 1761 struct cgroup *cgrp;
1762 struct css_set *cg;
1766}; 1763};
1767 1764
1768struct cgroup_taskset { 1765struct cgroup_taskset {
@@ -1843,11 +1840,10 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size);
1843 * will already exist. If not set, this function might sleep, and can fail with 1840 * will already exist. If not set, this function might sleep, and can fail with
1844 * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked. 1841 * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
1845 */ 1842 */
1846static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, 1843static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1847 struct task_struct *tsk, bool guarantee) 1844 struct task_struct *tsk, struct css_set *newcg)
1848{ 1845{
1849 struct css_set *oldcg; 1846 struct css_set *oldcg;
1850 struct css_set *newcg;
1851 1847
1852 /* 1848 /*
1853 * We are synchronized through threadgroup_lock() against PF_EXITING 1849 * We are synchronized through threadgroup_lock() against PF_EXITING
@@ -1857,23 +1853,6 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1857 WARN_ON_ONCE(tsk->flags & PF_EXITING); 1853 WARN_ON_ONCE(tsk->flags & PF_EXITING);
1858 oldcg = tsk->cgroups; 1854 oldcg = tsk->cgroups;
1859 1855
1860 /* locate or allocate a new css_set for this task. */
1861 if (guarantee) {
1862 /* we know the css_set we want already exists. */
1863 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
1864 read_lock(&css_set_lock);
1865 newcg = find_existing_css_set(oldcg, cgrp, template);
1866 BUG_ON(!newcg);
1867 get_css_set(newcg);
1868 read_unlock(&css_set_lock);
1869 } else {
1870 might_sleep();
1871 /* find_css_set will give us newcg already referenced. */
1872 newcg = find_css_set(oldcg, cgrp);
1873 if (!newcg)
1874 return -ENOMEM;
1875 }
1876
1877 task_lock(tsk); 1856 task_lock(tsk);
1878 rcu_assign_pointer(tsk->cgroups, newcg); 1857 rcu_assign_pointer(tsk->cgroups, newcg);
1879 task_unlock(tsk); 1858 task_unlock(tsk);
@@ -1892,7 +1871,6 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
1892 put_css_set(oldcg); 1871 put_css_set(oldcg);
1893 1872
1894 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1873 set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
1895 return 0;
1896} 1874}
1897 1875
1898/** 1876/**
@@ -1910,6 +1888,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1910 struct cgroup *oldcgrp; 1888 struct cgroup *oldcgrp;
1911 struct cgroupfs_root *root = cgrp->root; 1889 struct cgroupfs_root *root = cgrp->root;
1912 struct cgroup_taskset tset = { }; 1890 struct cgroup_taskset tset = { };
1891 struct css_set *newcg;
1913 1892
1914 /* @tsk either already exited or can't exit until the end */ 1893 /* @tsk either already exited or can't exit until the end */
1915 if (tsk->flags & PF_EXITING) 1894 if (tsk->flags & PF_EXITING)
@@ -1925,7 +1904,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1925 1904
1926 for_each_subsys(root, ss) { 1905 for_each_subsys(root, ss) {
1927 if (ss->can_attach) { 1906 if (ss->can_attach) {
1928 retval = ss->can_attach(ss, cgrp, &tset); 1907 retval = ss->can_attach(cgrp, &tset);
1929 if (retval) { 1908 if (retval) {
1930 /* 1909 /*
1931 * Remember on which subsystem the can_attach() 1910 * Remember on which subsystem the can_attach()
@@ -1939,13 +1918,17 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1939 } 1918 }
1940 } 1919 }
1941 1920
1942 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); 1921 newcg = find_css_set(tsk->cgroups, cgrp);
1943 if (retval) 1922 if (!newcg) {
1923 retval = -ENOMEM;
1944 goto out; 1924 goto out;
1925 }
1926
1927 cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg);
1945 1928
1946 for_each_subsys(root, ss) { 1929 for_each_subsys(root, ss) {
1947 if (ss->attach) 1930 if (ss->attach)
1948 ss->attach(ss, cgrp, &tset); 1931 ss->attach(cgrp, &tset);
1949 } 1932 }
1950 1933
1951 synchronize_rcu(); 1934 synchronize_rcu();
@@ -1967,7 +1950,7 @@ out:
1967 */ 1950 */
1968 break; 1951 break;
1969 if (ss->cancel_attach) 1952 if (ss->cancel_attach)
1970 ss->cancel_attach(ss, cgrp, &tset); 1953 ss->cancel_attach(cgrp, &tset);
1971 } 1954 }
1972 } 1955 }
1973 return retval; 1956 return retval;
@@ -1997,66 +1980,6 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
1997} 1980}
1998EXPORT_SYMBOL_GPL(cgroup_attach_task_all); 1981EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
1999 1982
2000/*
2001 * cgroup_attach_proc works in two stages, the first of which prefetches all
2002 * new css_sets needed (to make sure we have enough memory before committing
2003 * to the move) and stores them in a list of entries of the following type.
2004 * TODO: possible optimization: use css_set->rcu_head for chaining instead
2005 */
2006struct cg_list_entry {
2007 struct css_set *cg;
2008 struct list_head links;
2009};
2010
2011static bool css_set_check_fetched(struct cgroup *cgrp,
2012 struct task_struct *tsk, struct css_set *cg,
2013 struct list_head *newcg_list)
2014{
2015 struct css_set *newcg;
2016 struct cg_list_entry *cg_entry;
2017 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
2018
2019 read_lock(&css_set_lock);
2020 newcg = find_existing_css_set(cg, cgrp, template);
2021 read_unlock(&css_set_lock);
2022
2023 /* doesn't exist at all? */
2024 if (!newcg)
2025 return false;
2026 /* see if it's already in the list */
2027 list_for_each_entry(cg_entry, newcg_list, links)
2028 if (cg_entry->cg == newcg)
2029 return true;
2030
2031 /* not found */
2032 return false;
2033}
2034
2035/*
2036 * Find the new css_set and store it in the list in preparation for moving the
2037 * given task to the given cgroup. Returns 0 or -ENOMEM.
2038 */
2039static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
2040 struct list_head *newcg_list)
2041{
2042 struct css_set *newcg;
2043 struct cg_list_entry *cg_entry;
2044
2045 /* ensure a new css_set will exist for this thread */
2046 newcg = find_css_set(cg, cgrp);
2047 if (!newcg)
2048 return -ENOMEM;
2049 /* add it to the list */
2050 cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
2051 if (!cg_entry) {
2052 put_css_set(newcg);
2053 return -ENOMEM;
2054 }
2055 cg_entry->cg = newcg;
2056 list_add(&cg_entry->links, newcg_list);
2057 return 0;
2058}
2059
2060/** 1983/**
2061 * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup 1984 * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
2062 * @cgrp: the cgroup to attach to 1985 * @cgrp: the cgroup to attach to
@@ -2070,20 +1993,12 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2070 int retval, i, group_size; 1993 int retval, i, group_size;
2071 struct cgroup_subsys *ss, *failed_ss = NULL; 1994 struct cgroup_subsys *ss, *failed_ss = NULL;
2072 /* guaranteed to be initialized later, but the compiler needs this */ 1995 /* guaranteed to be initialized later, but the compiler needs this */
2073 struct css_set *oldcg;
2074 struct cgroupfs_root *root = cgrp->root; 1996 struct cgroupfs_root *root = cgrp->root;
2075 /* threadgroup list cursor and array */ 1997 /* threadgroup list cursor and array */
2076 struct task_struct *tsk; 1998 struct task_struct *tsk;
2077 struct task_and_cgroup *tc; 1999 struct task_and_cgroup *tc;
2078 struct flex_array *group; 2000 struct flex_array *group;
2079 struct cgroup_taskset tset = { }; 2001 struct cgroup_taskset tset = { };
2080 /*
2081 * we need to make sure we have css_sets for all the tasks we're
2082 * going to move -before- we actually start moving them, so that in
2083 * case we get an ENOMEM we can bail out before making any changes.
2084 */
2085 struct list_head newcg_list;
2086 struct cg_list_entry *cg_entry, *temp_nobe;
2087 2002
2088 /* 2003 /*
2089 * step 0: in order to do expensive, possibly blocking operations for 2004 * step 0: in order to do expensive, possibly blocking operations for
@@ -2102,23 +2017,14 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2102 if (retval) 2017 if (retval)
2103 goto out_free_group_list; 2018 goto out_free_group_list;
2104 2019
2105 /* prevent changes to the threadgroup list while we take a snapshot. */
2106 read_lock(&tasklist_lock);
2107 if (!thread_group_leader(leader)) {
2108 /*
2109 * a race with de_thread from another thread's exec() may strip
2110 * us of our leadership, making while_each_thread unsafe to use
2111 * on this task. if this happens, there is no choice but to
2112 * throw this task away and try again (from cgroup_procs_write);
2113 * this is "double-double-toil-and-trouble-check locking".
2114 */
2115 read_unlock(&tasklist_lock);
2116 retval = -EAGAIN;
2117 goto out_free_group_list;
2118 }
2119
2120 tsk = leader; 2020 tsk = leader;
2121 i = 0; 2021 i = 0;
2022 /*
2023 * Prevent freeing of tasks while we take a snapshot. Tasks that are
2024 * already PF_EXITING could be freed from underneath us unless we
2025 * take an rcu_read_lock.
2026 */
2027 rcu_read_lock();
2122 do { 2028 do {
2123 struct task_and_cgroup ent; 2029 struct task_and_cgroup ent;
2124 2030
@@ -2128,24 +2034,24 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2128 2034
2129 /* as per above, nr_threads may decrease, but not increase. */ 2035 /* as per above, nr_threads may decrease, but not increase. */
2130 BUG_ON(i >= group_size); 2036 BUG_ON(i >= group_size);
2131 /*
2132 * saying GFP_ATOMIC has no effect here because we did prealloc
2133 * earlier, but it's good form to communicate our expectations.
2134 */
2135 ent.task = tsk; 2037 ent.task = tsk;
2136 ent.cgrp = task_cgroup_from_root(tsk, root); 2038 ent.cgrp = task_cgroup_from_root(tsk, root);
2137 /* nothing to do if this task is already in the cgroup */ 2039 /* nothing to do if this task is already in the cgroup */
2138 if (ent.cgrp == cgrp) 2040 if (ent.cgrp == cgrp)
2139 continue; 2041 continue;
2042 /*
2043 * saying GFP_ATOMIC has no effect here because we did prealloc
2044 * earlier, but it's good form to communicate our expectations.
2045 */
2140 retval = flex_array_put(group, i, &ent, GFP_ATOMIC); 2046 retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
2141 BUG_ON(retval != 0); 2047 BUG_ON(retval != 0);
2142 i++; 2048 i++;
2143 } while_each_thread(leader, tsk); 2049 } while_each_thread(leader, tsk);
2050 rcu_read_unlock();
2144 /* remember the number of threads in the array for later. */ 2051 /* remember the number of threads in the array for later. */
2145 group_size = i; 2052 group_size = i;
2146 tset.tc_array = group; 2053 tset.tc_array = group;
2147 tset.tc_array_len = group_size; 2054 tset.tc_array_len = group_size;
2148 read_unlock(&tasklist_lock);
2149 2055
2150 /* methods shouldn't be called if no task is actually migrating */ 2056 /* methods shouldn't be called if no task is actually migrating */
2151 retval = 0; 2057 retval = 0;
@@ -2157,7 +2063,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2157 */ 2063 */
2158 for_each_subsys(root, ss) { 2064 for_each_subsys(root, ss) {
2159 if (ss->can_attach) { 2065 if (ss->can_attach) {
2160 retval = ss->can_attach(ss, cgrp, &tset); 2066 retval = ss->can_attach(cgrp, &tset);
2161 if (retval) { 2067 if (retval) {
2162 failed_ss = ss; 2068 failed_ss = ss;
2163 goto out_cancel_attach; 2069 goto out_cancel_attach;
@@ -2169,17 +2075,12 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2169 * step 2: make sure css_sets exist for all threads to be migrated. 2075 * step 2: make sure css_sets exist for all threads to be migrated.
2170 * we use find_css_set, which allocates a new one if necessary. 2076 * we use find_css_set, which allocates a new one if necessary.
2171 */ 2077 */
2172 INIT_LIST_HEAD(&newcg_list);
2173 for (i = 0; i < group_size; i++) { 2078 for (i = 0; i < group_size; i++) {
2174 tc = flex_array_get(group, i); 2079 tc = flex_array_get(group, i);
2175 oldcg = tc->task->cgroups; 2080 tc->cg = find_css_set(tc->task->cgroups, cgrp);
2176 2081 if (!tc->cg) {
2177 /* if we don't already have it in the list get a new one */ 2082 retval = -ENOMEM;
2178 if (!css_set_check_fetched(cgrp, tc->task, oldcg, 2083 goto out_put_css_set_refs;
2179 &newcg_list)) {
2180 retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
2181 if (retval)
2182 goto out_list_teardown;
2183 } 2084 }
2184 } 2085 }
2185 2086
@@ -2190,8 +2091,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2190 */ 2091 */
2191 for (i = 0; i < group_size; i++) { 2092 for (i = 0; i < group_size; i++) {
2192 tc = flex_array_get(group, i); 2093 tc = flex_array_get(group, i);
2193 retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true); 2094 cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg);
2194 BUG_ON(retval);
2195 } 2095 }
2196 /* nothing is sensitive to fork() after this point. */ 2096 /* nothing is sensitive to fork() after this point. */
2197 2097
@@ -2200,7 +2100,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2200 */ 2100 */
2201 for_each_subsys(root, ss) { 2101 for_each_subsys(root, ss) {
2202 if (ss->attach) 2102 if (ss->attach)
2203 ss->attach(ss, cgrp, &tset); 2103 ss->attach(cgrp, &tset);
2204 } 2104 }
2205 2105
2206 /* 2106 /*
@@ -2209,21 +2109,22 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2209 synchronize_rcu(); 2109 synchronize_rcu();
2210 cgroup_wakeup_rmdir_waiter(cgrp); 2110 cgroup_wakeup_rmdir_waiter(cgrp);
2211 retval = 0; 2111 retval = 0;
2212out_list_teardown: 2112out_put_css_set_refs:
2213 /* clean up the list of prefetched css_sets. */ 2113 if (retval) {
2214 list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) { 2114 for (i = 0; i < group_size; i++) {
2215 list_del(&cg_entry->links); 2115 tc = flex_array_get(group, i);
2216 put_css_set(cg_entry->cg); 2116 if (!tc->cg)
2217 kfree(cg_entry); 2117 break;
2118 put_css_set(tc->cg);
2119 }
2218 } 2120 }
2219out_cancel_attach: 2121out_cancel_attach:
2220 /* same deal as in cgroup_attach_task */
2221 if (retval) { 2122 if (retval) {
2222 for_each_subsys(root, ss) { 2123 for_each_subsys(root, ss) {
2223 if (ss == failed_ss) 2124 if (ss == failed_ss)
2224 break; 2125 break;
2225 if (ss->cancel_attach) 2126 if (ss->cancel_attach)
2226 ss->cancel_attach(ss, cgrp, &tset); 2127 ss->cancel_attach(cgrp, &tset);
2227 } 2128 }
2228 } 2129 }
2229out_free_group_list: 2130out_free_group_list:
@@ -2245,22 +2146,14 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2245 if (!cgroup_lock_live_group(cgrp)) 2146 if (!cgroup_lock_live_group(cgrp))
2246 return -ENODEV; 2147 return -ENODEV;
2247 2148
2149retry_find_task:
2150 rcu_read_lock();
2248 if (pid) { 2151 if (pid) {
2249 rcu_read_lock();
2250 tsk = find_task_by_vpid(pid); 2152 tsk = find_task_by_vpid(pid);
2251 if (!tsk) { 2153 if (!tsk) {
2252 rcu_read_unlock(); 2154 rcu_read_unlock();
2253 cgroup_unlock(); 2155 ret= -ESRCH;
2254 return -ESRCH; 2156 goto out_unlock_cgroup;
2255 }
2256 if (threadgroup) {
2257 /*
2258 * RCU protects this access, since tsk was found in the
2259 * tid map. a race with de_thread may cause group_leader
2260 * to stop being the leader, but cgroup_attach_proc will
2261 * detect it later.
2262 */
2263 tsk = tsk->group_leader;
2264 } 2157 }
2265 /* 2158 /*
2266 * even if we're attaching all tasks in the thread group, we 2159 * even if we're attaching all tasks in the thread group, we
@@ -2271,29 +2164,38 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
2271 cred->euid != tcred->uid && 2164 cred->euid != tcred->uid &&
2272 cred->euid != tcred->suid) { 2165 cred->euid != tcred->suid) {
2273 rcu_read_unlock(); 2166 rcu_read_unlock();
2274 cgroup_unlock(); 2167 ret = -EACCES;
2275 return -EACCES; 2168 goto out_unlock_cgroup;
2276 } 2169 }
2277 get_task_struct(tsk); 2170 } else
2278 rcu_read_unlock(); 2171 tsk = current;
2279 } else {
2280 if (threadgroup)
2281 tsk = current->group_leader;
2282 else
2283 tsk = current;
2284 get_task_struct(tsk);
2285 }
2286
2287 threadgroup_lock(tsk);
2288 2172
2289 if (threadgroup) 2173 if (threadgroup)
2174 tsk = tsk->group_leader;
2175 get_task_struct(tsk);
2176 rcu_read_unlock();
2177
2178 threadgroup_lock(tsk);
2179 if (threadgroup) {
2180 if (!thread_group_leader(tsk)) {
2181 /*
2182 * a race with de_thread from another thread's exec()
2183 * may strip us of our leadership, if this happens,
2184 * there is no choice but to throw this task away and
2185 * try again; this is
2186 * "double-double-toil-and-trouble-check locking".
2187 */
2188 threadgroup_unlock(tsk);
2189 put_task_struct(tsk);
2190 goto retry_find_task;
2191 }
2290 ret = cgroup_attach_proc(cgrp, tsk); 2192 ret = cgroup_attach_proc(cgrp, tsk);
2291 else 2193 } else
2292 ret = cgroup_attach_task(cgrp, tsk); 2194 ret = cgroup_attach_task(cgrp, tsk);
2293
2294 threadgroup_unlock(tsk); 2195 threadgroup_unlock(tsk);
2295 2196
2296 put_task_struct(tsk); 2197 put_task_struct(tsk);
2198out_unlock_cgroup:
2297 cgroup_unlock(); 2199 cgroup_unlock();
2298 return ret; 2200 return ret;
2299} 2201}
@@ -2305,16 +2207,7 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
2305 2207
2306static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) 2208static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
2307{ 2209{
2308 int ret; 2210 return attach_task_by_pid(cgrp, tgid, true);
2309 do {
2310 /*
2311 * attach_proc fails with -EAGAIN if threadgroup leadership
2312 * changes in the middle of the operation, in which case we need
2313 * to find the task_struct for the new leader and start over.
2314 */
2315 ret = attach_task_by_pid(cgrp, tgid, true);
2316 } while (ret == -EAGAIN);
2317 return ret;
2318} 2211}
2319 2212
2320/** 2213/**
@@ -2804,15 +2697,20 @@ static void cgroup_advance_iter(struct cgroup *cgrp,
2804 * using their cgroups capability, we don't maintain the lists running 2697 * using their cgroups capability, we don't maintain the lists running
2805 * through each css_set to its tasks until we see the list actually 2698 * through each css_set to its tasks until we see the list actually
2806 * used - in other words after the first call to cgroup_iter_start(). 2699 * used - in other words after the first call to cgroup_iter_start().
2807 *
2808 * The tasklist_lock is not held here, as do_each_thread() and
2809 * while_each_thread() are protected by RCU.
2810 */ 2700 */
2811static void cgroup_enable_task_cg_lists(void) 2701static void cgroup_enable_task_cg_lists(void)
2812{ 2702{
2813 struct task_struct *p, *g; 2703 struct task_struct *p, *g;
2814 write_lock(&css_set_lock); 2704 write_lock(&css_set_lock);
2815 use_task_css_set_links = 1; 2705 use_task_css_set_links = 1;
2706 /*
2707 * We need tasklist_lock because RCU is not safe against
2708 * while_each_thread(). Besides, a forking task that has passed
2709 * cgroup_post_fork() without seeing use_task_css_set_links = 1
2710 * is not guaranteed to have its child immediately visible in the
2711 * tasklist if we walk through it with RCU.
2712 */
2713 read_lock(&tasklist_lock);
2816 do_each_thread(g, p) { 2714 do_each_thread(g, p) {
2817 task_lock(p); 2715 task_lock(p);
2818 /* 2716 /*
@@ -2824,6 +2722,7 @@ static void cgroup_enable_task_cg_lists(void)
2824 list_add(&p->cg_list, &p->cgroups->tasks); 2722 list_add(&p->cg_list, &p->cgroups->tasks);
2825 task_unlock(p); 2723 task_unlock(p);
2826 } while_each_thread(g, p); 2724 } while_each_thread(g, p);
2725 read_unlock(&tasklist_lock);
2827 write_unlock(&css_set_lock); 2726 write_unlock(&css_set_lock);
2828} 2727}
2829 2728
@@ -3043,6 +2942,38 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
3043 * 2942 *
3044 */ 2943 */
3045 2944
2945/* which pidlist file are we talking about? */
2946enum cgroup_filetype {
2947 CGROUP_FILE_PROCS,
2948 CGROUP_FILE_TASKS,
2949};
2950
2951/*
2952 * A pidlist is a list of pids that virtually represents the contents of one
2953 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
2954 * a pair (one each for procs, tasks) for each pid namespace that's relevant
2955 * to the cgroup.
2956 */
2957struct cgroup_pidlist {
2958 /*
2959 * used to find which pidlist is wanted. doesn't change as long as
2960 * this particular list stays in the list.
2961 */
2962 struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
2963 /* array of xids */
2964 pid_t *list;
2965 /* how many elements the above list has */
2966 int length;
2967 /* how many files are using the current array */
2968 int use_count;
2969 /* each of these stored in a list by its cgroup */
2970 struct list_head links;
2971 /* pointer to the cgroup we belong to, for list removal purposes */
2972 struct cgroup *owner;
2973 /* protects the other fields */
2974 struct rw_semaphore mutex;
2975};
2976
3046/* 2977/*
3047 * The following two functions "fix" the issue where there are more pids 2978 * The following two functions "fix" the issue where there are more pids
3048 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. 2979 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
@@ -3827,7 +3758,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3827 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); 3758 set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
3828 3759
3829 for_each_subsys(root, ss) { 3760 for_each_subsys(root, ss) {
3830 struct cgroup_subsys_state *css = ss->create(ss, cgrp); 3761 struct cgroup_subsys_state *css = ss->create(cgrp);
3831 3762
3832 if (IS_ERR(css)) { 3763 if (IS_ERR(css)) {
3833 err = PTR_ERR(css); 3764 err = PTR_ERR(css);
@@ -3841,7 +3772,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3841 } 3772 }
3842 /* At error, ->destroy() callback has to free assigned ID. */ 3773 /* At error, ->destroy() callback has to free assigned ID. */
3843 if (clone_children(parent) && ss->post_clone) 3774 if (clone_children(parent) && ss->post_clone)
3844 ss->post_clone(ss, cgrp); 3775 ss->post_clone(cgrp);
3845 } 3776 }
3846 3777
3847 cgroup_lock_hierarchy(root); 3778 cgroup_lock_hierarchy(root);
@@ -3875,7 +3806,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
3875 3806
3876 for_each_subsys(root, ss) { 3807 for_each_subsys(root, ss) {
3877 if (cgrp->subsys[ss->subsys_id]) 3808 if (cgrp->subsys[ss->subsys_id])
3878 ss->destroy(ss, cgrp); 3809 ss->destroy(cgrp);
3879 } 3810 }
3880 3811
3881 mutex_unlock(&cgroup_mutex); 3812 mutex_unlock(&cgroup_mutex);
@@ -4099,7 +4030,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
4099 /* Create the top cgroup state for this subsystem */ 4030 /* Create the top cgroup state for this subsystem */
4100 list_add(&ss->sibling, &rootnode.subsys_list); 4031 list_add(&ss->sibling, &rootnode.subsys_list);
4101 ss->root = &rootnode; 4032 ss->root = &rootnode;
4102 css = ss->create(ss, dummytop); 4033 css = ss->create(dummytop);
4103 /* We don't handle early failures gracefully */ 4034 /* We don't handle early failures gracefully */
4104 BUG_ON(IS_ERR(css)); 4035 BUG_ON(IS_ERR(css));
4105 init_cgroup_css(css, ss, dummytop); 4036 init_cgroup_css(css, ss, dummytop);
@@ -4188,7 +4119,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4188 * no ss->create seems to need anything important in the ss struct, so 4119 * no ss->create seems to need anything important in the ss struct, so
4189 * this can happen first (i.e. before the rootnode attachment). 4120 * this can happen first (i.e. before the rootnode attachment).
4190 */ 4121 */
4191 css = ss->create(ss, dummytop); 4122 css = ss->create(dummytop);
4192 if (IS_ERR(css)) { 4123 if (IS_ERR(css)) {
4193 /* failure case - need to deassign the subsys[] slot. */ 4124 /* failure case - need to deassign the subsys[] slot. */
4194 subsys[i] = NULL; 4125 subsys[i] = NULL;
@@ -4206,7 +4137,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
4206 int ret = cgroup_init_idr(ss, css); 4137 int ret = cgroup_init_idr(ss, css);
4207 if (ret) { 4138 if (ret) {
4208 dummytop->subsys[ss->subsys_id] = NULL; 4139 dummytop->subsys[ss->subsys_id] = NULL;
4209 ss->destroy(ss, dummytop); 4140 ss->destroy(dummytop);
4210 subsys[i] = NULL; 4141 subsys[i] = NULL;
4211 mutex_unlock(&cgroup_mutex); 4142 mutex_unlock(&cgroup_mutex);
4212 return ret; 4143 return ret;
@@ -4304,7 +4235,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
4304 * pointer to find their state. note that this also takes care of 4235 * pointer to find their state. note that this also takes care of
4305 * freeing the css_id. 4236 * freeing the css_id.
4306 */ 4237 */
4307 ss->destroy(ss, dummytop); 4238 ss->destroy(dummytop);
4308 dummytop->subsys[ss->subsys_id] = NULL; 4239 dummytop->subsys[ss->subsys_id] = NULL;
4309 4240
4310 mutex_unlock(&cgroup_mutex); 4241 mutex_unlock(&cgroup_mutex);
@@ -4580,7 +4511,7 @@ void cgroup_fork_callbacks(struct task_struct *child)
4580 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { 4511 for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
4581 struct cgroup_subsys *ss = subsys[i]; 4512 struct cgroup_subsys *ss = subsys[i];
4582 if (ss->fork) 4513 if (ss->fork)
4583 ss->fork(ss, child); 4514 ss->fork(child);
4584 } 4515 }
4585 } 4516 }
4586} 4517}
@@ -4596,6 +4527,17 @@ void cgroup_fork_callbacks(struct task_struct *child)
4596 */ 4527 */
4597void cgroup_post_fork(struct task_struct *child) 4528void cgroup_post_fork(struct task_struct *child)
4598{ 4529{
4530 /*
4531 * use_task_css_set_links is set to 1 before we walk the tasklist
4532 * under the tasklist_lock and we read it here after we added the child
4533 * to the tasklist under the tasklist_lock as well. If the child wasn't
4534 * yet in the tasklist when we walked through it from
4535 * cgroup_enable_task_cg_lists(), then use_task_css_set_links value
4536 * should be visible now due to the paired locking and barriers implied
4537 * by LOCK/UNLOCK: it is written before the tasklist_lock unlock
4538 * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock
4539 * lock on fork.
4540 */
4599 if (use_task_css_set_links) { 4541 if (use_task_css_set_links) {
4600 write_lock(&css_set_lock); 4542 write_lock(&css_set_lock);
4601 if (list_empty(&child->cg_list)) { 4543 if (list_empty(&child->cg_list)) {
@@ -4682,7 +4624,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
4682 struct cgroup *old_cgrp = 4624 struct cgroup *old_cgrp =
4683 rcu_dereference_raw(cg->subsys[i])->cgroup; 4625 rcu_dereference_raw(cg->subsys[i])->cgroup;
4684 struct cgroup *cgrp = task_cgroup(tsk, i); 4626 struct cgroup *cgrp = task_cgroup(tsk, i);
4685 ss->exit(ss, cgrp, old_cgrp, tsk); 4627 ss->exit(cgrp, old_cgrp, tsk);
4686 } 4628 }
4687 } 4629 }
4688 } 4630 }
@@ -4939,9 +4881,9 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
4939 4881
4940 rcu_assign_pointer(id->css, NULL); 4882 rcu_assign_pointer(id->css, NULL);
4941 rcu_assign_pointer(css->id, NULL); 4883 rcu_assign_pointer(css->id, NULL);
4942 write_lock(&ss->id_lock); 4884 spin_lock(&ss->id_lock);
4943 idr_remove(&ss->idr, id->id); 4885 idr_remove(&ss->idr, id->id);
4944 write_unlock(&ss->id_lock); 4886 spin_unlock(&ss->id_lock);
4945 kfree_rcu(id, rcu_head); 4887 kfree_rcu(id, rcu_head);
4946} 4888}
4947EXPORT_SYMBOL_GPL(free_css_id); 4889EXPORT_SYMBOL_GPL(free_css_id);
@@ -4967,10 +4909,10 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
4967 error = -ENOMEM; 4909 error = -ENOMEM;
4968 goto err_out; 4910 goto err_out;
4969 } 4911 }
4970 write_lock(&ss->id_lock); 4912 spin_lock(&ss->id_lock);
4971 /* Don't use 0. allocates an ID of 1-65535 */ 4913 /* Don't use 0. allocates an ID of 1-65535 */
4972 error = idr_get_new_above(&ss->idr, newid, 1, &myid); 4914 error = idr_get_new_above(&ss->idr, newid, 1, &myid);
4973 write_unlock(&ss->id_lock); 4915 spin_unlock(&ss->id_lock);
4974 4916
4975 /* Returns error when there are no free spaces for new ID.*/ 4917 /* Returns error when there are no free spaces for new ID.*/
4976 if (error) { 4918 if (error) {
@@ -4985,9 +4927,9 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
4985 return newid; 4927 return newid;
4986remove_idr: 4928remove_idr:
4987 error = -ENOSPC; 4929 error = -ENOSPC;
4988 write_lock(&ss->id_lock); 4930 spin_lock(&ss->id_lock);
4989 idr_remove(&ss->idr, myid); 4931 idr_remove(&ss->idr, myid);
4990 write_unlock(&ss->id_lock); 4932 spin_unlock(&ss->id_lock);
4991err_out: 4933err_out:
4992 kfree(newid); 4934 kfree(newid);
4993 return ERR_PTR(error); 4935 return ERR_PTR(error);
@@ -4999,7 +4941,7 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
4999{ 4941{
5000 struct css_id *newid; 4942 struct css_id *newid;
5001 4943
5002 rwlock_init(&ss->id_lock); 4944 spin_lock_init(&ss->id_lock);
5003 idr_init(&ss->idr); 4945 idr_init(&ss->idr);
5004 4946
5005 newid = get_new_cssid(ss, 0); 4947 newid = get_new_cssid(ss, 0);
@@ -5087,6 +5029,8 @@ css_get_next(struct cgroup_subsys *ss, int id,
5087 return NULL; 5029 return NULL;
5088 5030
5089 BUG_ON(!ss->use_id); 5031 BUG_ON(!ss->use_id);
5032 WARN_ON_ONCE(!rcu_read_lock_held());
5033
5090 /* fill start point for scan */ 5034 /* fill start point for scan */
5091 tmpid = id; 5035 tmpid = id;
5092 while (1) { 5036 while (1) {
@@ -5094,10 +5038,7 @@ css_get_next(struct cgroup_subsys *ss, int id,
5094 * scan next entry from bitmap(tree), tmpid is updated after 5038 * scan next entry from bitmap(tree), tmpid is updated after
5095 * idr_get_next(). 5039 * idr_get_next().
5096 */ 5040 */
5097 read_lock(&ss->id_lock);
5098 tmp = idr_get_next(&ss->idr, &tmpid); 5041 tmp = idr_get_next(&ss->idr, &tmpid);
5099 read_unlock(&ss->id_lock);
5100
5101 if (!tmp) 5042 if (!tmp)
5102 break; 5043 break;
5103 if (tmp->depth >= depth && tmp->stack[depth] == rootid) { 5044 if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
@@ -5137,8 +5078,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
5137} 5078}
5138 5079
5139#ifdef CONFIG_CGROUP_DEBUG 5080#ifdef CONFIG_CGROUP_DEBUG
5140static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, 5081static struct cgroup_subsys_state *debug_create(struct cgroup *cont)
5141 struct cgroup *cont)
5142{ 5082{
5143 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 5083 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
5144 5084
@@ -5148,7 +5088,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
5148 return css; 5088 return css;
5149} 5089}
5150 5090
5151static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) 5091static void debug_destroy(struct cgroup *cont)
5152{ 5092{
5153 kfree(cont->subsys[debug_subsys_id]); 5093 kfree(cont->subsys[debug_subsys_id]);
5154} 5094}
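
The caller-side edits above (ss->create(cgrp), ss->attach(cgrp, &tset), and so on) reflect one API change repeated throughout this series: cgroup subsystem callbacks no longer receive the struct cgroup_subsys pointer, as the freezer and cpuset hunks below show on the callee side. A minimal sketch of a subsystem written against the new signatures; the demo_* names and demo_subsys_id are hypothetical and only illustrate the shape:

#include <linux/cgroup.h>
#include <linux/err.h>
#include <linux/slab.h>

/* new-style callbacks: the subsystem is implicit, only the cgroup is passed */
static struct cgroup_subsys_state *demo_create(struct cgroup *cgrp)
{
	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);

	return css ? css : ERR_PTR(-ENOMEM);
}

static void demo_destroy(struct cgroup *cgrp)
{
	kfree(cgrp->subsys[demo_subsys_id]);
}

struct cgroup_subsys demo_subsys = {
	.name		= "demo",
	.subsys_id	= demo_subsys_id,
	.create		= demo_create,
	.destroy	= demo_destroy,
};
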
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index fc0646b78a64..f86e93920b62 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -128,8 +128,7 @@ struct cgroup_subsys freezer_subsys;
128 * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator()) 128 * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator())
129 * sighand->siglock 129 * sighand->siglock
130 */ 130 */
131static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, 131static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup)
132 struct cgroup *cgroup)
133{ 132{
134 struct freezer *freezer; 133 struct freezer *freezer;
135 134
@@ -142,8 +141,7 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
142 return &freezer->css; 141 return &freezer->css;
143} 142}
144 143
145static void freezer_destroy(struct cgroup_subsys *ss, 144static void freezer_destroy(struct cgroup *cgroup)
146 struct cgroup *cgroup)
147{ 145{
148 struct freezer *freezer = cgroup_freezer(cgroup); 146 struct freezer *freezer = cgroup_freezer(cgroup);
149 147
@@ -164,8 +162,7 @@ static bool is_task_frozen_enough(struct task_struct *task)
164 * a write to that file racing against an attach, and hence the 162 * a write to that file racing against an attach, and hence the
165 * can_attach() result will remain valid until the attach completes. 163 * can_attach() result will remain valid until the attach completes.
166 */ 164 */
167static int freezer_can_attach(struct cgroup_subsys *ss, 165static int freezer_can_attach(struct cgroup *new_cgroup,
168 struct cgroup *new_cgroup,
169 struct cgroup_taskset *tset) 166 struct cgroup_taskset *tset)
170{ 167{
171 struct freezer *freezer; 168 struct freezer *freezer;
@@ -185,7 +182,7 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
185 return 0; 182 return 0;
186} 183}
187 184
188static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) 185static void freezer_fork(struct task_struct *task)
189{ 186{
190 struct freezer *freezer; 187 struct freezer *freezer;
191 188
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index a09ac2b9a661..b96ad75b7e64 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -964,7 +964,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
964{ 964{
965 bool need_loop; 965 bool need_loop;
966 966
967repeat:
968 /* 967 /*
969 * Allow tasks that have access to memory reserves because they have 968 * Allow tasks that have access to memory reserves because they have
970 * been OOM killed to get memory anywhere. 969 * been OOM killed to get memory anywhere.
@@ -983,45 +982,19 @@ repeat:
983 */ 982 */
984 need_loop = task_has_mempolicy(tsk) || 983 need_loop = task_has_mempolicy(tsk) ||
985 !nodes_intersects(*newmems, tsk->mems_allowed); 984 !nodes_intersects(*newmems, tsk->mems_allowed);
986 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
987 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
988
989 /*
990 * ensure checking ->mems_allowed_change_disable after setting all new
991 * allowed nodes.
992 *
993 * the read-side task can see an nodemask with new allowed nodes and
994 * old allowed nodes. and if it allocates page when cpuset clears newly
995 * disallowed ones continuous, it can see the new allowed bits.
996 *
997 * And if setting all new allowed nodes is after the checking, setting
998 * all new allowed nodes and clearing newly disallowed ones will be done
999 * continuous, and the read-side task may find no node to alloc page.
1000 */
1001 smp_mb();
1002 985
1003 /* 986 if (need_loop)
1004 * Allocation of memory is very fast, we needn't sleep when waiting 987 write_seqcount_begin(&tsk->mems_allowed_seq);
1005 * for the read-side.
1006 */
1007 while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
1008 task_unlock(tsk);
1009 if (!task_curr(tsk))
1010 yield();
1011 goto repeat;
1012 }
1013 988
1014 /* 989 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
1015 * ensure checking ->mems_allowed_change_disable before clearing all new 990 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
1016 * disallowed nodes.
1017 *
1018 * if clearing newly disallowed bits before the checking, the read-side
1019 * task may find no node to alloc page.
1020 */
1021 smp_mb();
1022 991
1023 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); 992 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
1024 tsk->mems_allowed = *newmems; 993 tsk->mems_allowed = *newmems;
994
995 if (need_loop)
996 write_seqcount_end(&tsk->mems_allowed_seq);
997
1025 task_unlock(tsk); 998 task_unlock(tsk);
1026} 999}
1027 1000
@@ -1399,8 +1372,7 @@ static nodemask_t cpuset_attach_nodemask_from;
1399static nodemask_t cpuset_attach_nodemask_to; 1372static nodemask_t cpuset_attach_nodemask_to;
1400 1373
1401/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ 1374/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
1402static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 1375static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1403 struct cgroup_taskset *tset)
1404{ 1376{
1405 struct cpuset *cs = cgroup_cs(cgrp); 1377 struct cpuset *cs = cgroup_cs(cgrp);
1406 struct task_struct *task; 1378 struct task_struct *task;
@@ -1436,8 +1408,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
1436 return 0; 1408 return 0;
1437} 1409}
1438 1410
1439static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 1411static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
1440 struct cgroup_taskset *tset)
1441{ 1412{
1442 struct mm_struct *mm; 1413 struct mm_struct *mm;
1443 struct task_struct *task; 1414 struct task_struct *task;
@@ -1833,8 +1804,7 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
1833 * (and likewise for mems) to the new cgroup. Called with cgroup_mutex 1804 * (and likewise for mems) to the new cgroup. Called with cgroup_mutex
1834 * held. 1805 * held.
1835 */ 1806 */
1836static void cpuset_post_clone(struct cgroup_subsys *ss, 1807static void cpuset_post_clone(struct cgroup *cgroup)
1837 struct cgroup *cgroup)
1838{ 1808{
1839 struct cgroup *parent, *child; 1809 struct cgroup *parent, *child;
1840 struct cpuset *cs, *parent_cs; 1810 struct cpuset *cs, *parent_cs;
@@ -1857,13 +1827,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
1857 1827
1858/* 1828/*
1859 * cpuset_create - create a cpuset 1829 * cpuset_create - create a cpuset
1860 * ss: cpuset cgroup subsystem
1861 * cont: control group that the new cpuset will be part of 1830 * cont: control group that the new cpuset will be part of
1862 */ 1831 */
1863 1832
1864static struct cgroup_subsys_state *cpuset_create( 1833static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont)
1865 struct cgroup_subsys *ss,
1866 struct cgroup *cont)
1867{ 1834{
1868 struct cpuset *cs; 1835 struct cpuset *cs;
1869 struct cpuset *parent; 1836 struct cpuset *parent;
@@ -1902,7 +1869,7 @@ static struct cgroup_subsys_state *cpuset_create(
1902 * will call async_rebuild_sched_domains(). 1869 * will call async_rebuild_sched_domains().
1903 */ 1870 */
1904 1871
1905static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) 1872static void cpuset_destroy(struct cgroup *cont)
1906{ 1873{
1907 struct cpuset *cs = cgroup_cs(cont); 1874 struct cpuset *cs = cgroup_cs(cont);
1908 1875
@@ -2195,10 +2162,9 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2195 mutex_unlock(&callback_mutex); 2162 mutex_unlock(&callback_mutex);
2196} 2163}
2197 2164
2198int cpuset_cpus_allowed_fallback(struct task_struct *tsk) 2165void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2199{ 2166{
2200 const struct cpuset *cs; 2167 const struct cpuset *cs;
2201 int cpu;
2202 2168
2203 rcu_read_lock(); 2169 rcu_read_lock();
2204 cs = task_cs(tsk); 2170 cs = task_cs(tsk);
@@ -2219,22 +2185,10 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2219 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary 2185 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
2220 * set any mask even if it is not right from task_cs() pov, 2186 * set any mask even if it is not right from task_cs() pov,
2221 * the pending set_cpus_allowed_ptr() will fix things. 2187 * the pending set_cpus_allowed_ptr() will fix things.
2188 *
2189 * select_fallback_rq() will fix things ups and set cpu_possible_mask
2190 * if required.
2222 */ 2191 */
2223
2224 cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
2225 if (cpu >= nr_cpu_ids) {
2226 /*
2227 * Either tsk->cpus_allowed is wrong (see above) or it
2228 * is actually empty. The latter case is only possible
2229 * if we are racing with remove_tasks_in_empty_cpuset().
2230 * Like above we can temporary set any mask and rely on
2231 * set_cpus_allowed_ptr() as synchronization point.
2232 */
2233 do_set_cpus_allowed(tsk, cpu_possible_mask);
2234 cpu = cpumask_any(cpu_active_mask);
2235 }
2236
2237 return cpu;
2238} 2192}
2239 2193
2240void cpuset_init_current_mems_allowed(void) 2194void cpuset_init_current_mems_allowed(void)
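
The cpuset hunk above replaces the mems_allowed_change_disable retry/yield dance with a seqcount: the writer now brackets the nodemask update with write_seqcount_begin()/write_seqcount_end() on tsk->mems_allowed_seq. The matching read side lives outside kernel/ and is not part of this diff; the sketch below only illustrates the usual seqcount retry loop a reader would use under that assumption:

#include <linux/sched.h>
#include <linux/seqlock.h>

/* illustration of the reader that pairs with the writer above; not a
 * quote of the real helpers (those sit in include/linux/cpuset.h) */
static nodemask_t demo_read_mems_allowed(struct task_struct *tsk)
{
	nodemask_t mask;
	unsigned int seq;

	do {
		seq = read_seqcount_begin(&tsk->mems_allowed_seq);
		mask = tsk->mems_allowed;
	} while (read_seqcount_retry(&tsk->mems_allowed_seq, seq));

	return mask;
}
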
diff --git a/kernel/cred.c b/kernel/cred.c
index 5791612a4045..97b36eeca4c9 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -16,6 +16,7 @@
16#include <linux/keyctl.h> 16#include <linux/keyctl.h>
17#include <linux/init_task.h> 17#include <linux/init_task.h>
18#include <linux/security.h> 18#include <linux/security.h>
19#include <linux/binfmts.h>
19#include <linux/cn_proc.h> 20#include <linux/cn_proc.h>
20 21
21#if 0 22#if 0
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0d7c08784efb..1dc53bae56e1 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -41,6 +41,7 @@
41#include <linux/delay.h> 41#include <linux/delay.h>
42#include <linux/sched.h> 42#include <linux/sched.h>
43#include <linux/sysrq.h> 43#include <linux/sysrq.h>
44#include <linux/reboot.h>
44#include <linux/init.h> 45#include <linux/init.h>
45#include <linux/kgdb.h> 46#include <linux/kgdb.h>
46#include <linux/kdb.h> 47#include <linux/kdb.h>
@@ -52,7 +53,6 @@
52#include <asm/cacheflush.h> 53#include <asm/cacheflush.h>
53#include <asm/byteorder.h> 54#include <asm/byteorder.h>
54#include <linux/atomic.h> 55#include <linux/atomic.h>
55#include <asm/system.h>
56 56
57#include "debug_core.h" 57#include "debug_core.h"
58 58
@@ -75,6 +75,8 @@ static int exception_level;
75struct kgdb_io *dbg_io_ops; 75struct kgdb_io *dbg_io_ops;
76static DEFINE_SPINLOCK(kgdb_registration_lock); 76static DEFINE_SPINLOCK(kgdb_registration_lock);
77 77
78/* Action for the reboot notifiter, a global allow kdb to change it */
79static int kgdbreboot;
78/* kgdb console driver is loaded */ 80/* kgdb console driver is loaded */
79static int kgdb_con_registered; 81static int kgdb_con_registered;
80/* determine if kgdb console output should be used */ 82/* determine if kgdb console output should be used */
@@ -96,6 +98,7 @@ static int __init opt_kgdb_con(char *str)
96early_param("kgdbcon", opt_kgdb_con); 98early_param("kgdbcon", opt_kgdb_con);
97 99
98module_param(kgdb_use_con, int, 0644); 100module_param(kgdb_use_con, int, 0644);
101module_param(kgdbreboot, int, 0644);
99 102
100/* 103/*
101 * Holds information about breakpoints in a kernel. These breakpoints are 104 * Holds information about breakpoints in a kernel. These breakpoints are
@@ -784,6 +787,33 @@ void __init dbg_late_init(void)
784 kdb_init(KDB_INIT_FULL); 787 kdb_init(KDB_INIT_FULL);
785} 788}
786 789
790static int
791dbg_notify_reboot(struct notifier_block *this, unsigned long code, void *x)
792{
793 /*
794 * Take the following action on reboot notify depending on value:
795 * 1 == Enter debugger
796 * 0 == [the default] detatch debug client
797 * -1 == Do nothing... and use this until the board resets
798 */
799 switch (kgdbreboot) {
800 case 1:
801 kgdb_breakpoint();
802 case -1:
803 goto done;
804 }
805 if (!dbg_kdb_mode)
806 gdbstub_exit(code);
807done:
808 return NOTIFY_DONE;
809}
810
811static struct notifier_block dbg_reboot_notifier = {
812 .notifier_call = dbg_notify_reboot,
813 .next = NULL,
814 .priority = INT_MAX,
815};
816
787static void kgdb_register_callbacks(void) 817static void kgdb_register_callbacks(void)
788{ 818{
789 if (!kgdb_io_module_registered) { 819 if (!kgdb_io_module_registered) {
@@ -791,6 +821,7 @@ static void kgdb_register_callbacks(void)
791 kgdb_arch_init(); 821 kgdb_arch_init();
792 if (!dbg_is_early) 822 if (!dbg_is_early)
793 kgdb_arch_late(); 823 kgdb_arch_late();
824 register_reboot_notifier(&dbg_reboot_notifier);
794 atomic_notifier_chain_register(&panic_notifier_list, 825 atomic_notifier_chain_register(&panic_notifier_list,
795 &kgdb_panic_event_nb); 826 &kgdb_panic_event_nb);
796#ifdef CONFIG_MAGIC_SYSRQ 827#ifdef CONFIG_MAGIC_SYSRQ
@@ -812,6 +843,7 @@ static void kgdb_unregister_callbacks(void)
812 */ 843 */
813 if (kgdb_io_module_registered) { 844 if (kgdb_io_module_registered) {
814 kgdb_io_module_registered = 0; 845 kgdb_io_module_registered = 0;
846 unregister_reboot_notifier(&dbg_reboot_notifier);
815 atomic_notifier_chain_unregister(&panic_notifier_list, 847 atomic_notifier_chain_unregister(&panic_notifier_list,
816 &kgdb_panic_event_nb); 848 &kgdb_panic_event_nb);
817 kgdb_arch_exit(); 849 kgdb_arch_exit();
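
debug_core.c now hooks the reboot path with a notifier registered at INT_MAX priority, so the debugger can act first (enter, detach, or do nothing, per the kgdbreboot module parameter) before lower-priority reboot handlers run. A bare-bones sketch of that registration pattern, with hypothetical demo_* names:

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/notifier.h>
#include <linux/reboot.h>

static int demo_reboot_notify(struct notifier_block *nb,
			      unsigned long code, void *unused)
{
	/* code is SYS_RESTART, SYS_HALT or SYS_POWER_OFF */
	return NOTIFY_DONE;
}

static struct notifier_block demo_reboot_nb = {
	.notifier_call	= demo_reboot_notify,
	.priority	= INT_MAX,	/* run ahead of lower-priority notifiers */
};

static int __init demo_reboot_init(void)
{
	return register_reboot_notifier(&demo_reboot_nb);
}
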
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index c22d8c28ad84..ce615e064482 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -1111,6 +1111,13 @@ void gdbstub_exit(int status)
1111 unsigned char checksum, ch, buffer[3]; 1111 unsigned char checksum, ch, buffer[3];
1112 int loop; 1112 int loop;
1113 1113
1114 if (!kgdb_connected)
1115 return;
1116 kgdb_connected = 0;
1117
1118 if (!dbg_io_ops || dbg_kdb_mode)
1119 return;
1120
1114 buffer[0] = 'W'; 1121 buffer[0] = 'W';
1115 buffer[1] = hex_asc_hi(status); 1122 buffer[1] = hex_asc_hi(status);
1116 buffer[2] = hex_asc_lo(status); 1123 buffer[2] = hex_asc_lo(status);
@@ -1129,5 +1136,6 @@ void gdbstub_exit(int status)
1129 dbg_io_ops->write_char(hex_asc_lo(checksum)); 1136 dbg_io_ops->write_char(hex_asc_lo(checksum));
1130 1137
1131 /* make sure the output is flushed, lest the bootloader clobber it */ 1138 /* make sure the output is flushed, lest the bootloader clobber it */
1132 dbg_io_ops->flush(); 1139 if (dbg_io_ops->flush)
1140 dbg_io_ops->flush();
1133} 1141}
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 20059ef4459a..8418c2f8ec5d 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -153,6 +153,13 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)
153 } else { 153 } else {
154 kdb_printf("%s: failed to set breakpoint at 0x%lx\n", 154 kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
155 __func__, bp->bp_addr); 155 __func__, bp->bp_addr);
156#ifdef CONFIG_DEBUG_RODATA
157 if (!bp->bp_type) {
158 kdb_printf("Software breakpoints are unavailable.\n"
159 " Change the kernel CONFIG_DEBUG_RODATA=n\n"
160 " OR use hw breaks: help bph\n");
161 }
162#endif
156 return 1; 163 return 1;
157 } 164 }
158 return 0; 165 return 0;
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 7179eac7b41c..07c9bbb94a0b 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -15,7 +15,6 @@
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/kdb.h> 16#include <linux/kdb.h>
17#include <linux/nmi.h> 17#include <linux/nmi.h>
18#include <asm/system.h>
19#include "kdb_private.h" 18#include "kdb_private.h"
20 19
21 20
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 4802eb5840e1..9b5f17da1c56 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -689,7 +689,7 @@ kdb_printit:
689 if (!dbg_kdb_mode && kgdb_connected) { 689 if (!dbg_kdb_mode && kgdb_connected) {
690 gdbstub_msg_write(kdb_buffer, retlen); 690 gdbstub_msg_write(kdb_buffer, retlen);
691 } else { 691 } else {
692 if (!dbg_io_ops->is_console) { 692 if (dbg_io_ops && !dbg_io_ops->is_console) {
693 len = strlen(kdb_buffer); 693 len = strlen(kdb_buffer);
694 cp = kdb_buffer; 694 cp = kdb_buffer;
695 while (len--) { 695 while (len--) {
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
index 4bca634975c0..118527aa60ea 100644
--- a/kernel/debug/kdb/kdb_keyboard.c
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -25,6 +25,7 @@
25#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */ 25#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */
26 26
27static int kbd_exists; 27static int kbd_exists;
28static int kbd_last_ret;
28 29
29/* 30/*
30 * Check if the keyboard controller has a keypress for us. 31 * Check if the keyboard controller has a keypress for us.
@@ -90,8 +91,11 @@ int kdb_get_kbd_char(void)
90 return -1; 91 return -1;
91 } 92 }
92 93
93 if ((scancode & 0x80) != 0) 94 if ((scancode & 0x80) != 0) {
95 if (scancode == 0x9c)
96 kbd_last_ret = 0;
94 return -1; 97 return -1;
98 }
95 99
96 scancode &= 0x7f; 100 scancode &= 0x7f;
97 101
@@ -178,35 +182,82 @@ int kdb_get_kbd_char(void)
178 return -1; /* ignore unprintables */ 182 return -1; /* ignore unprintables */
179 } 183 }
180 184
181 if ((scancode & 0x7f) == 0x1c) { 185 if (scancode == 0x1c) {
182 /* 186 kbd_last_ret = 1;
183 * enter key. All done. Absorb the release scancode. 187 return 13;
184 */ 188 }
189
190 return keychar & 0xff;
191}
192EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
193
194/*
195 * Best effort cleanup of ENTER break codes on leaving KDB. Called on
196 * exiting KDB, when we know we processed an ENTER or KP ENTER scan
197 * code.
198 */
199void kdb_kbd_cleanup_state(void)
200{
201 int scancode, scanstatus;
202
203 /*
204 * Nothing to clean up, since either
205 * ENTER was never pressed, or has already
206 * gotten cleaned up.
207 */
208 if (!kbd_last_ret)
209 return;
210
211 kbd_last_ret = 0;
212 /*
213 * Enter key. Need to absorb the break code here, lest it gets
214 * leaked out if we exit KDB as the result of processing 'g'.
215 *
216 * This has several interesting implications:
217 * + Need to handle KP ENTER, which has break code 0xe0 0x9c.
218 * + Need to handle repeat ENTER and repeat KP ENTER. Repeats
219 * only get a break code at the end of the repeated
220 * sequence. This means we can't propagate the repeated key
221 * press, and must swallow it away.
222 * + Need to handle possible PS/2 mouse input.
223 * + Need to handle mashed keys.
224 */
225
226 while (1) {
185 while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0) 227 while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
186 ; 228 cpu_relax();
187 229
188 /* 230 /*
189 * Fetch the scancode 231 * Fetch the scancode.
190 */ 232 */
191 scancode = inb(KBD_DATA_REG); 233 scancode = inb(KBD_DATA_REG);
192 scanstatus = inb(KBD_STATUS_REG); 234 scanstatus = inb(KBD_STATUS_REG);
193 235
194 while (scanstatus & KBD_STAT_MOUSE_OBF) { 236 /*
195 scancode = inb(KBD_DATA_REG); 237 * Skip mouse input.
196 scanstatus = inb(KBD_STATUS_REG); 238 */
197 } 239 if (scanstatus & KBD_STAT_MOUSE_OBF)
240 continue;
198 241
199 if (scancode != 0x9c) { 242 /*
200 /* 243 * If we see 0xe0, this is either a break code for KP
201 * Wasn't an enter-release, why not? 244 * ENTER, or a repeat make for KP ENTER. Either way,
202 */ 245 * since the second byte is equivalent to an ENTER,
203 kdb_printf("kdb: expected enter got 0x%x status 0x%x\n", 246 * skip the 0xe0 and try again.
204 scancode, scanstatus); 247 *
205 } 248 * If we see 0x1c, this must be a repeat ENTER or KP
249 * ENTER (and we swallowed 0xe0 before). Try again.
250 *
251 * We can also see make and break codes for other keys
252 * mashed before or after pressing ENTER. Thus, if we
253 * see anything other than 0x9c, we have to try again.
254 *
255 * Note, if you held some key as ENTER was depressed,
256 * that break code would get leaked out.
257 */
258 if (scancode != 0x9c)
259 continue;
206 260
207 return 13; 261 return;
208 } 262 }
209
210 return keychar & 0xff;
211} 263}
212EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
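The cleanup loop above relies on the PS/2 scan code set 1 conventions; a short sketch of the assumptions, with values taken from the code above:

#define PS2_BREAK_BIT	0x80	/* break (release) code = make code | 0x80 */
#define PS2_EXT_PREFIX	0xe0	/* prefix byte for extended keys such as KP ENTER */
#define PS2_ENTER_MAKE	0x1c	/* ENTER make code; its break code is 0x9c */

static inline int ps2_is_enter_break(unsigned char code)
{
	return code == (PS2_ENTER_MAKE | PS2_BREAK_BIT);
}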
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index e2ae7349437f..67b847dfa2bb 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1400,6 +1400,9 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
1400 if (KDB_STATE(DOING_SS)) 1400 if (KDB_STATE(DOING_SS))
1401 KDB_STATE_CLEAR(SSBPT); 1401 KDB_STATE_CLEAR(SSBPT);
1402 1402
1403 /* Clean up any keyboard devices before leaving */
1404 kdb_kbd_cleanup_state();
1405
1403 return result; 1406 return result;
1404} 1407}
1405 1408
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index e381d105b40b..47c4e56e513b 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -246,6 +246,13 @@ extern void debug_kusage(void);
246 246
247extern void kdb_set_current_task(struct task_struct *); 247extern void kdb_set_current_task(struct task_struct *);
248extern struct task_struct *kdb_current_task; 248extern struct task_struct *kdb_current_task;
249
250#ifdef CONFIG_KDB_KEYBOARD
251extern void kdb_kbd_cleanup_state(void);
252#else /* ! CONFIG_KDB_KEYBOARD */
253#define kdb_kbd_cleanup_state()
254#endif /* ! CONFIG_KDB_KEYBOARD */
255
249#ifdef CONFIG_MODULES 256#ifdef CONFIG_MODULES
250extern struct list_head *kdb_modules; 257extern struct list_head *kdb_modules;
251#endif /* CONFIG_MODULES */ 258#endif /* CONFIG_MODULES */
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 7d6fb40d2188..d35cc2d3a4cc 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -384,9 +384,9 @@ static int kdb_getphys(void *res, unsigned long addr, size_t size)
384 if (!pfn_valid(pfn)) 384 if (!pfn_valid(pfn))
385 return 1; 385 return 1;
386 page = pfn_to_page(pfn); 386 page = pfn_to_page(pfn);
387 vaddr = kmap_atomic(page, KM_KDB); 387 vaddr = kmap_atomic(page);
388 memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size); 388 memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size);
389 kunmap_atomic(vaddr, KM_KDB); 389 kunmap_atomic(vaddr);
390 390
391 return 0; 391 return 0;
392} 392}
diff --git a/kernel/dma.c b/kernel/dma.c
index 68a2306522c8..6c6262f86c17 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -18,7 +18,6 @@
18#include <linux/proc_fs.h> 18#include <linux/proc_fs.h>
19#include <linux/init.h> 19#include <linux/init.h>
20#include <asm/dma.h> 20#include <asm/dma.h>
21#include <asm/system.h>
22 21
23 22
24 23
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 057e24b665cf..6581a040f399 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -115,8 +115,6 @@ int get_callchain_buffers(void)
115 } 115 }
116 116
117 err = alloc_callchain_buffers(); 117 err = alloc_callchain_buffers();
118 if (err)
119 release_callchain_buffers();
120exit: 118exit:
121 mutex_unlock(&callchain_mutex); 119 mutex_unlock(&callchain_mutex);
122 120
diff --git a/kernel/events/core.c b/kernel/events/core.c
index a8f4ac001a00..4b50357914fb 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -118,6 +118,13 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
118 PERF_FLAG_FD_OUTPUT |\ 118 PERF_FLAG_FD_OUTPUT |\
119 PERF_FLAG_PID_CGROUP) 119 PERF_FLAG_PID_CGROUP)
120 120
121/*
122 * branch priv levels that need permission checks
123 */
124#define PERF_SAMPLE_BRANCH_PERM_PLM \
125 (PERF_SAMPLE_BRANCH_KERNEL |\
126 PERF_SAMPLE_BRANCH_HV)
127
121enum event_type_t { 128enum event_type_t {
122 EVENT_FLEXIBLE = 0x1, 129 EVENT_FLEXIBLE = 0x1,
123 EVENT_PINNED = 0x2, 130 EVENT_PINNED = 0x2,
@@ -128,8 +135,9 @@ enum event_type_t {
128 * perf_sched_events : >0 events exist 135 * perf_sched_events : >0 events exist
129 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu 136 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
130 */ 137 */
131struct jump_label_key_deferred perf_sched_events __read_mostly; 138struct static_key_deferred perf_sched_events __read_mostly;
132static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); 139static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
140static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
133 141
134static atomic_t nr_mmap_events __read_mostly; 142static atomic_t nr_mmap_events __read_mostly;
135static atomic_t nr_comm_events __read_mostly; 143static atomic_t nr_comm_events __read_mostly;
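The jump_label_key/jump_label_inc() names used elsewhere in this file are replaced throughout by the renamed static_key API. A minimal sketch of the new API as it is assumed here (kernel-side, 3.3+ naming; function names other than the static_key calls are illustrative):

#include <linux/jump_label.h>

static struct static_key my_feature = STATIC_KEY_INIT_FALSE;

static void unlikely_slow_path(void)
{
	/* ... rarely enabled work ... */
}

static void hot_path(void)
{
	/* compiles to a patched no-op while the key is false */
	if (static_key_false(&my_feature))
		unlikely_slow_path();
}

static void enable_feature(void)
{
	static_key_slow_inc(&my_feature);	/* may sleep: patches all branch sites */
}

static void disable_feature(void)
{
	static_key_slow_dec(&my_feature);	/* branches flip back when the count hits zero */
}

The _deferred variants used for perf_sched_events add rate limiting to the disable side so rapid enable/disable cycles do not thrash the text patching.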
@@ -815,7 +823,7 @@ static void update_event_times(struct perf_event *event)
815 * here. 823 * here.
816 */ 824 */
817 if (is_cgroup_event(event)) 825 if (is_cgroup_event(event))
818 run_end = perf_event_time(event); 826 run_end = perf_cgroup_event_time(event);
819 else if (ctx->is_active) 827 else if (ctx->is_active)
820 run_end = ctx->time; 828 run_end = ctx->time;
821 else 829 else
@@ -881,6 +889,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
881 if (is_cgroup_event(event)) 889 if (is_cgroup_event(event))
882 ctx->nr_cgroups++; 890 ctx->nr_cgroups++;
883 891
892 if (has_branch_stack(event))
893 ctx->nr_branch_stack++;
894
884 list_add_rcu(&event->event_entry, &ctx->event_list); 895 list_add_rcu(&event->event_entry, &ctx->event_list);
885 if (!ctx->nr_events) 896 if (!ctx->nr_events)
886 perf_pmu_rotate_start(ctx->pmu); 897 perf_pmu_rotate_start(ctx->pmu);
@@ -1020,6 +1031,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1020 cpuctx->cgrp = NULL; 1031 cpuctx->cgrp = NULL;
1021 } 1032 }
1022 1033
1034 if (has_branch_stack(event))
1035 ctx->nr_branch_stack--;
1036
1023 ctx->nr_events--; 1037 ctx->nr_events--;
1024 if (event->attr.inherit_stat) 1038 if (event->attr.inherit_stat)
1025 ctx->nr_stat--; 1039 ctx->nr_stat--;
@@ -2195,6 +2209,66 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2195} 2209}
2196 2210
2197/* 2211/*
2212 * When sampling the branch stack in system-wide mode, it may be necessary
2213 * to flush the stack on context switch. This happens when the branch
2214 * stack does not tag its entries with the pid of the current task.
2215 * Otherwise it becomes impossible to associate a branch entry with a
2216 * task. This ambiguity is more likely to appear when the branch stack
2217 * supports priv level filtering and the user sets it to monitor only
2218 * at the user level (which could be a useful measurement in system-wide
2219 * mode). In that case, the risk is high of having a branch stack with
2220 * branch from multiple tasks. Flushing may mean dropping the existing
2221 * entries or stashing them somewhere in the PMU specific code layer.
2222 *
2223 * This function provides the context switch callback to the lower code
2224 * layer. It is invoked ONLY when there is at least one system-wide context
2225 * with at least one active event using taken branch sampling.
2226 */
2227static void perf_branch_stack_sched_in(struct task_struct *prev,
2228 struct task_struct *task)
2229{
2230 struct perf_cpu_context *cpuctx;
2231 struct pmu *pmu;
2232 unsigned long flags;
2233
2234 /* no need to flush branch stack if not changing task */
2235 if (prev == task)
2236 return;
2237
2238 local_irq_save(flags);
2239
2240 rcu_read_lock();
2241
2242 list_for_each_entry_rcu(pmu, &pmus, entry) {
2243 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2244
2245 /*
2246 * check if the context has at least one
2247 * event using PERF_SAMPLE_BRANCH_STACK
2248 */
2249 if (cpuctx->ctx.nr_branch_stack > 0
2250 && pmu->flush_branch_stack) {
2251
2252 pmu = cpuctx->ctx.pmu;
2253
2254 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2255
2256 perf_pmu_disable(pmu);
2257
2258 pmu->flush_branch_stack();
2259
2260 perf_pmu_enable(pmu);
2261
2262 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2263 }
2264 }
2265
2266 rcu_read_unlock();
2267
2268 local_irq_restore(flags);
2269}
2270
2271/*
2198 * Called from scheduler to add the events of the current task 2272 * Called from scheduler to add the events of the current task
2199 * with interrupts disabled. 2273 * with interrupts disabled.
2200 * 2274 *
@@ -2225,6 +2299,10 @@ void __perf_event_task_sched_in(struct task_struct *prev,
2225 */ 2299 */
2226 if (atomic_read(&__get_cpu_var(perf_cgroup_events))) 2300 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2227 perf_cgroup_sched_in(prev, task); 2301 perf_cgroup_sched_in(prev, task);
2302
2303 /* check for system-wide branch_stack events */
2304 if (atomic_read(&__get_cpu_var(perf_branch_stack_events)))
2305 perf_branch_stack_sched_in(prev, task);
2228} 2306}
2229 2307
2230static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 2308static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -2300,7 +2378,10 @@ do { \
2300 return div64_u64(dividend, divisor); 2378 return div64_u64(dividend, divisor);
2301} 2379}
2302 2380
2303static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) 2381static DEFINE_PER_CPU(int, perf_throttled_count);
2382static DEFINE_PER_CPU(u64, perf_throttled_seq);
2383
2384static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
2304{ 2385{
2305 struct hw_perf_event *hwc = &event->hw; 2386 struct hw_perf_event *hwc = &event->hw;
2306 s64 period, sample_period; 2387 s64 period, sample_period;
@@ -2319,22 +2400,40 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
2319 hwc->sample_period = sample_period; 2400 hwc->sample_period = sample_period;
2320 2401
2321 if (local64_read(&hwc->period_left) > 8*sample_period) { 2402 if (local64_read(&hwc->period_left) > 8*sample_period) {
2322 event->pmu->stop(event, PERF_EF_UPDATE); 2403 if (disable)
2404 event->pmu->stop(event, PERF_EF_UPDATE);
2405
2323 local64_set(&hwc->period_left, 0); 2406 local64_set(&hwc->period_left, 0);
2324 event->pmu->start(event, PERF_EF_RELOAD); 2407
2408 if (disable)
2409 event->pmu->start(event, PERF_EF_RELOAD);
2325 } 2410 }
2326} 2411}
2327 2412
2328static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) 2413/*
2414 * combine freq adjustment with unthrottling to avoid two passes over the
2415 * events. At the same time, make sure, having freq events does not change
2416 * the rate of unthrottling as that would introduce bias.
2417 */
2418static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2419 int needs_unthr)
2329{ 2420{
2330 struct perf_event *event; 2421 struct perf_event *event;
2331 struct hw_perf_event *hwc; 2422 struct hw_perf_event *hwc;
2332 u64 interrupts, now; 2423 u64 now, period = TICK_NSEC;
2333 s64 delta; 2424 s64 delta;
2334 2425
2335 if (!ctx->nr_freq) 2426 /*
2427 * only need to iterate over all events if:
2428 * - the context has events in frequency mode (needs freq adjust)
2429 * - there are events to unthrottle on this cpu
2430 */
2431 if (!(ctx->nr_freq || needs_unthr))
2336 return; 2432 return;
2337 2433
2434 raw_spin_lock(&ctx->lock);
2435 perf_pmu_disable(ctx->pmu);
2436
2338 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 2437 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
2339 if (event->state != PERF_EVENT_STATE_ACTIVE) 2438 if (event->state != PERF_EVENT_STATE_ACTIVE)
2340 continue; 2439 continue;
@@ -2344,13 +2443,8 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2344 2443
2345 hwc = &event->hw; 2444 hwc = &event->hw;
2346 2445
2347 interrupts = hwc->interrupts; 2446 if (needs_unthr && hwc->interrupts == MAX_INTERRUPTS) {
2348 hwc->interrupts = 0; 2447 hwc->interrupts = 0;
2349
2350 /*
2351 * unthrottle events on the tick
2352 */
2353 if (interrupts == MAX_INTERRUPTS) {
2354 perf_log_throttle(event, 1); 2448 perf_log_throttle(event, 1);
2355 event->pmu->start(event, 0); 2449 event->pmu->start(event, 0);
2356 } 2450 }
@@ -2358,14 +2452,30 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2358 if (!event->attr.freq || !event->attr.sample_freq) 2452 if (!event->attr.freq || !event->attr.sample_freq)
2359 continue; 2453 continue;
2360 2454
2361 event->pmu->read(event); 2455 /*
2456 * stop the event and update event->count
2457 */
2458 event->pmu->stop(event, PERF_EF_UPDATE);
2459
2362 now = local64_read(&event->count); 2460 now = local64_read(&event->count);
2363 delta = now - hwc->freq_count_stamp; 2461 delta = now - hwc->freq_count_stamp;
2364 hwc->freq_count_stamp = now; 2462 hwc->freq_count_stamp = now;
2365 2463
2464 /*
2465 * restart the event
2466 * reload only if the value has changed
2467 * we have already stopped the event, so tell
2468 * perf_adjust_period() not to stop it
2469 * a second time.
2470 */
2366 if (delta > 0) 2471 if (delta > 0)
2367 perf_adjust_period(event, period, delta); 2472 perf_adjust_period(event, period, delta, false);
2473
2474 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
2368 } 2475 }
2476
2477 perf_pmu_enable(ctx->pmu);
2478 raw_spin_unlock(&ctx->lock);
2369} 2479}
2370 2480
2371/* 2481/*
@@ -2388,16 +2498,13 @@ static void rotate_ctx(struct perf_event_context *ctx)
2388 */ 2498 */
2389static void perf_rotate_context(struct perf_cpu_context *cpuctx) 2499static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2390{ 2500{
2391 u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
2392 struct perf_event_context *ctx = NULL; 2501 struct perf_event_context *ctx = NULL;
2393 int rotate = 0, remove = 1, freq = 0; 2502 int rotate = 0, remove = 1;
2394 2503
2395 if (cpuctx->ctx.nr_events) { 2504 if (cpuctx->ctx.nr_events) {
2396 remove = 0; 2505 remove = 0;
2397 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) 2506 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
2398 rotate = 1; 2507 rotate = 1;
2399 if (cpuctx->ctx.nr_freq)
2400 freq = 1;
2401 } 2508 }
2402 2509
2403 ctx = cpuctx->task_ctx; 2510 ctx = cpuctx->task_ctx;
@@ -2405,37 +2512,26 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2405 remove = 0; 2512 remove = 0;
2406 if (ctx->nr_events != ctx->nr_active) 2513 if (ctx->nr_events != ctx->nr_active)
2407 rotate = 1; 2514 rotate = 1;
2408 if (ctx->nr_freq)
2409 freq = 1;
2410 } 2515 }
2411 2516
2412 if (!rotate && !freq) 2517 if (!rotate)
2413 goto done; 2518 goto done;
2414 2519
2415 perf_ctx_lock(cpuctx, cpuctx->task_ctx); 2520 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2416 perf_pmu_disable(cpuctx->ctx.pmu); 2521 perf_pmu_disable(cpuctx->ctx.pmu);
2417 2522
2418 if (freq) { 2523 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2419 perf_ctx_adjust_freq(&cpuctx->ctx, interval); 2524 if (ctx)
2420 if (ctx) 2525 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
2421 perf_ctx_adjust_freq(ctx, interval);
2422 }
2423
2424 if (rotate) {
2425 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2426 if (ctx)
2427 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
2428 2526
2429 rotate_ctx(&cpuctx->ctx); 2527 rotate_ctx(&cpuctx->ctx);
2430 if (ctx) 2528 if (ctx)
2431 rotate_ctx(ctx); 2529 rotate_ctx(ctx);
2432 2530
2433 perf_event_sched_in(cpuctx, ctx, current); 2531 perf_event_sched_in(cpuctx, ctx, current);
2434 }
2435 2532
2436 perf_pmu_enable(cpuctx->ctx.pmu); 2533 perf_pmu_enable(cpuctx->ctx.pmu);
2437 perf_ctx_unlock(cpuctx, cpuctx->task_ctx); 2534 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2438
2439done: 2535done:
2440 if (remove) 2536 if (remove)
2441 list_del_init(&cpuctx->rotation_list); 2537 list_del_init(&cpuctx->rotation_list);
@@ -2445,10 +2541,22 @@ void perf_event_task_tick(void)
2445{ 2541{
2446 struct list_head *head = &__get_cpu_var(rotation_list); 2542 struct list_head *head = &__get_cpu_var(rotation_list);
2447 struct perf_cpu_context *cpuctx, *tmp; 2543 struct perf_cpu_context *cpuctx, *tmp;
2544 struct perf_event_context *ctx;
2545 int throttled;
2448 2546
2449 WARN_ON(!irqs_disabled()); 2547 WARN_ON(!irqs_disabled());
2450 2548
2549 __this_cpu_inc(perf_throttled_seq);
2550 throttled = __this_cpu_xchg(perf_throttled_count, 0);
2551
2451 list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { 2552 list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
2553 ctx = &cpuctx->ctx;
2554 perf_adjust_freq_unthr_context(ctx, throttled);
2555
2556 ctx = cpuctx->task_ctx;
2557 if (ctx)
2558 perf_adjust_freq_unthr_context(ctx, throttled);
2559
2452 if (cpuctx->jiffies_interval == 1 || 2560 if (cpuctx->jiffies_interval == 1 ||
2453 !(jiffies % cpuctx->jiffies_interval)) 2561 !(jiffies % cpuctx->jiffies_interval))
2454 perf_rotate_context(cpuctx); 2562 perf_rotate_context(cpuctx);
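The tick now bumps a per-cpu perf_throttled_seq and zero-exchanges perf_throttled_count; the overflow path (in a later hunk) compares a cached copy of that sequence, so per-event interrupt counters are reset lazily, once per tick, instead of the tick having to walk every event. A generic sketch of this lazy-epoch-reset pattern (simplified to a single counter; names are illustrative, not from the kernel):

static unsigned long epoch_seq;		/* bumped once per tick */

struct rate_counter {
	unsigned long seq;		/* epoch in which the counter was last touched */
	unsigned long hits;
};

static void rate_counter_hit(struct rate_counter *c)
{
	unsigned long now = epoch_seq;

	if (c->seq != now) {		/* first hit in this epoch: lazy reset */
		c->seq = now;
		c->hits = 1;
	} else {
		c->hits++;
	}
}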
@@ -2748,7 +2856,7 @@ static void free_event(struct perf_event *event)
2748 2856
2749 if (!event->parent) { 2857 if (!event->parent) {
2750 if (event->attach_state & PERF_ATTACH_TASK) 2858 if (event->attach_state & PERF_ATTACH_TASK)
2751 jump_label_dec_deferred(&perf_sched_events); 2859 static_key_slow_dec_deferred(&perf_sched_events);
2752 if (event->attr.mmap || event->attr.mmap_data) 2860 if (event->attr.mmap || event->attr.mmap_data)
2753 atomic_dec(&nr_mmap_events); 2861 atomic_dec(&nr_mmap_events);
2754 if (event->attr.comm) 2862 if (event->attr.comm)
@@ -2759,7 +2867,15 @@ static void free_event(struct perf_event *event)
2759 put_callchain_buffers(); 2867 put_callchain_buffers();
2760 if (is_cgroup_event(event)) { 2868 if (is_cgroup_event(event)) {
2761 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); 2869 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
2762 jump_label_dec_deferred(&perf_sched_events); 2870 static_key_slow_dec_deferred(&perf_sched_events);
2871 }
2872
2873 if (has_branch_stack(event)) {
2874 static_key_slow_dec_deferred(&perf_sched_events);
2875 /* is system-wide event */
2876 if (!(event->attach_state & PERF_ATTACH_TASK))
2877 atomic_dec(&per_cpu(perf_branch_stack_events,
2878 event->cpu));
2763 } 2879 }
2764 } 2880 }
2765 2881
@@ -3208,10 +3324,6 @@ int perf_event_task_disable(void)
3208 return 0; 3324 return 0;
3209} 3325}
3210 3326
3211#ifndef PERF_EVENT_INDEX_OFFSET
3212# define PERF_EVENT_INDEX_OFFSET 0
3213#endif
3214
3215static int perf_event_index(struct perf_event *event) 3327static int perf_event_index(struct perf_event *event)
3216{ 3328{
3217 if (event->hw.state & PERF_HES_STOPPED) 3329 if (event->hw.state & PERF_HES_STOPPED)
@@ -3220,21 +3332,26 @@ static int perf_event_index(struct perf_event *event)
3220 if (event->state != PERF_EVENT_STATE_ACTIVE) 3332 if (event->state != PERF_EVENT_STATE_ACTIVE)
3221 return 0; 3333 return 0;
3222 3334
3223 return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; 3335 return event->pmu->event_idx(event);
3224} 3336}
3225 3337
3226static void calc_timer_values(struct perf_event *event, 3338static void calc_timer_values(struct perf_event *event,
3339 u64 *now,
3227 u64 *enabled, 3340 u64 *enabled,
3228 u64 *running) 3341 u64 *running)
3229{ 3342{
3230 u64 now, ctx_time; 3343 u64 ctx_time;
3231 3344
3232 now = perf_clock(); 3345 *now = perf_clock();
3233 ctx_time = event->shadow_ctx_time + now; 3346 ctx_time = event->shadow_ctx_time + *now;
3234 *enabled = ctx_time - event->tstamp_enabled; 3347 *enabled = ctx_time - event->tstamp_enabled;
3235 *running = ctx_time - event->tstamp_running; 3348 *running = ctx_time - event->tstamp_running;
3236} 3349}
3237 3350
3351void __weak perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now)
3352{
3353}
3354
3238/* 3355/*
3239 * Callers need to ensure there can be no nesting of this function, otherwise 3356 * Callers need to ensure there can be no nesting of this function, otherwise
3240 * the seqlock logic goes bad. We can not serialize this because the arch 3357 * the seqlock logic goes bad. We can not serialize this because the arch
@@ -3244,7 +3361,7 @@ void perf_event_update_userpage(struct perf_event *event)
3244{ 3361{
3245 struct perf_event_mmap_page *userpg; 3362 struct perf_event_mmap_page *userpg;
3246 struct ring_buffer *rb; 3363 struct ring_buffer *rb;
3247 u64 enabled, running; 3364 u64 enabled, running, now;
3248 3365
3249 rcu_read_lock(); 3366 rcu_read_lock();
3250 /* 3367 /*
@@ -3256,7 +3373,7 @@ void perf_event_update_userpage(struct perf_event *event)
3256 * because of locking issue as we can be called in 3373 * because of locking issue as we can be called in
3257 * NMI context 3374 * NMI context
3258 */ 3375 */
3259 calc_timer_values(event, &enabled, &running); 3376 calc_timer_values(event, &now, &enabled, &running);
3260 rb = rcu_dereference(event->rb); 3377 rb = rcu_dereference(event->rb);
3261 if (!rb) 3378 if (!rb)
3262 goto unlock; 3379 goto unlock;
@@ -3272,7 +3389,7 @@ void perf_event_update_userpage(struct perf_event *event)
3272 barrier(); 3389 barrier();
3273 userpg->index = perf_event_index(event); 3390 userpg->index = perf_event_index(event);
3274 userpg->offset = perf_event_count(event); 3391 userpg->offset = perf_event_count(event);
3275 if (event->state == PERF_EVENT_STATE_ACTIVE) 3392 if (userpg->index)
3276 userpg->offset -= local64_read(&event->hw.prev_count); 3393 userpg->offset -= local64_read(&event->hw.prev_count);
3277 3394
3278 userpg->time_enabled = enabled + 3395 userpg->time_enabled = enabled +
@@ -3281,6 +3398,8 @@ void perf_event_update_userpage(struct perf_event *event)
3281 userpg->time_running = running + 3398 userpg->time_running = running +
3282 atomic64_read(&event->child_total_time_running); 3399 atomic64_read(&event->child_total_time_running);
3283 3400
3401 perf_update_user_clock(userpg, now);
3402
3284 barrier(); 3403 barrier();
3285 ++userpg->lock; 3404 ++userpg->lock;
3286 preempt_enable(); 3405 preempt_enable();
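userpg->index is now whatever pmu->event_idx() reports (hardware counter number + 1 by default, 0 for software and breakpoint events), and offset is pre-adjusted by prev_count only when that index is non-zero. That is the contract the documented self-monitoring read loop relies on; a hedged x86 userspace sketch, assuming pc points at the event's first mmap'ed page:

#include <stdint.h>
#include <linux/perf_event.h>

static inline uint64_t rdpmc(uint32_t counter)
{
	uint32_t lo, hi;

	asm volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (counter));
	return lo | ((uint64_t)hi << 32);
}

static uint64_t read_self_count(volatile struct perf_event_mmap_page *pc)
{
	uint32_t seq, idx;
	uint64_t count;

	do {
		seq = pc->lock;
		asm volatile("" ::: "memory");
		idx = pc->index;
		count = pc->offset;
		if (idx)			/* 0: rdpmc not usable for this event */
			count += rdpmc(idx - 1);
		asm volatile("" ::: "memory");
	} while (pc->lock != seq);		/* retry if the kernel updated the page */

	return count;
}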
@@ -3538,6 +3657,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3538 event->mmap_user = get_current_user(); 3657 event->mmap_user = get_current_user();
3539 vma->vm_mm->pinned_vm += event->mmap_locked; 3658 vma->vm_mm->pinned_vm += event->mmap_locked;
3540 3659
3660 perf_event_update_userpage(event);
3661
3541unlock: 3662unlock:
3542 if (!ret) 3663 if (!ret)
3543 atomic_inc(&event->mmap_count); 3664 atomic_inc(&event->mmap_count);
@@ -3769,7 +3890,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
3769static void perf_output_read(struct perf_output_handle *handle, 3890static void perf_output_read(struct perf_output_handle *handle,
3770 struct perf_event *event) 3891 struct perf_event *event)
3771{ 3892{
3772 u64 enabled = 0, running = 0; 3893 u64 enabled = 0, running = 0, now;
3773 u64 read_format = event->attr.read_format; 3894 u64 read_format = event->attr.read_format;
3774 3895
3775 /* 3896 /*
@@ -3782,7 +3903,7 @@ static void perf_output_read(struct perf_output_handle *handle,
3782 * NMI context 3903 * NMI context
3783 */ 3904 */
3784 if (read_format & PERF_FORMAT_TOTAL_TIMES) 3905 if (read_format & PERF_FORMAT_TOTAL_TIMES)
3785 calc_timer_values(event, &enabled, &running); 3906 calc_timer_values(event, &now, &enabled, &running);
3786 3907
3787 if (event->attr.read_format & PERF_FORMAT_GROUP) 3908 if (event->attr.read_format & PERF_FORMAT_GROUP)
3788 perf_output_read_group(handle, event, enabled, running); 3909 perf_output_read_group(handle, event, enabled, running);
@@ -3872,6 +3993,24 @@ void perf_output_sample(struct perf_output_handle *handle,
3872 } 3993 }
3873 } 3994 }
3874 } 3995 }
3996
3997 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
3998 if (data->br_stack) {
3999 size_t size;
4000
4001 size = data->br_stack->nr
4002 * sizeof(struct perf_branch_entry);
4003
4004 perf_output_put(handle, data->br_stack->nr);
4005 perf_output_copy(handle, data->br_stack->entries, size);
4006 } else {
4007 /*
4008 * we always store at least the value of nr
4009 */
4010 u64 nr = 0;
4011 perf_output_put(handle, nr);
4012 }
4013 }
3875} 4014}
3876 4015
3877void perf_prepare_sample(struct perf_event_header *header, 4016void perf_prepare_sample(struct perf_event_header *header,
@@ -3914,6 +4053,15 @@ void perf_prepare_sample(struct perf_event_header *header,
3914 WARN_ON_ONCE(size & (sizeof(u64)-1)); 4053 WARN_ON_ONCE(size & (sizeof(u64)-1));
3915 header->size += size; 4054 header->size += size;
3916 } 4055 }
4056
4057 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
4058 int size = sizeof(u64); /* nr */
4059 if (data->br_stack) {
4060 size += data->br_stack->nr
4061 * sizeof(struct perf_branch_entry);
4062 }
4063 header->size += size;
4064 }
3917} 4065}
3918 4066
3919static void perf_event_output(struct perf_event *event, 4067static void perf_event_output(struct perf_event *event,
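With PERF_SAMPLE_BRANCH_STACK set, the sample record gains a u64 entry count followed by that many branch entries, and nr is written even when no stack is available. A hedged sketch of how a consumer might walk that payload; the entry layout is assumed to match the uapi perf_branch_entry introduced by this series (from, to, plus prediction flag bits):

#include <stdint.h>
#include <stdio.h>

struct branch_entry {			/* assumed layout of perf_branch_entry */
	uint64_t from;
	uint64_t to;
	uint64_t flags;			/* mispred:1, predicted:1, reserved:62 */
};

static const void *parse_branch_stack(const void *p)
{
	const uint64_t *nr = p;
	const struct branch_entry *ent = (const struct branch_entry *)(nr + 1);
	uint64_t i;

	for (i = 0; i < *nr; i++)
		printf("branch %llu: %#llx -> %#llx\n",
		       (unsigned long long)i,
		       (unsigned long long)ent[i].from,
		       (unsigned long long)ent[i].to);

	return &ent[*nr];		/* start of the next sample field */
}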
@@ -4509,6 +4657,7 @@ static int __perf_event_overflow(struct perf_event *event,
4509{ 4657{
4510 int events = atomic_read(&event->event_limit); 4658 int events = atomic_read(&event->event_limit);
4511 struct hw_perf_event *hwc = &event->hw; 4659 struct hw_perf_event *hwc = &event->hw;
4660 u64 seq;
4512 int ret = 0; 4661 int ret = 0;
4513 4662
4514 /* 4663 /*
@@ -4518,14 +4667,20 @@ static int __perf_event_overflow(struct perf_event *event,
4518 if (unlikely(!is_sampling_event(event))) 4667 if (unlikely(!is_sampling_event(event)))
4519 return 0; 4668 return 0;
4520 4669
4521 if (unlikely(hwc->interrupts >= max_samples_per_tick)) { 4670 seq = __this_cpu_read(perf_throttled_seq);
4522 if (throttle) { 4671 if (seq != hwc->interrupts_seq) {
4672 hwc->interrupts_seq = seq;
4673 hwc->interrupts = 1;
4674 } else {
4675 hwc->interrupts++;
4676 if (unlikely(throttle
4677 && hwc->interrupts >= max_samples_per_tick)) {
4678 __this_cpu_inc(perf_throttled_count);
4523 hwc->interrupts = MAX_INTERRUPTS; 4679 hwc->interrupts = MAX_INTERRUPTS;
4524 perf_log_throttle(event, 0); 4680 perf_log_throttle(event, 0);
4525 ret = 1; 4681 ret = 1;
4526 } 4682 }
4527 } else 4683 }
4528 hwc->interrupts++;
4529 4684
4530 if (event->attr.freq) { 4685 if (event->attr.freq) {
4531 u64 now = perf_clock(); 4686 u64 now = perf_clock();
@@ -4534,7 +4689,7 @@ static int __perf_event_overflow(struct perf_event *event,
4534 hwc->freq_time_stamp = now; 4689 hwc->freq_time_stamp = now;
4535 4690
4536 if (delta > 0 && delta < 2*TICK_NSEC) 4691 if (delta > 0 && delta < 2*TICK_NSEC)
4537 perf_adjust_period(event, delta, hwc->last_period); 4692 perf_adjust_period(event, delta, hwc->last_period, true);
4538 } 4693 }
4539 4694
4540 /* 4695 /*
@@ -4949,7 +5104,7 @@ fail:
4949 return err; 5104 return err;
4950} 5105}
4951 5106
4952struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; 5107struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
4953 5108
4954static void sw_perf_event_destroy(struct perf_event *event) 5109static void sw_perf_event_destroy(struct perf_event *event)
4955{ 5110{
@@ -4957,7 +5112,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
4957 5112
4958 WARN_ON(event->parent); 5113 WARN_ON(event->parent);
4959 5114
4960 jump_label_dec(&perf_swevent_enabled[event_id]); 5115 static_key_slow_dec(&perf_swevent_enabled[event_id]);
4961 swevent_hlist_put(event); 5116 swevent_hlist_put(event);
4962} 5117}
4963 5118
@@ -4968,6 +5123,12 @@ static int perf_swevent_init(struct perf_event *event)
4968 if (event->attr.type != PERF_TYPE_SOFTWARE) 5123 if (event->attr.type != PERF_TYPE_SOFTWARE)
4969 return -ENOENT; 5124 return -ENOENT;
4970 5125
5126 /*
5127 * no branch sampling for software events
5128 */
5129 if (has_branch_stack(event))
5130 return -EOPNOTSUPP;
5131
4971 switch (event_id) { 5132 switch (event_id) {
4972 case PERF_COUNT_SW_CPU_CLOCK: 5133 case PERF_COUNT_SW_CPU_CLOCK:
4973 case PERF_COUNT_SW_TASK_CLOCK: 5134 case PERF_COUNT_SW_TASK_CLOCK:
@@ -4987,13 +5148,18 @@ static int perf_swevent_init(struct perf_event *event)
4987 if (err) 5148 if (err)
4988 return err; 5149 return err;
4989 5150
4990 jump_label_inc(&perf_swevent_enabled[event_id]); 5151 static_key_slow_inc(&perf_swevent_enabled[event_id]);
4991 event->destroy = sw_perf_event_destroy; 5152 event->destroy = sw_perf_event_destroy;
4992 } 5153 }
4993 5154
4994 return 0; 5155 return 0;
4995} 5156}
4996 5157
5158static int perf_swevent_event_idx(struct perf_event *event)
5159{
5160 return 0;
5161}
5162
4997static struct pmu perf_swevent = { 5163static struct pmu perf_swevent = {
4998 .task_ctx_nr = perf_sw_context, 5164 .task_ctx_nr = perf_sw_context,
4999 5165
@@ -5003,6 +5169,8 @@ static struct pmu perf_swevent = {
5003 .start = perf_swevent_start, 5169 .start = perf_swevent_start,
5004 .stop = perf_swevent_stop, 5170 .stop = perf_swevent_stop,
5005 .read = perf_swevent_read, 5171 .read = perf_swevent_read,
5172
5173 .event_idx = perf_swevent_event_idx,
5006}; 5174};
5007 5175
5008#ifdef CONFIG_EVENT_TRACING 5176#ifdef CONFIG_EVENT_TRACING
@@ -5071,6 +5239,12 @@ static int perf_tp_event_init(struct perf_event *event)
5071 if (event->attr.type != PERF_TYPE_TRACEPOINT) 5239 if (event->attr.type != PERF_TYPE_TRACEPOINT)
5072 return -ENOENT; 5240 return -ENOENT;
5073 5241
5242 /*
5243 * no branch sampling for tracepoint events
5244 */
5245 if (has_branch_stack(event))
5246 return -EOPNOTSUPP;
5247
5074 err = perf_trace_init(event); 5248 err = perf_trace_init(event);
5075 if (err) 5249 if (err)
5076 return err; 5250 return err;
@@ -5089,6 +5263,8 @@ static struct pmu perf_tracepoint = {
5089 .start = perf_swevent_start, 5263 .start = perf_swevent_start,
5090 .stop = perf_swevent_stop, 5264 .stop = perf_swevent_stop,
5091 .read = perf_swevent_read, 5265 .read = perf_swevent_read,
5266
5267 .event_idx = perf_swevent_event_idx,
5092}; 5268};
5093 5269
5094static inline void perf_tp_register(void) 5270static inline void perf_tp_register(void)
@@ -5294,6 +5470,12 @@ static int cpu_clock_event_init(struct perf_event *event)
5294 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) 5470 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
5295 return -ENOENT; 5471 return -ENOENT;
5296 5472
5473 /*
5474 * no branch sampling for software events
5475 */
5476 if (has_branch_stack(event))
5477 return -EOPNOTSUPP;
5478
5297 perf_swevent_init_hrtimer(event); 5479 perf_swevent_init_hrtimer(event);
5298 5480
5299 return 0; 5481 return 0;
@@ -5308,6 +5490,8 @@ static struct pmu perf_cpu_clock = {
5308 .start = cpu_clock_event_start, 5490 .start = cpu_clock_event_start,
5309 .stop = cpu_clock_event_stop, 5491 .stop = cpu_clock_event_stop,
5310 .read = cpu_clock_event_read, 5492 .read = cpu_clock_event_read,
5493
5494 .event_idx = perf_swevent_event_idx,
5311}; 5495};
5312 5496
5313/* 5497/*
@@ -5366,6 +5550,12 @@ static int task_clock_event_init(struct perf_event *event)
5366 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) 5550 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5367 return -ENOENT; 5551 return -ENOENT;
5368 5552
5553 /*
5554 * no branch sampling for software events
5555 */
5556 if (has_branch_stack(event))
5557 return -EOPNOTSUPP;
5558
5369 perf_swevent_init_hrtimer(event); 5559 perf_swevent_init_hrtimer(event);
5370 5560
5371 return 0; 5561 return 0;
@@ -5380,6 +5570,8 @@ static struct pmu perf_task_clock = {
5380 .start = task_clock_event_start, 5570 .start = task_clock_event_start,
5381 .stop = task_clock_event_stop, 5571 .stop = task_clock_event_stop,
5382 .read = task_clock_event_read, 5572 .read = task_clock_event_read,
5573
5574 .event_idx = perf_swevent_event_idx,
5383}; 5575};
5384 5576
5385static void perf_pmu_nop_void(struct pmu *pmu) 5577static void perf_pmu_nop_void(struct pmu *pmu)
@@ -5407,6 +5599,11 @@ static void perf_pmu_cancel_txn(struct pmu *pmu)
5407 perf_pmu_enable(pmu); 5599 perf_pmu_enable(pmu);
5408} 5600}
5409 5601
5602static int perf_event_idx_default(struct perf_event *event)
5603{
5604 return event->hw.idx + 1;
5605}
5606
5410/* 5607/*
5411 * Ensures all contexts with the same task_ctx_nr have the same 5608 * Ensures all contexts with the same task_ctx_nr have the same
5412 * pmu_cpu_context too. 5609 * pmu_cpu_context too.
@@ -5493,6 +5690,7 @@ static int pmu_dev_alloc(struct pmu *pmu)
5493 if (!pmu->dev) 5690 if (!pmu->dev)
5494 goto out; 5691 goto out;
5495 5692
5693 pmu->dev->groups = pmu->attr_groups;
5496 device_initialize(pmu->dev); 5694 device_initialize(pmu->dev);
5497 ret = dev_set_name(pmu->dev, "%s", pmu->name); 5695 ret = dev_set_name(pmu->dev, "%s", pmu->name);
5498 if (ret) 5696 if (ret)
@@ -5596,6 +5794,9 @@ got_cpu_context:
5596 pmu->pmu_disable = perf_pmu_nop_void; 5794 pmu->pmu_disable = perf_pmu_nop_void;
5597 } 5795 }
5598 5796
5797 if (!pmu->event_idx)
5798 pmu->event_idx = perf_event_idx_default;
5799
5599 list_add_rcu(&pmu->entry, &pmus); 5800 list_add_rcu(&pmu->entry, &pmus);
5600 ret = 0; 5801 ret = 0;
5601unlock: 5802unlock:
@@ -5788,7 +5989,7 @@ done:
5788 5989
5789 if (!event->parent) { 5990 if (!event->parent) {
5790 if (event->attach_state & PERF_ATTACH_TASK) 5991 if (event->attach_state & PERF_ATTACH_TASK)
5791 jump_label_inc(&perf_sched_events.key); 5992 static_key_slow_inc(&perf_sched_events.key);
5792 if (event->attr.mmap || event->attr.mmap_data) 5993 if (event->attr.mmap || event->attr.mmap_data)
5793 atomic_inc(&nr_mmap_events); 5994 atomic_inc(&nr_mmap_events);
5794 if (event->attr.comm) 5995 if (event->attr.comm)
@@ -5802,6 +6003,12 @@ done:
5802 return ERR_PTR(err); 6003 return ERR_PTR(err);
5803 } 6004 }
5804 } 6005 }
6006 if (has_branch_stack(event)) {
6007 static_key_slow_inc(&perf_sched_events.key);
6008 if (!(event->attach_state & PERF_ATTACH_TASK))
6009 atomic_inc(&per_cpu(perf_branch_stack_events,
6010 event->cpu));
6011 }
5805 } 6012 }
5806 6013
5807 return event; 6014 return event;
@@ -5871,6 +6078,40 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
5871 if (attr->read_format & ~(PERF_FORMAT_MAX-1)) 6078 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
5872 return -EINVAL; 6079 return -EINVAL;
5873 6080
6081 if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
6082 u64 mask = attr->branch_sample_type;
6083
6084 /* only using defined bits */
6085 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
6086 return -EINVAL;
6087
6088 /* at least one branch bit must be set */
6089 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
6090 return -EINVAL;
6091
6092 /* kernel level capture: check permissions */
6093 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
6094 && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
6095 return -EACCES;
6096
6097 /* propagate priv level, when not set for branch */
6098 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
6099
6100 /* exclude_kernel checked on syscall entry */
6101 if (!attr->exclude_kernel)
6102 mask |= PERF_SAMPLE_BRANCH_KERNEL;
6103
6104 if (!attr->exclude_user)
6105 mask |= PERF_SAMPLE_BRANCH_USER;
6106
6107 if (!attr->exclude_hv)
6108 mask |= PERF_SAMPLE_BRANCH_HV;
6109 /*
6110 * adjust user setting (for HW filter setup)
6111 */
6112 attr->branch_sample_type = mask;
6113 }
6114 }
5874out: 6115out:
5875 return ret; 6116 return ret;
5876 6117
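From userspace, the validation above corresponds to a perf_event_attr set up roughly as in the sketch below (the PERF_SAMPLE_BRANCH_* constants are assumed to come from the uapi additions in this series; glibc has no wrapper, hence the raw syscall):

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int open_branch_sampling_event(pid_t pid)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK;
	/* user-level branches only; with no priv bits set, the kernel would
	 * instead propagate the event's exclude_* settings (see above) */
	attr.branch_sample_type = PERF_SAMPLE_BRANCH_ANY | PERF_SAMPLE_BRANCH_USER;
	attr.exclude_kernel = 1;

	return syscall(__NR_perf_event_open, &attr, pid, -1, -1, 0);
}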
@@ -6026,7 +6267,7 @@ SYSCALL_DEFINE5(perf_event_open,
6026 * - that may need work on context switch 6267 * - that may need work on context switch
6027 */ 6268 */
6028 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); 6269 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6029 jump_label_inc(&perf_sched_events.key); 6270 static_key_slow_inc(&perf_sched_events.key);
6030 } 6271 }
6031 6272
6032 /* 6273 /*
@@ -6906,8 +7147,7 @@ unlock:
6906device_initcall(perf_event_sysfs_init); 7147device_initcall(perf_event_sysfs_init);
6907 7148
6908#ifdef CONFIG_CGROUP_PERF 7149#ifdef CONFIG_CGROUP_PERF
6909static struct cgroup_subsys_state *perf_cgroup_create( 7150static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont)
6910 struct cgroup_subsys *ss, struct cgroup *cont)
6911{ 7151{
6912 struct perf_cgroup *jc; 7152 struct perf_cgroup *jc;
6913 7153
@@ -6924,8 +7164,7 @@ static struct cgroup_subsys_state *perf_cgroup_create(
6924 return &jc->css; 7164 return &jc->css;
6925} 7165}
6926 7166
6927static void perf_cgroup_destroy(struct cgroup_subsys *ss, 7167static void perf_cgroup_destroy(struct cgroup *cont)
6928 struct cgroup *cont)
6929{ 7168{
6930 struct perf_cgroup *jc; 7169 struct perf_cgroup *jc;
6931 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), 7170 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
@@ -6941,8 +7180,7 @@ static int __perf_cgroup_move(void *info)
6941 return 0; 7180 return 0;
6942} 7181}
6943 7182
6944static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 7183static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
6945 struct cgroup_taskset *tset)
6946{ 7184{
6947 struct task_struct *task; 7185 struct task_struct *task;
6948 7186
@@ -6950,8 +7188,8 @@ static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
6950 task_function_call(task, __perf_cgroup_move, task); 7188 task_function_call(task, __perf_cgroup_move, task);
6951} 7189}
6952 7190
6953static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, 7191static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
6954 struct cgroup *old_cgrp, struct task_struct *task) 7192 struct task_struct *task)
6955{ 7193{
6956 /* 7194 /*
6957 * cgroup_exit() is called in the copy_process() failure path. 7195 * cgroup_exit() is called in the copy_process() failure path.
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index b7971d6f38bf..bb38c4d3ee12 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -581,6 +581,12 @@ static int hw_breakpoint_event_init(struct perf_event *bp)
581 if (bp->attr.type != PERF_TYPE_BREAKPOINT) 581 if (bp->attr.type != PERF_TYPE_BREAKPOINT)
582 return -ENOENT; 582 return -ENOENT;
583 583
584 /*
585 * no branch sampling for breakpoint events
586 */
587 if (has_branch_stack(bp))
588 return -EOPNOTSUPP;
589
584 err = register_perf_hw_breakpoint(bp); 590 err = register_perf_hw_breakpoint(bp);
585 if (err) 591 if (err)
586 return err; 592 return err;
@@ -613,6 +619,11 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags)
613 bp->hw.state = PERF_HES_STOPPED; 619 bp->hw.state = PERF_HES_STOPPED;
614} 620}
615 621
622static int hw_breakpoint_event_idx(struct perf_event *bp)
623{
624 return 0;
625}
626
616static struct pmu perf_breakpoint = { 627static struct pmu perf_breakpoint = {
617 .task_ctx_nr = perf_sw_context, /* could eventually get its own */ 628 .task_ctx_nr = perf_sw_context, /* could eventually get its own */
618 629
@@ -622,6 +633,8 @@ static struct pmu perf_breakpoint = {
622 .start = hw_breakpoint_start, 633 .start = hw_breakpoint_start,
623 .stop = hw_breakpoint_stop, 634 .stop = hw_breakpoint_stop,
624 .read = hw_breakpoint_pmu_read, 635 .read = hw_breakpoint_pmu_read,
636
637 .event_idx = hw_breakpoint_event_idx,
625}; 638};
626 639
627int __init init_hw_breakpoint(void) 640int __init init_hw_breakpoint(void)
@@ -651,10 +664,10 @@ int __init init_hw_breakpoint(void)
651 664
652 err_alloc: 665 err_alloc:
653 for_each_possible_cpu(err_cpu) { 666 for_each_possible_cpu(err_cpu) {
654 if (err_cpu == cpu)
655 break;
656 for (i = 0; i < TYPE_MAX; i++) 667 for (i = 0; i < TYPE_MAX; i++)
657 kfree(per_cpu(nr_task_bp_pinned[i], cpu)); 668 kfree(per_cpu(nr_task_bp_pinned[i], cpu));
669 if (err_cpu == cpu)
670 break;
658 } 671 }
659 672
660 return -ENOMEM; 673 return -ENOMEM;
diff --git a/kernel/exit.c b/kernel/exit.c
index 294b1709170d..3db1909faed9 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -52,6 +52,7 @@
52#include <linux/hw_breakpoint.h> 52#include <linux/hw_breakpoint.h>
53#include <linux/oom.h> 53#include <linux/oom.h>
54#include <linux/writeback.h> 54#include <linux/writeback.h>
55#include <linux/shm.h>
55 56
56#include <asm/uaccess.h> 57#include <asm/uaccess.h>
57#include <asm/unistd.h> 58#include <asm/unistd.h>
@@ -424,7 +425,7 @@ void daemonize(const char *name, ...)
424 */ 425 */
425 exit_mm(current); 426 exit_mm(current);
426 /* 427 /*
427 * We don't want to have TIF_FREEZE set if the system-wide hibernation 428 * We don't want to get frozen, in case system-wide hibernation
428 * or suspend transition begins right now. 429 * or suspend transition begins right now.
429 */ 430 */
430 current->flags |= (PF_NOFREEZE | PF_KTHREAD); 431 current->flags |= (PF_NOFREEZE | PF_KTHREAD);
@@ -686,11 +687,11 @@ static void exit_mm(struct task_struct * tsk)
686} 687}
687 688
688/* 689/*
689 * When we die, we re-parent all our children. 690 * When we die, we re-parent all our children, and try to:
690 * Try to give them to another thread in our thread 691 * 1. give them to another thread in our thread group, if such a member exists
691 * group, and if no such member exists, give it to 692 * 2. give it to the first ancestor process which prctl'd itself as a
692 * the child reaper process (ie "init") in our pid 693 * child_subreaper for its children (like a service manager)
693 * space. 694 * 3. give it to the init process (PID 1) in our pid namespace
694 */ 695 */
695static struct task_struct *find_new_reaper(struct task_struct *father) 696static struct task_struct *find_new_reaper(struct task_struct *father)
696 __releases(&tasklist_lock) 697 __releases(&tasklist_lock)
@@ -710,8 +711,11 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
710 711
711 if (unlikely(pid_ns->child_reaper == father)) { 712 if (unlikely(pid_ns->child_reaper == father)) {
712 write_unlock_irq(&tasklist_lock); 713 write_unlock_irq(&tasklist_lock);
713 if (unlikely(pid_ns == &init_pid_ns)) 714 if (unlikely(pid_ns == &init_pid_ns)) {
714 panic("Attempted to kill init!"); 715 panic("Attempted to kill init! exitcode=0x%08x\n",
716 father->signal->group_exit_code ?:
717 father->exit_code);
718 }
715 719
716 zap_pid_ns_processes(pid_ns); 720 zap_pid_ns_processes(pid_ns);
717 write_lock_irq(&tasklist_lock); 721 write_lock_irq(&tasklist_lock);
@@ -721,6 +725,29 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
721 * forget_original_parent() must move them somewhere. 725 * forget_original_parent() must move them somewhere.
722 */ 726 */
723 pid_ns->child_reaper = init_pid_ns.child_reaper; 727 pid_ns->child_reaper = init_pid_ns.child_reaper;
728 } else if (father->signal->has_child_subreaper) {
729 struct task_struct *reaper;
730
731 /*
732 * Find the first ancestor marked as child_subreaper.
733 * Note that the code below checks same_thread_group(reaper,
734 * pid_ns->child_reaper). This is what we need to DTRT in a
735 * PID namespace. However we still need the check above, see
736 * http://marc.info/?l=linux-kernel&m=131385460420380
737 */
738 for (reaper = father->real_parent;
739 reaper != &init_task;
740 reaper = reaper->real_parent) {
741 if (same_thread_group(reaper, pid_ns->child_reaper))
742 break;
743 if (!reaper->signal->is_child_subreaper)
744 continue;
745 thread = reaper;
746 do {
747 if (!(thread->flags & PF_EXITING))
748 return reaper;
749 } while_each_thread(reaper, thread);
750 }
724 } 751 }
725 752
726 return pid_ns->child_reaper; 753 return pid_ns->child_reaper;
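The subreaper walk above is driven from userspace by the new PR_SET_CHILD_SUBREAPER prctl. A minimal sketch of how a service manager would opt in (the prctl value is an assumption taken from the companion uapi change):

#include <sys/prctl.h>

#ifndef PR_SET_CHILD_SUBREAPER
#define PR_SET_CHILD_SUBREAPER 36	/* assumed value, added alongside this patch */
#endif

static int become_child_subreaper(void)
{
	/* orphaned descendants are now re-parented to us instead of init,
	 * so we receive their SIGCHLD and can wait() on them */
	return prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0);
}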
@@ -818,25 +845,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
818 if (group_dead) 845 if (group_dead)
819 kill_orphaned_pgrp(tsk->group_leader, NULL); 846 kill_orphaned_pgrp(tsk->group_leader, NULL);
820 847
821 /* Let father know we died
822 *
823 * Thread signals are configurable, but you aren't going to use
824 * that to send signals to arbitrary processes.
825 * That stops right now.
826 *
827 * If the parent exec id doesn't match the exec id we saved
828 * when we started then we know the parent has changed security
829 * domain.
830 *
831 * If our self_exec id doesn't match our parent_exec_id then
832 * we have changed execution domain as these two values started
833 * the same after a fork.
834 */
835 if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD &&
836 (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
837 tsk->self_exec_id != tsk->parent_exec_id))
838 tsk->exit_signal = SIGCHLD;
839
840 if (unlikely(tsk->ptrace)) { 848 if (unlikely(tsk->ptrace)) {
841 int sig = thread_group_leader(tsk) && 849 int sig = thread_group_leader(tsk) &&
842 thread_group_empty(tsk) && 850 thread_group_empty(tsk) &&
@@ -935,8 +943,6 @@ void do_exit(long code)
935 schedule(); 943 schedule();
936 } 944 }
937 945
938 exit_irq_thread();
939
940 exit_signals(tsk); /* sets PF_EXITING */ 946 exit_signals(tsk); /* sets PF_EXITING */
941 /* 947 /*
942 * tsk->flags are checked in the futex code to protect against 948 * tsk->flags are checked in the futex code to protect against
@@ -945,6 +951,8 @@ void do_exit(long code)
945 smp_mb(); 951 smp_mb();
946 raw_spin_unlock_wait(&tsk->pi_lock); 952 raw_spin_unlock_wait(&tsk->pi_lock);
947 953
954 exit_irq_thread();
955
948 if (unlikely(in_atomic())) 956 if (unlikely(in_atomic()))
949 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", 957 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
950 current->comm, task_pid_nr(current), 958 current->comm, task_pid_nr(current),
@@ -953,7 +961,7 @@ void do_exit(long code)
953 acct_update_integrals(tsk); 961 acct_update_integrals(tsk);
954 /* sync mm's RSS info before statistics gathering */ 962 /* sync mm's RSS info before statistics gathering */
955 if (tsk->mm) 963 if (tsk->mm)
956 sync_mm_rss(tsk, tsk->mm); 964 sync_mm_rss(tsk->mm);
957 group_dead = atomic_dec_and_test(&tsk->signal->live); 965 group_dead = atomic_dec_and_test(&tsk->signal->live);
958 if (group_dead) { 966 if (group_dead) {
959 hrtimer_cancel(&tsk->signal->real_timer); 967 hrtimer_cancel(&tsk->signal->real_timer);
@@ -1038,6 +1046,22 @@ void do_exit(long code)
1038 if (tsk->nr_dirtied) 1046 if (tsk->nr_dirtied)
1039 __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); 1047 __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
1040 exit_rcu(); 1048 exit_rcu();
1049
1050 /*
1051 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
1052 * when the following two conditions become true.
1053 * - there is a race on mmap_sem (it is acquired by
1054 * exit_mm()), and
1055 * - an SMI occurs before TASK_RUNNING is set
1056 * (or the hypervisor of a virtual machine switches to another guest).
1057 * As a result, we may become TASK_RUNNING after becoming TASK_DEAD.
1058 *
1059 * To avoid this, we have to wait until tsk->pi_lock, which is held
1060 * by try_to_wake_up(), has been released.
1061 */
1062 smp_mb();
1063 raw_spin_unlock_wait(&tsk->pi_lock);
1064
1041 /* causes final put_task_struct in finish_task_switch(). */ 1065 /* causes final put_task_struct in finish_task_switch(). */
1042 tsk->state = TASK_DEAD; 1066 tsk->state = TASK_DEAD;
1043 tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ 1067 tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
diff --git a/kernel/fork.c b/kernel/fork.c
index 051f090d40c1..b9372a0bff18 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -66,6 +66,7 @@
66#include <linux/user-return-notifier.h> 66#include <linux/user-return-notifier.h>
67#include <linux/oom.h> 67#include <linux/oom.h>
68#include <linux/khugepaged.h> 68#include <linux/khugepaged.h>
69#include <linux/signalfd.h>
69 70
70#include <asm/pgtable.h> 71#include <asm/pgtable.h>
71#include <asm/pgalloc.h> 72#include <asm/pgalloc.h>
@@ -192,6 +193,7 @@ void __put_task_struct(struct task_struct *tsk)
192 WARN_ON(atomic_read(&tsk->usage)); 193 WARN_ON(atomic_read(&tsk->usage));
193 WARN_ON(tsk == current); 194 WARN_ON(tsk == current);
194 195
196 security_task_free(tsk);
195 exit_creds(tsk); 197 exit_creds(tsk);
196 delayacct_tsk_free(tsk); 198 delayacct_tsk_free(tsk);
197 put_signal_struct(tsk->signal); 199 put_signal_struct(tsk->signal);
@@ -354,7 +356,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
354 charge = 0; 356 charge = 0;
355 if (mpnt->vm_flags & VM_ACCOUNT) { 357 if (mpnt->vm_flags & VM_ACCOUNT) {
356 unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; 358 unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
357 if (security_vm_enough_memory(len)) 359 if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
358 goto fail_nomem; 360 goto fail_nomem;
359 charge = len; 361 charge = len;
360 } 362 }
@@ -510,6 +512,23 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
510 return NULL; 512 return NULL;
511} 513}
512 514
515static void check_mm(struct mm_struct *mm)
516{
517 int i;
518
519 for (i = 0; i < NR_MM_COUNTERS; i++) {
520 long x = atomic_long_read(&mm->rss_stat.count[i]);
521
522 if (unlikely(x))
523 printk(KERN_ALERT "BUG: Bad rss-counter state "
524 "mm:%p idx:%d val:%ld\n", mm, i, x);
525 }
526
527#ifdef CONFIG_TRANSPARENT_HUGEPAGE
528 VM_BUG_ON(mm->pmd_huge_pte);
529#endif
530}
531
513/* 532/*
514 * Allocate and initialize an mm_struct. 533 * Allocate and initialize an mm_struct.
515 */ 534 */
@@ -537,9 +556,7 @@ void __mmdrop(struct mm_struct *mm)
537 mm_free_pgd(mm); 556 mm_free_pgd(mm);
538 destroy_context(mm); 557 destroy_context(mm);
539 mmu_notifier_mm_destroy(mm); 558 mmu_notifier_mm_destroy(mm);
540#ifdef CONFIG_TRANSPARENT_HUGEPAGE 559 check_mm(mm);
541 VM_BUG_ON(mm->pmd_huge_pte);
542#endif
543 free_mm(mm); 560 free_mm(mm);
544} 561}
545EXPORT_SYMBOL_GPL(__mmdrop); 562EXPORT_SYMBOL_GPL(__mmdrop);
@@ -647,6 +664,58 @@ struct mm_struct *get_task_mm(struct task_struct *task)
647} 664}
648EXPORT_SYMBOL_GPL(get_task_mm); 665EXPORT_SYMBOL_GPL(get_task_mm);
649 666
667struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
668{
669 struct mm_struct *mm;
670 int err;
671
672 err = mutex_lock_killable(&task->signal->cred_guard_mutex);
673 if (err)
674 return ERR_PTR(err);
675
676 mm = get_task_mm(task);
677 if (mm && mm != current->mm &&
678 !ptrace_may_access(task, mode)) {
679 mmput(mm);
680 mm = ERR_PTR(-EACCES);
681 }
682 mutex_unlock(&task->signal->cred_guard_mutex);
683
684 return mm;
685}
686
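A hedged sketch of the calling convention the new mm_access() helper implies for its /proc-style users (the surrounding function and error choices are illustrative only):

static int inspect_task_mm(struct task_struct *task)
{
	struct mm_struct *mm;

	mm = mm_access(task, PTRACE_MODE_READ);
	if (IS_ERR(mm))
		return PTR_ERR(mm);	/* -EINTR or -EACCES */
	if (!mm)
		return -ENOENT;		/* kernel thread or exiting task: no mm */

	/* ... walk the mm, taking mmap_sem as needed ... */

	mmput(mm);			/* drop the reference taken via get_task_mm() */
	return 0;
}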
687static void complete_vfork_done(struct task_struct *tsk)
688{
689 struct completion *vfork;
690
691 task_lock(tsk);
692 vfork = tsk->vfork_done;
693 if (likely(vfork)) {
694 tsk->vfork_done = NULL;
695 complete(vfork);
696 }
697 task_unlock(tsk);
698}
699
700static int wait_for_vfork_done(struct task_struct *child,
701 struct completion *vfork)
702{
703 int killed;
704
705 freezer_do_not_count();
706 killed = wait_for_completion_killable(vfork);
707 freezer_count();
708
709 if (killed) {
710 task_lock(child);
711 child->vfork_done = NULL;
712 task_unlock(child);
713 }
714
715 put_task_struct(child);
716 return killed;
717}
718
650/* Please note the differences between mmput and mm_release. 719/* Please note the differences between mmput and mm_release.
651 * mmput is called whenever we stop holding onto a mm_struct, 720 * mmput is called whenever we stop holding onto a mm_struct,
652 * error success whatever. 721 * error success whatever.
@@ -662,8 +731,6 @@ EXPORT_SYMBOL_GPL(get_task_mm);
662 */ 731 */
663void mm_release(struct task_struct *tsk, struct mm_struct *mm) 732void mm_release(struct task_struct *tsk, struct mm_struct *mm)
664{ 733{
665 struct completion *vfork_done = tsk->vfork_done;
666
667 /* Get rid of any futexes when releasing the mm */ 734 /* Get rid of any futexes when releasing the mm */
668#ifdef CONFIG_FUTEX 735#ifdef CONFIG_FUTEX
669 if (unlikely(tsk->robust_list)) { 736 if (unlikely(tsk->robust_list)) {
@@ -683,17 +750,15 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
683 /* Get rid of any cached register state */ 750 /* Get rid of any cached register state */
684 deactivate_mm(tsk, mm); 751 deactivate_mm(tsk, mm);
685 752
686 /* notify parent sleeping on vfork() */ 753 if (tsk->vfork_done)
687 if (vfork_done) { 754 complete_vfork_done(tsk);
688 tsk->vfork_done = NULL;
689 complete(vfork_done);
690 }
691 755
692 /* 756 /*
693 * If we're exiting normally, clear a user-space tid field if 757 * If we're exiting normally, clear a user-space tid field if
694 * requested. We leave this alone when dying by signal, to leave 758 * requested. We leave this alone when dying by signal, to leave
695 * the value intact in a core dump, and to save the unnecessary 759 * the value intact in a core dump, and to save the unnecessary
696 * trouble otherwise. Userland only wants this done for a sys_exit. 760 * trouble, say, a killed vfork parent shouldn't touch this mm.
761 * Userland only wants this done for a sys_exit.
697 */ 762 */
698 if (tsk->clear_child_tid) { 763 if (tsk->clear_child_tid) {
699 if (!(tsk->flags & PF_SIGNALED) && 764 if (!(tsk->flags & PF_SIGNALED) &&
@@ -890,7 +955,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
890 return -ENOMEM; 955 return -ENOMEM;
891 956
892 new_ioc->ioprio = ioc->ioprio; 957 new_ioc->ioprio = ioc->ioprio;
893 put_io_context(new_ioc, NULL); 958 put_io_context(new_ioc);
894 } 959 }
895#endif 960#endif
896 return 0; 961 return 0;
@@ -915,8 +980,10 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
915 980
916void __cleanup_sighand(struct sighand_struct *sighand) 981void __cleanup_sighand(struct sighand_struct *sighand)
917{ 982{
918 if (atomic_dec_and_test(&sighand->count)) 983 if (atomic_dec_and_test(&sighand->count)) {
984 signalfd_cleanup(sighand);
919 kmem_cache_free(sighand_cachep, sighand); 985 kmem_cache_free(sighand_cachep, sighand);
986 }
920} 987}
921 988
922 989
@@ -984,6 +1051,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
984 sig->oom_score_adj = current->signal->oom_score_adj; 1051 sig->oom_score_adj = current->signal->oom_score_adj;
985 sig->oom_score_adj_min = current->signal->oom_score_adj_min; 1052 sig->oom_score_adj_min = current->signal->oom_score_adj_min;
986 1053
1054 sig->has_child_subreaper = current->signal->has_child_subreaper ||
1055 current->signal->is_child_subreaper;
1056
987 mutex_init(&sig->cred_guard_mutex); 1057 mutex_init(&sig->cred_guard_mutex);
988 1058
989 return 0; 1059 return 0;
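The has_child_subreaper flag copied above supports the child-subreaper facility exposed to userspace as PR_SET_CHILD_SUBREAPER: a process that sets it has orphaned descendants reparented to itself instead of to init. A minimal userspace sketch of that behaviour follows; it is not part of the patch and assumes prctl() plus the PR_SET_CHILD_SUBREAPER constant (value 36) are available.

#include <stdio.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <sys/wait.h>

#ifndef PR_SET_CHILD_SUBREAPER
#define PR_SET_CHILD_SUBREAPER 36       /* not yet in older headers */
#endif

int main(void)
{
        if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) != 0)
                perror("prctl");        /* kernel without the feature */

        pid_t child = fork();
        if (child == 0) {
                pid_t grandchild = fork();
                if (grandchild == 0) {
                        sleep(1);       /* outlive the middle process */
                        printf("grandchild: reparented to %d\n", (int)getppid());
                        _exit(0);
                }
                _exit(0);               /* orphan the grandchild */
        }

        waitpid(child, NULL, 0);        /* reap the middle process... */
        wait(NULL);                     /* ...then the reparented grandchild */
        return 0;
}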
@@ -995,7 +1065,6 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
995 1065
996 new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); 1066 new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
997 new_flags |= PF_FORKNOEXEC; 1067 new_flags |= PF_FORKNOEXEC;
998 new_flags |= PF_STARTING;
999 p->flags = new_flags; 1068 p->flags = new_flags;
1000} 1069}
1001 1070
@@ -1172,6 +1241,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1172#ifdef CONFIG_CPUSETS 1241#ifdef CONFIG_CPUSETS
1173 p->cpuset_mem_spread_rotor = NUMA_NO_NODE; 1242 p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
1174 p->cpuset_slab_spread_rotor = NUMA_NO_NODE; 1243 p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
1244 seqcount_init(&p->mems_allowed_seq);
1175#endif 1245#endif
1176#ifdef CONFIG_TRACE_IRQFLAGS 1246#ifdef CONFIG_TRACE_IRQFLAGS
1177 p->irq_events = 0; 1247 p->irq_events = 0;
@@ -1290,7 +1360,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1290 clear_all_latency_tracing(p); 1360 clear_all_latency_tracing(p);
1291 1361
1292 /* ok, now we should be set up.. */ 1362 /* ok, now we should be set up.. */
1293 p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); 1363 if (clone_flags & CLONE_THREAD)
1364 p->exit_signal = -1;
1365 else if (clone_flags & CLONE_PARENT)
1366 p->exit_signal = current->group_leader->exit_signal;
1367 else
1368 p->exit_signal = (clone_flags & CSIGNAL);
1369
1294 p->pdeath_signal = 0; 1370 p->pdeath_signal = 0;
1295 p->exit_state = 0; 1371 p->exit_state = 0;
1296 1372
@@ -1525,16 +1601,9 @@ long do_fork(unsigned long clone_flags,
1525 if (clone_flags & CLONE_VFORK) { 1601 if (clone_flags & CLONE_VFORK) {
1526 p->vfork_done = &vfork; 1602 p->vfork_done = &vfork;
1527 init_completion(&vfork); 1603 init_completion(&vfork);
1604 get_task_struct(p);
1528 } 1605 }
1529 1606
1530 /*
1531 * We set PF_STARTING at creation in case tracing wants to
1532 * use this to distinguish a fully live task from one that
1533 * hasn't finished SIGSTOP raising yet. Now we clear it
1534 * and set the child going.
1535 */
1536 p->flags &= ~PF_STARTING;
1537
1538 wake_up_new_task(p); 1607 wake_up_new_task(p);
1539 1608
1540 /* forking complete and child started to run, tell ptracer */ 1609 /* forking complete and child started to run, tell ptracer */
@@ -1542,10 +1611,8 @@ long do_fork(unsigned long clone_flags,
1542 ptrace_event(trace, nr); 1611 ptrace_event(trace, nr);
1543 1612
1544 if (clone_flags & CLONE_VFORK) { 1613 if (clone_flags & CLONE_VFORK) {
1545 freezer_do_not_count(); 1614 if (!wait_for_vfork_done(p, &vfork))
1546 wait_for_completion(&vfork); 1615 ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
1547 freezer_count();
1548 ptrace_event(PTRACE_EVENT_VFORK_DONE, nr);
1549 } 1616 }
1550 } else { 1617 } else {
1551 nr = PTR_ERR(p); 1618 nr = PTR_ERR(p);
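The do_fork() changes above take a reference on the child and wait for the vfork completion killably rather than uninterruptibly, while complete_vfork_done()/wait_for_vfork_done() clear vfork_done under task_lock() so a killed parent and an exiting child cannot race on the completion. The userspace-visible contract is unchanged; the small sketch below (illustration only, not derived from the patch) shows it: the parent stays blocked until the child execs or exits.

#include <stdio.h>
#include <unistd.h>
#include <time.h>
#include <sys/wait.h>

int main(void)
{
        time_t start = time(NULL);
        pid_t pid = vfork();            /* same contract as clone(CLONE_VFORK) */

        if (pid == 0) {
                /* In a vfork child only _exit() and exec*() are safe. */
                execlp("sleep", "sleep", "2", (char *)NULL);
                _exit(127);             /* exec failed */
        }

        /* Released as soon as the child execs, not when it exits... */
        printf("parent released after ~%ld s\n", (long)(time(NULL) - start));

        /* ...the child itself finishes about two seconds later. */
        waitpid(pid, NULL, 0);
        printf("child reaped after ~%ld s\n", (long)(time(NULL) - start));
        return 0;
}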
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 9815b8d1eed5..11f82a4d4eae 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -99,9 +99,9 @@ static void fake_signal_wake_up(struct task_struct *p)
99 * freeze_task - send a freeze request to given task 99 * freeze_task - send a freeze request to given task
100 * @p: task to send the request to 100 * @p: task to send the request to
101 * 101 *
102 * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE 102 * If @p is freezing, the freeze request is sent either by sending a fake
103 * flag and either sending a fake signal to it or waking it up, depending 103 * signal (if it's not a kernel thread) or waking it up (if it's a kernel
104 * on whether it has %PF_FREEZER_NOSIG set. 104 * thread).
105 * 105 *
106 * RETURNS: 106 * RETURNS:
107 * %false, if @p is not freezing or already frozen; %true, otherwise 107 * %false, if @p is not freezing or already frozen; %true, otherwise
diff --git a/kernel/futex.c b/kernel/futex.c
index 1614be20173d..72efa1e4359a 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2628,7 +2628,7 @@ void exit_robust_list(struct task_struct *curr)
2628long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, 2628long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2629 u32 __user *uaddr2, u32 val2, u32 val3) 2629 u32 __user *uaddr2, u32 val2, u32 val3)
2630{ 2630{
2631 int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK; 2631 int cmd = op & FUTEX_CMD_MASK;
2632 unsigned int flags = 0; 2632 unsigned int flags = 0;
2633 2633
2634 if (!(op & FUTEX_PRIVATE_FLAG)) 2634 if (!(op & FUTEX_PRIVATE_FLAG))
@@ -2641,49 +2641,44 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2641 } 2641 }
2642 2642
2643 switch (cmd) { 2643 switch (cmd) {
2644 case FUTEX_LOCK_PI:
2645 case FUTEX_UNLOCK_PI:
2646 case FUTEX_TRYLOCK_PI:
2647 case FUTEX_WAIT_REQUEUE_PI:
2648 case FUTEX_CMP_REQUEUE_PI:
2649 if (!futex_cmpxchg_enabled)
2650 return -ENOSYS;
2651 }
2652
2653 switch (cmd) {
2644 case FUTEX_WAIT: 2654 case FUTEX_WAIT:
2645 val3 = FUTEX_BITSET_MATCH_ANY; 2655 val3 = FUTEX_BITSET_MATCH_ANY;
2646 case FUTEX_WAIT_BITSET: 2656 case FUTEX_WAIT_BITSET:
2647 ret = futex_wait(uaddr, flags, val, timeout, val3); 2657 return futex_wait(uaddr, flags, val, timeout, val3);
2648 break;
2649 case FUTEX_WAKE: 2658 case FUTEX_WAKE:
2650 val3 = FUTEX_BITSET_MATCH_ANY; 2659 val3 = FUTEX_BITSET_MATCH_ANY;
2651 case FUTEX_WAKE_BITSET: 2660 case FUTEX_WAKE_BITSET:
2652 ret = futex_wake(uaddr, flags, val, val3); 2661 return futex_wake(uaddr, flags, val, val3);
2653 break;
2654 case FUTEX_REQUEUE: 2662 case FUTEX_REQUEUE:
2655 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); 2663 return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
2656 break;
2657 case FUTEX_CMP_REQUEUE: 2664 case FUTEX_CMP_REQUEUE:
2658 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); 2665 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
2659 break;
2660 case FUTEX_WAKE_OP: 2666 case FUTEX_WAKE_OP:
2661 ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); 2667 return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
2662 break;
2663 case FUTEX_LOCK_PI: 2668 case FUTEX_LOCK_PI:
2664 if (futex_cmpxchg_enabled) 2669 return futex_lock_pi(uaddr, flags, val, timeout, 0);
2665 ret = futex_lock_pi(uaddr, flags, val, timeout, 0);
2666 break;
2667 case FUTEX_UNLOCK_PI: 2670 case FUTEX_UNLOCK_PI:
2668 if (futex_cmpxchg_enabled) 2671 return futex_unlock_pi(uaddr, flags);
2669 ret = futex_unlock_pi(uaddr, flags);
2670 break;
2671 case FUTEX_TRYLOCK_PI: 2672 case FUTEX_TRYLOCK_PI:
2672 if (futex_cmpxchg_enabled) 2673 return futex_lock_pi(uaddr, flags, 0, timeout, 1);
2673 ret = futex_lock_pi(uaddr, flags, 0, timeout, 1);
2674 break;
2675 case FUTEX_WAIT_REQUEUE_PI: 2674 case FUTEX_WAIT_REQUEUE_PI:
2676 val3 = FUTEX_BITSET_MATCH_ANY; 2675 val3 = FUTEX_BITSET_MATCH_ANY;
2677 ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, 2676 return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
2678 uaddr2); 2677 uaddr2);
2679 break;
2680 case FUTEX_CMP_REQUEUE_PI: 2678 case FUTEX_CMP_REQUEUE_PI:
2681 ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); 2679 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
2682 break;
2683 default:
2684 ret = -ENOSYS;
2685 } 2680 }
2686 return ret; 2681 return -ENOSYS;
2687} 2682}
2688 2683
2689 2684
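The rewritten do_futex() above hoists the futex_cmpxchg_enabled check for the PI and requeue-PI commands and returns from each case directly instead of funnelling everything through a shared ret. For reference, the two commands most callers use, FUTEX_WAIT and FUTEX_WAKE, look like this from userspace (raw syscall, since glibc has no futex() wrapper); the snippet only illustrates the uAPI that do_futex() dispatches.

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <pthread.h>
#include <linux/futex.h>
#include <sys/syscall.h>

static int futex_word;                  /* the 32-bit word both sides share */

static long futex(int *uaddr, int op, int val)
{
        return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

static void *waiter(void *arg)
{
        /* Sleep only while the word still holds the expected value 0;
         * a spurious return or EAGAIN simply loops and rechecks. */
        while (__sync_fetch_and_add(&futex_word, 0) == 0)
                futex(&futex_word, FUTEX_WAIT, 0);

        printf("waiter: woken, futex_word=%d\n", futex_word);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, waiter, NULL);
        sleep(1);

        __sync_lock_test_and_set(&futex_word, 1);       /* publish new value */
        futex(&futex_word, FUTEX_WAKE, 1);              /* wake one waiter */

        pthread_join(t, NULL);
        return 0;
}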
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 2e48ec0c2e91..c21449f85a2a 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -119,15 +119,20 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
119 * For preemptible RCU it is sufficient to call rcu_read_unlock in order 119 * For preemptible RCU it is sufficient to call rcu_read_unlock in order
120 * to exit the grace period. For classic RCU, a reschedule is required. 120 * to exit the grace period. For classic RCU, a reschedule is required.
121 */ 121 */
122static void rcu_lock_break(struct task_struct *g, struct task_struct *t) 122static bool rcu_lock_break(struct task_struct *g, struct task_struct *t)
123{ 123{
124 bool can_cont;
125
124 get_task_struct(g); 126 get_task_struct(g);
125 get_task_struct(t); 127 get_task_struct(t);
126 rcu_read_unlock(); 128 rcu_read_unlock();
127 cond_resched(); 129 cond_resched();
128 rcu_read_lock(); 130 rcu_read_lock();
131 can_cont = pid_alive(g) && pid_alive(t);
129 put_task_struct(t); 132 put_task_struct(t);
130 put_task_struct(g); 133 put_task_struct(g);
134
135 return can_cont;
131} 136}
132 137
133/* 138/*
@@ -154,9 +159,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
154 goto unlock; 159 goto unlock;
155 if (!--batch_count) { 160 if (!--batch_count) {
156 batch_count = HUNG_TASK_BATCHING; 161 batch_count = HUNG_TASK_BATCHING;
157 rcu_lock_break(g, t); 162 if (!rcu_lock_break(g, t))
158 /* Exit if t or g was unhashed during refresh. */
159 if (t->state == TASK_DEAD || g->state == TASK_DEAD)
160 goto unlock; 163 goto unlock;
161 } 164 }
162 /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ 165 /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
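rcu_lock_break() now tells its caller whether both tasks survived the window in which the RCU read lock was dropped (pid_alive() on each), replacing the weaker TASK_DEAD check. The pattern is the usual one for any long scan that periodically drops its lock: revalidate the cursor before continuing. A userspace analogy (not kernel code) of that contract:

#include <stdbool.h>
#include <stdio.h>
#include <pthread.h>
#include <sched.h>

struct node { bool unhashed; };

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node nodes[1000];

/* Drop the lock to let others run, then report whether @n is still valid. */
static bool lock_break(struct node *n)
{
        pthread_mutex_unlock(&list_lock);
        sched_yield();                  /* give writers a chance */
        pthread_mutex_lock(&list_lock);
        return !n->unhashed;            /* caller must bail out if false */
}

int main(void)
{
        int i, batch = 0;

        pthread_mutex_lock(&list_lock);
        for (i = 0; i < 1000; i++) {
                /* ... examine nodes[i] ... */
                if (++batch == 128) {
                        batch = 0;
                        if (!lock_break(&nodes[i]))
                                break;  /* cursor went away while unlocked */
                }
        }
        pthread_mutex_unlock(&list_lock);
        printf("scanned up to index %d\n", i);
        return 0;
}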
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 1f2dece9ad4c..cf1a4a68ce44 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -56,6 +56,16 @@ config GENERIC_IRQ_CHIP
56config IRQ_DOMAIN 56config IRQ_DOMAIN
57 bool 57 bool
58 58
59config IRQ_DOMAIN_DEBUG
60 bool "Expose hardware/virtual IRQ mapping via debugfs"
61 depends on IRQ_DOMAIN && DEBUG_FS
62 help
63 This option will show the mapping relationship between hardware irq
64 numbers and Linux irq numbers. The mapping is exposed via debugfs
65 in the file "virq_mapping".
66
67 If you don't know what this means you don't need it.
68
59# Support forced irq threading 69# Support forced irq threading
60config IRQ_FORCED_THREADING 70config IRQ_FORCED_THREADING
61 bool 71 bool
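When the option is enabled, irq_debugfs_init() in irqdomain.c further down creates the table as "irq_domain_mapping" directly under the debugfs root, which is typically mounted at /sys/kernel/debug.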
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 342d8f44e401..0119b9d467ae 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -53,7 +53,7 @@ unsigned long probe_irq_on(void)
53 if (desc->irq_data.chip->irq_set_type) 53 if (desc->irq_data.chip->irq_set_type)
54 desc->irq_data.chip->irq_set_type(&desc->irq_data, 54 desc->irq_data.chip->irq_set_type(&desc->irq_data,
55 IRQ_TYPE_PROBE); 55 IRQ_TYPE_PROBE);
56 irq_startup(desc); 56 irq_startup(desc, false);
57 } 57 }
58 raw_spin_unlock_irq(&desc->lock); 58 raw_spin_unlock_irq(&desc->lock);
59 } 59 }
@@ -70,7 +70,7 @@ unsigned long probe_irq_on(void)
70 raw_spin_lock_irq(&desc->lock); 70 raw_spin_lock_irq(&desc->lock);
71 if (!desc->action && irq_settings_can_probe(desc)) { 71 if (!desc->action && irq_settings_can_probe(desc)) {
72 desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; 72 desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
73 if (irq_startup(desc)) 73 if (irq_startup(desc, false))
74 desc->istate |= IRQS_PENDING; 74 desc->istate |= IRQS_PENDING;
75 } 75 }
76 raw_spin_unlock_irq(&desc->lock); 76 raw_spin_unlock_irq(&desc->lock);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index f7c543a801d9..6080f6bc8c33 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -16,6 +16,8 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
18 18
19#include <trace/events/irq.h>
20
19#include "internals.h" 21#include "internals.h"
20 22
21/** 23/**
@@ -61,8 +63,7 @@ int irq_set_irq_type(unsigned int irq, unsigned int type)
61 return -EINVAL; 63 return -EINVAL;
62 64
63 type &= IRQ_TYPE_SENSE_MASK; 65 type &= IRQ_TYPE_SENSE_MASK;
64 if (type != IRQ_TYPE_NONE) 66 ret = __irq_set_trigger(desc, irq, type);
65 ret = __irq_set_trigger(desc, irq, type);
66 irq_put_desc_busunlock(desc, flags); 67 irq_put_desc_busunlock(desc, flags);
67 return ret; 68 return ret;
68} 69}
@@ -157,19 +158,22 @@ static void irq_state_set_masked(struct irq_desc *desc)
157 irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); 158 irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
158} 159}
159 160
160int irq_startup(struct irq_desc *desc) 161int irq_startup(struct irq_desc *desc, bool resend)
161{ 162{
163 int ret = 0;
164
162 irq_state_clr_disabled(desc); 165 irq_state_clr_disabled(desc);
163 desc->depth = 0; 166 desc->depth = 0;
164 167
165 if (desc->irq_data.chip->irq_startup) { 168 if (desc->irq_data.chip->irq_startup) {
166 int ret = desc->irq_data.chip->irq_startup(&desc->irq_data); 169 ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
167 irq_state_clr_masked(desc); 170 irq_state_clr_masked(desc);
168 return ret; 171 } else {
172 irq_enable(desc);
169 } 173 }
170 174 if (resend)
171 irq_enable(desc); 175 check_irq_resend(desc, desc->irq_data.irq);
172 return 0; 176 return ret;
173} 177}
174 178
175void irq_shutdown(struct irq_desc *desc) 179void irq_shutdown(struct irq_desc *desc)
@@ -330,6 +334,24 @@ out_unlock:
330} 334}
331EXPORT_SYMBOL_GPL(handle_simple_irq); 335EXPORT_SYMBOL_GPL(handle_simple_irq);
332 336
337/*
338 * Called unconditionally from handle_level_irq() and only for oneshot
339 * interrupts from handle_fasteoi_irq()
340 */
341static void cond_unmask_irq(struct irq_desc *desc)
342{
343 /*
344 * We need to unmask in the following cases:
345 * - Standard level irq (IRQF_ONESHOT is not set)
346 * - Oneshot irq which did not wake the thread (caused by a
347 * spurious interrupt or a primary handler handling it
348 * completely).
349 */
350 if (!irqd_irq_disabled(&desc->irq_data) &&
351 irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot)
352 unmask_irq(desc);
353}
354
333/** 355/**
334 * handle_level_irq - Level type irq handler 356 * handle_level_irq - Level type irq handler
335 * @irq: the interrupt number 357 * @irq: the interrupt number
@@ -362,8 +384,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
362 384
363 handle_irq_event(desc); 385 handle_irq_event(desc);
364 386
365 if (!irqd_irq_disabled(&desc->irq_data) && !(desc->istate & IRQS_ONESHOT)) 387 cond_unmask_irq(desc);
366 unmask_irq(desc); 388
367out_unlock: 389out_unlock:
368 raw_spin_unlock(&desc->lock); 390 raw_spin_unlock(&desc->lock);
369} 391}
@@ -417,6 +439,9 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
417 preflow_handler(desc); 439 preflow_handler(desc);
418 handle_irq_event(desc); 440 handle_irq_event(desc);
419 441
442 if (desc->istate & IRQS_ONESHOT)
443 cond_unmask_irq(desc);
444
420out_eoi: 445out_eoi:
421 desc->irq_data.chip->irq_eoi(&desc->irq_data); 446 desc->irq_data.chip->irq_eoi(&desc->irq_data);
422out_unlock: 447out_unlock:
@@ -625,7 +650,7 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
625 irq_settings_set_noprobe(desc); 650 irq_settings_set_noprobe(desc);
626 irq_settings_set_norequest(desc); 651 irq_settings_set_norequest(desc);
627 irq_settings_set_nothread(desc); 652 irq_settings_set_nothread(desc);
628 irq_startup(desc); 653 irq_startup(desc, true);
629 } 654 }
630out: 655out:
631 irq_put_desc_busunlock(desc, flags); 656 irq_put_desc_busunlock(desc, flags);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 470d08c82bbe..6ff84e6a954c 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -60,7 +60,7 @@ static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
60 * device interrupt, so no irq storm is lurking. If the 60 * device interrupt, so no irq storm is lurking. If the
61 * RUNTHREAD bit is already set, nothing to do. 61 * RUNTHREAD bit is already set, nothing to do.
62 */ 62 */
63 if (test_bit(IRQTF_DIED, &action->thread_flags) || 63 if ((action->thread->flags & PF_EXITING) ||
64 test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags)) 64 test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags))
65 return; 65 return;
66 66
@@ -110,6 +110,18 @@ static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
110 * threads_oneshot untouched and runs the thread another time. 110 * threads_oneshot untouched and runs the thread another time.
111 */ 111 */
112 desc->threads_oneshot |= action->thread_mask; 112 desc->threads_oneshot |= action->thread_mask;
113
114 /*
115 * We increment the threads_active counter in case we wake up
116 * the irq thread. The irq thread decrements the counter when
117 * it returns from the handler or in the exit path and wakes
118 * up waiters which are stuck in synchronize_irq() when the
119 * active count becomes zero. synchronize_irq() is serialized
120 * against this code (hard irq handler) via IRQS_INPROGRESS
121 * like the finalize_oneshot() code. See comment above.
122 */
123 atomic_inc(&desc->threads_active);
124
113 wake_up_process(action->thread); 125 wake_up_process(action->thread);
114} 126}
115 127
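The comment block added above spells out the contract between the hard-irq path and synchronize_irq(): bump threads_active before waking the handler thread, have the thread drop it when it is done, and let synchronize_irq() sleep until the count reaches zero (the decrement-and-wake side is wake_threads_waitq() in manage.c further down). A userspace analogy with a mutex and condition variable, purely illustrative:

#include <stdio.h>
#include <pthread.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  idle = PTHREAD_COND_INITIALIZER;
static int threads_active;

static void wake_worker(void)           /* hard-irq side in the kernel */
{
        pthread_mutex_lock(&lock);
        threads_active++;               /* account for the worker we wake */
        pthread_mutex_unlock(&lock);
}

static void worker_done(void)           /* wake_threads_waitq() analogue */
{
        pthread_mutex_lock(&lock);
        if (--threads_active == 0)
                pthread_cond_broadcast(&idle);
        pthread_mutex_unlock(&lock);
}

static void synchronize(void)           /* synchronize_irq() analogue */
{
        pthread_mutex_lock(&lock);
        while (threads_active)
                pthread_cond_wait(&idle, &lock);
        pthread_mutex_unlock(&lock);
}

static void *worker(void *arg)
{
        usleep(100000);                 /* pretend to handle the interrupt */
        worker_done();
        return NULL;
}

int main(void)
{
        pthread_t t;

        wake_worker();                  /* count first, then wake, as in the patch */
        pthread_create(&t, NULL, worker, NULL);

        synchronize();                  /* returns only after the worker finished */
        printf("all workers idle\n");

        pthread_join(t, NULL);
        return 0;
}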
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index b7952316016a..8e5c56b3b7d9 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -20,14 +20,12 @@ extern bool noirqdebug;
20/* 20/*
21 * Bits used by threaded handlers: 21 * Bits used by threaded handlers:
22 * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run 22 * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run
23 * IRQTF_DIED - handler thread died
24 * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed 23 * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
25 * IRQTF_AFFINITY - irq thread is requested to adjust affinity 24 * IRQTF_AFFINITY - irq thread is requested to adjust affinity
26 * IRQTF_FORCED_THREAD - irq action is force threaded 25 * IRQTF_FORCED_THREAD - irq action is force threaded
27 */ 26 */
28enum { 27enum {
29 IRQTF_RUNTHREAD, 28 IRQTF_RUNTHREAD,
30 IRQTF_DIED,
31 IRQTF_WARNED, 29 IRQTF_WARNED,
32 IRQTF_AFFINITY, 30 IRQTF_AFFINITY,
33 IRQTF_FORCED_THREAD, 31 IRQTF_FORCED_THREAD,
@@ -67,7 +65,7 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
67extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); 65extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
68extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); 66extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
69 67
70extern int irq_startup(struct irq_desc *desc); 68extern int irq_startup(struct irq_desc *desc, bool resend);
71extern void irq_shutdown(struct irq_desc *desc); 69extern void irq_shutdown(struct irq_desc *desc);
72extern void irq_enable(struct irq_desc *desc); 70extern void irq_enable(struct irq_desc *desc);
73extern void irq_disable(struct irq_desc *desc); 71extern void irq_disable(struct irq_desc *desc);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 1f9e26526b69..3601f3fbf67c 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1,189 +1,793 @@
1#include <linux/debugfs.h>
2#include <linux/hardirq.h>
3#include <linux/interrupt.h>
1#include <linux/irq.h> 4#include <linux/irq.h>
5#include <linux/irqdesc.h>
2#include <linux/irqdomain.h> 6#include <linux/irqdomain.h>
3#include <linux/module.h> 7#include <linux/module.h>
4#include <linux/mutex.h> 8#include <linux/mutex.h>
5#include <linux/of.h> 9#include <linux/of.h>
6#include <linux/of_address.h> 10#include <linux/of_address.h>
11#include <linux/seq_file.h>
7#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/smp.h>
14#include <linux/fs.h>
15
16#define IRQ_DOMAIN_MAP_LEGACY 0 /* driver allocated fixed range of irqs.
17 * ie. legacy 8259, gets irqs 1..15 */
18#define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */
19#define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */
20#define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */
8 21
9static LIST_HEAD(irq_domain_list); 22static LIST_HEAD(irq_domain_list);
10static DEFINE_MUTEX(irq_domain_mutex); 23static DEFINE_MUTEX(irq_domain_mutex);
11 24
25static DEFINE_MUTEX(revmap_trees_mutex);
26static unsigned int irq_virq_count = NR_IRQS;
27static struct irq_domain *irq_default_domain;
28
12/** 29/**
13 * irq_domain_add() - Register an irq_domain 30 * irq_domain_alloc() - Allocate a new irq_domain data structure
14 * @domain: ptr to initialized irq_domain structure 31 * @of_node: optional device-tree node of the interrupt controller
32 * @revmap_type: type of reverse mapping to use
33 * @ops: map/unmap domain callbacks
34 * @host_data: Controller private data pointer
15 * 35 *
 16 * Registers an irq_domain structure. The irq_domain must at a minimum be 36 * Allocates and initializes an irq_domain structure. Caller is expected to
17 * initialized with an ops structure pointer, and either a ->to_irq hook or 37 * register allocated irq_domain with irq_domain_register(). Returns pointer
18 * a valid irq_base value. Everything else is optional. 38 * to IRQ domain, or NULL on failure.
19 */ 39 */
20void irq_domain_add(struct irq_domain *domain) 40static struct irq_domain *irq_domain_alloc(struct device_node *of_node,
41 unsigned int revmap_type,
42 const struct irq_domain_ops *ops,
43 void *host_data)
21{ 44{
22 struct irq_data *d; 45 struct irq_domain *domain;
23 int hwirq, irq;
24 46
25 /* 47 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
26 * This assumes that the irq_domain owner has already allocated 48 if (WARN_ON(!domain))
27 * the irq_descs. This block will be removed when support for dynamic 49 return NULL;
28 * allocation of irq_descs is added to irq_domain. 50
29 */ 51 /* Fill structure */
30 irq_domain_for_each_irq(domain, hwirq, irq) { 52 domain->revmap_type = revmap_type;
31 d = irq_get_irq_data(irq); 53 domain->ops = ops;
32 if (!d) { 54 domain->host_data = host_data;
33 WARN(1, "error: assigning domain to non existant irq_desc"); 55 domain->of_node = of_node_get(of_node);
34 return; 56
35 } 57 return domain;
36 if (d->domain) { 58}
37 /* things are broken; just report, don't clean up */ 59
38 WARN(1, "error: irq_desc already assigned to a domain"); 60static void irq_domain_add(struct irq_domain *domain)
39 return; 61{
62 mutex_lock(&irq_domain_mutex);
63 list_add(&domain->link, &irq_domain_list);
64 mutex_unlock(&irq_domain_mutex);
65 pr_debug("irq: Allocated domain of type %d @0x%p\n",
66 domain->revmap_type, domain);
67}
68
69static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,
70 irq_hw_number_t hwirq)
71{
72 irq_hw_number_t first_hwirq = domain->revmap_data.legacy.first_hwirq;
73 int size = domain->revmap_data.legacy.size;
74
75 if (WARN_ON(hwirq < first_hwirq || hwirq >= first_hwirq + size))
76 return 0;
77 return hwirq - first_hwirq + domain->revmap_data.legacy.first_irq;
78}
79
80/**
81 * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain.
82 * @of_node: pointer to interrupt controller's device tree node.
83 * @size: total number of irqs in legacy mapping
84 * @first_irq: first number of irq block assigned to the domain
85 * @first_hwirq: first hwirq number to use for the translation. Should normally
86 * be '0', but a positive integer can be used if the effective
87 * hwirqs numbering does not begin at zero.
88 * @ops: map/unmap domain callbacks
89 * @host_data: Controller private data pointer
90 *
91 * Note: the map() callback will be called before this function returns
92 * for all legacy interrupts except 0 (which is always the invalid irq for
93 * a legacy controller).
94 */
95struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
96 unsigned int size,
97 unsigned int first_irq,
98 irq_hw_number_t first_hwirq,
99 const struct irq_domain_ops *ops,
100 void *host_data)
101{
102 struct irq_domain *domain;
103 unsigned int i;
104
105 domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LEGACY, ops, host_data);
106 if (!domain)
107 return NULL;
108
109 domain->revmap_data.legacy.first_irq = first_irq;
110 domain->revmap_data.legacy.first_hwirq = first_hwirq;
111 domain->revmap_data.legacy.size = size;
112
113 mutex_lock(&irq_domain_mutex);
114 /* Verify that all the irqs are available */
115 for (i = 0; i < size; i++) {
116 int irq = first_irq + i;
117 struct irq_data *irq_data = irq_get_irq_data(irq);
118
119 if (WARN_ON(!irq_data || irq_data->domain)) {
120 mutex_unlock(&irq_domain_mutex);
121 of_node_put(domain->of_node);
122 kfree(domain);
123 return NULL;
40 } 124 }
41 d->domain = domain;
42 d->hwirq = hwirq;
43 } 125 }
44 126
45 mutex_lock(&irq_domain_mutex); 127 /* Claim all of the irqs before registering a legacy domain */
46 list_add(&domain->list, &irq_domain_list); 128 for (i = 0; i < size; i++) {
129 struct irq_data *irq_data = irq_get_irq_data(first_irq + i);
130 irq_data->hwirq = first_hwirq + i;
131 irq_data->domain = domain;
132 }
47 mutex_unlock(&irq_domain_mutex); 133 mutex_unlock(&irq_domain_mutex);
134
135 for (i = 0; i < size; i++) {
136 int irq = first_irq + i;
137 int hwirq = first_hwirq + i;
138
139 /* IRQ0 gets ignored */
140 if (!irq)
141 continue;
142
143 /* Legacy flags are left to default at this point,
144 * one can then use irq_create_mapping() to
145 * explicitly change them
146 */
147 ops->map(domain, irq, hwirq);
148
149 /* Clear norequest flags */
150 irq_clear_status_flags(irq, IRQ_NOREQUEST);
151 }
152
153 irq_domain_add(domain);
154 return domain;
155}
156
157/**
 158 * irq_domain_add_linear() - Allocate and register a linear revmap irq_domain.
159 * @of_node: pointer to interrupt controller's device tree node.
160 * @ops: map/unmap domain callbacks
161 * @host_data: Controller private data pointer
162 */
163struct irq_domain *irq_domain_add_linear(struct device_node *of_node,
164 unsigned int size,
165 const struct irq_domain_ops *ops,
166 void *host_data)
167{
168 struct irq_domain *domain;
169 unsigned int *revmap;
170
171 revmap = kzalloc(sizeof(*revmap) * size, GFP_KERNEL);
172 if (WARN_ON(!revmap))
173 return NULL;
174
175 domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, ops, host_data);
176 if (!domain) {
177 kfree(revmap);
178 return NULL;
179 }
180 domain->revmap_data.linear.size = size;
181 domain->revmap_data.linear.revmap = revmap;
182 irq_domain_add(domain);
183 return domain;
184}
185
186struct irq_domain *irq_domain_add_nomap(struct device_node *of_node,
187 const struct irq_domain_ops *ops,
188 void *host_data)
189{
190 struct irq_domain *domain = irq_domain_alloc(of_node,
191 IRQ_DOMAIN_MAP_NOMAP, ops, host_data);
192 if (domain)
193 irq_domain_add(domain);
194 return domain;
195}
196
197/**
198 * irq_domain_add_tree()
199 * @of_node: pointer to interrupt controller's device tree node.
200 * @ops: map/unmap domain callbacks
201 *
202 * Note: The radix tree will be allocated later during boot automatically
203 * (the reverse mapping will use the slow path until that happens).
204 */
205struct irq_domain *irq_domain_add_tree(struct device_node *of_node,
206 const struct irq_domain_ops *ops,
207 void *host_data)
208{
209 struct irq_domain *domain = irq_domain_alloc(of_node,
210 IRQ_DOMAIN_MAP_TREE, ops, host_data);
211 if (domain) {
212 INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL);
213 irq_domain_add(domain);
214 }
215 return domain;
48} 216}
49 217
50/** 218/**
51 * irq_domain_del() - Unregister an irq_domain 219 * irq_find_host() - Locates a domain for a given device node
52 * @domain: ptr to registered irq_domain. 220 * @node: device-tree node of the interrupt controller
53 */ 221 */
54void irq_domain_del(struct irq_domain *domain) 222struct irq_domain *irq_find_host(struct device_node *node)
55{ 223{
56 struct irq_data *d; 224 struct irq_domain *h, *found = NULL;
57 int hwirq, irq; 225 int rc;
58 226
227 /* We might want to match the legacy controller last since
228 * it might potentially be set to match all interrupts in
 229 * the absence of a device node. This isn't a problem so far,
 230 * though...
231 */
59 mutex_lock(&irq_domain_mutex); 232 mutex_lock(&irq_domain_mutex);
60 list_del(&domain->list); 233 list_for_each_entry(h, &irq_domain_list, link) {
234 if (h->ops->match)
235 rc = h->ops->match(h, node);
236 else
237 rc = (h->of_node != NULL) && (h->of_node == node);
238
239 if (rc) {
240 found = h;
241 break;
242 }
243 }
61 mutex_unlock(&irq_domain_mutex); 244 mutex_unlock(&irq_domain_mutex);
245 return found;
246}
247EXPORT_SYMBOL_GPL(irq_find_host);
248
249/**
250 * irq_set_default_host() - Set a "default" irq domain
251 * @domain: default domain pointer
252 *
253 * For convenience, it's possible to set a "default" domain that will be used
254 * whenever NULL is passed to irq_create_mapping(). It makes life easier for
255 * platforms that want to manipulate a few hard coded interrupt numbers that
256 * aren't properly represented in the device-tree.
257 */
258void irq_set_default_host(struct irq_domain *domain)
259{
260 pr_debug("irq: Default domain set to @0x%p\n", domain);
261
262 irq_default_domain = domain;
263}
264
265/**
266 * irq_set_virq_count() - Set the maximum number of linux irqs
267 * @count: number of linux irqs, capped with NR_IRQS
268 *
269 * This is mainly for use by platforms like iSeries who want to program
270 * the virtual irq number in the controller to avoid the reverse mapping
271 */
272void irq_set_virq_count(unsigned int count)
273{
274 pr_debug("irq: Trying to set virq count to %d\n", count);
62 275
63 /* Clear the irq_domain assignments */ 276 BUG_ON(count < NUM_ISA_INTERRUPTS);
64 irq_domain_for_each_irq(domain, hwirq, irq) { 277 if (count < NR_IRQS)
65 d = irq_get_irq_data(irq); 278 irq_virq_count = count;
66 d->domain = NULL; 279}
280
281static int irq_setup_virq(struct irq_domain *domain, unsigned int virq,
282 irq_hw_number_t hwirq)
283{
284 struct irq_data *irq_data = irq_get_irq_data(virq);
285
286 irq_data->hwirq = hwirq;
287 irq_data->domain = domain;
288 if (domain->ops->map(domain, virq, hwirq)) {
289 pr_debug("irq: -> mapping failed, freeing\n");
290 irq_data->domain = NULL;
291 irq_data->hwirq = 0;
292 return -1;
67 } 293 }
294
295 irq_clear_status_flags(virq, IRQ_NOREQUEST);
296
297 return 0;
68} 298}
69 299
70#if defined(CONFIG_OF_IRQ)
71/** 300/**
72 * irq_create_of_mapping() - Map a linux irq number from a DT interrupt spec 301 * irq_create_direct_mapping() - Allocate an irq for direct mapping
302 * @domain: domain to allocate the irq for or NULL for default domain
73 * 303 *
74 * Used by the device tree interrupt mapping code to translate a device tree 304 * This routine is used for irq controllers which can choose the hardware
75 * interrupt specifier to a valid linux irq number. Returns either a valid 305 * interrupt numbers they generate. In such a case it's simplest to use
76 * linux IRQ number or 0. 306 * the linux irq as the hardware interrupt number.
307 */
308unsigned int irq_create_direct_mapping(struct irq_domain *domain)
309{
310 unsigned int virq;
311
312 if (domain == NULL)
313 domain = irq_default_domain;
314
315 BUG_ON(domain == NULL);
316 WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP);
317
318 virq = irq_alloc_desc_from(1, 0);
319 if (!virq) {
320 pr_debug("irq: create_direct virq allocation failed\n");
321 return 0;
322 }
323 if (virq >= irq_virq_count) {
324 pr_err("ERROR: no free irqs available below %i maximum\n",
325 irq_virq_count);
326 irq_free_desc(virq);
327 return 0;
328 }
329
330 pr_debug("irq: create_direct obtained virq %d\n", virq);
331
332 if (irq_setup_virq(domain, virq, virq)) {
333 irq_free_desc(virq);
334 return 0;
335 }
336
337 return virq;
338}
339
340/**
341 * irq_create_mapping() - Map a hardware interrupt into linux irq space
342 * @domain: domain owning this hardware interrupt or NULL for default domain
343 * @hwirq: hardware irq number in that domain space
77 * 344 *
78 * When the caller no longer need the irq number returned by this function it 345 * Only one mapping per hardware interrupt is permitted. Returns a linux
79 * should arrange to call irq_dispose_mapping(). 346 * irq number.
347 * If the sense/trigger is to be specified, set_irq_type() should be called
348 * on the number returned from that call.
80 */ 349 */
350unsigned int irq_create_mapping(struct irq_domain *domain,
351 irq_hw_number_t hwirq)
352{
353 unsigned int virq, hint;
354
355 pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
356
 357 /* Look for default domain if necessary */
358 if (domain == NULL)
359 domain = irq_default_domain;
360 if (domain == NULL) {
361 printk(KERN_WARNING "irq_create_mapping called for"
362 " NULL domain, hwirq=%lx\n", hwirq);
363 WARN_ON(1);
364 return 0;
365 }
366 pr_debug("irq: -> using domain @%p\n", domain);
367
368 /* Check if mapping already exists */
369 virq = irq_find_mapping(domain, hwirq);
370 if (virq) {
371 pr_debug("irq: -> existing mapping on virq %d\n", virq);
372 return virq;
373 }
374
375 /* Get a virtual interrupt number */
376 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
377 return irq_domain_legacy_revmap(domain, hwirq);
378
379 /* Allocate a virtual interrupt number */
380 hint = hwirq % irq_virq_count;
381 if (hint == 0)
382 hint++;
383 virq = irq_alloc_desc_from(hint, 0);
384 if (!virq)
385 virq = irq_alloc_desc_from(1, 0);
386 if (!virq) {
387 pr_debug("irq: -> virq allocation failed\n");
388 return 0;
389 }
390
391 if (irq_setup_virq(domain, virq, hwirq)) {
392 if (domain->revmap_type != IRQ_DOMAIN_MAP_LEGACY)
393 irq_free_desc(virq);
394 return 0;
395 }
396
397 pr_debug("irq: irq %lu on domain %s mapped to virtual irq %u\n",
398 hwirq, domain->of_node ? domain->of_node->full_name : "null", virq);
399
400 return virq;
401}
402EXPORT_SYMBOL_GPL(irq_create_mapping);
403
81unsigned int irq_create_of_mapping(struct device_node *controller, 404unsigned int irq_create_of_mapping(struct device_node *controller,
82 const u32 *intspec, unsigned int intsize) 405 const u32 *intspec, unsigned int intsize)
83{ 406{
84 struct irq_domain *domain; 407 struct irq_domain *domain;
85 unsigned long hwirq; 408 irq_hw_number_t hwirq;
86 unsigned int irq, type; 409 unsigned int type = IRQ_TYPE_NONE;
87 int rc = -EINVAL; 410 unsigned int virq;
88 411
89 /* Find a domain which can translate the irq spec */ 412 domain = controller ? irq_find_host(controller) : irq_default_domain;
90 mutex_lock(&irq_domain_mutex); 413 if (!domain) {
91 list_for_each_entry(domain, &irq_domain_list, list) { 414#ifdef CONFIG_MIPS
92 if (!domain->ops->dt_translate) 415 /*
93 continue; 416 * Workaround to avoid breaking interrupt controller drivers
94 rc = domain->ops->dt_translate(domain, controller, 417 * that don't yet register an irq_domain. This is temporary
95 intspec, intsize, &hwirq, &type); 418 * code. ~~~gcl, Feb 24, 2012
96 if (rc == 0) 419 *
97 break; 420 * Scheduled for removal in Linux v3.6. That should be enough
421 * time.
422 */
423 if (intsize > 0)
424 return intspec[0];
425#endif
426 printk(KERN_WARNING "irq: no irq domain found for %s !\n",
427 controller->full_name);
428 return 0;
98 } 429 }
99 mutex_unlock(&irq_domain_mutex);
100 430
101 if (rc != 0) 431 /* If domain has no translation, then we assume interrupt line */
102 return 0; 432 if (domain->ops->xlate == NULL)
433 hwirq = intspec[0];
434 else {
435 if (domain->ops->xlate(domain, controller, intspec, intsize,
436 &hwirq, &type))
437 return 0;
438 }
439
440 /* Create mapping */
441 virq = irq_create_mapping(domain, hwirq);
442 if (!virq)
443 return virq;
103 444
104 irq = irq_domain_to_irq(domain, hwirq); 445 /* Set type if specified and different than the current one */
105 if (type != IRQ_TYPE_NONE) 446 if (type != IRQ_TYPE_NONE &&
106 irq_set_irq_type(irq, type); 447 type != (irqd_get_trigger_type(irq_get_irq_data(virq))))
107 pr_debug("%s: mapped hwirq=%i to irq=%i, flags=%x\n", 448 irq_set_irq_type(virq, type);
108 controller->full_name, (int)hwirq, irq, type); 449 return virq;
109 return irq;
110} 450}
111EXPORT_SYMBOL_GPL(irq_create_of_mapping); 451EXPORT_SYMBOL_GPL(irq_create_of_mapping);
112 452
113/** 453/**
114 * irq_dispose_mapping() - Discard a mapping created by irq_create_of_mapping() 454 * irq_dispose_mapping() - Unmap an interrupt
115 * @irq: linux irq number to be discarded 455 * @virq: linux irq number of the interrupt to unmap
456 */
457void irq_dispose_mapping(unsigned int virq)
458{
459 struct irq_data *irq_data = irq_get_irq_data(virq);
460 struct irq_domain *domain;
461 irq_hw_number_t hwirq;
462
463 if (!virq || !irq_data)
464 return;
465
466 domain = irq_data->domain;
467 if (WARN_ON(domain == NULL))
468 return;
469
470 /* Never unmap legacy interrupts */
471 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
472 return;
473
474 irq_set_status_flags(virq, IRQ_NOREQUEST);
475
476 /* remove chip and handler */
477 irq_set_chip_and_handler(virq, NULL, NULL);
478
479 /* Make sure it's completed */
480 synchronize_irq(virq);
481
482 /* Tell the PIC about it */
483 if (domain->ops->unmap)
484 domain->ops->unmap(domain, virq);
485 smp_mb();
486
487 /* Clear reverse map */
488 hwirq = irq_data->hwirq;
489 switch(domain->revmap_type) {
490 case IRQ_DOMAIN_MAP_LINEAR:
491 if (hwirq < domain->revmap_data.linear.size)
492 domain->revmap_data.linear.revmap[hwirq] = 0;
493 break;
494 case IRQ_DOMAIN_MAP_TREE:
495 mutex_lock(&revmap_trees_mutex);
496 radix_tree_delete(&domain->revmap_data.tree, hwirq);
497 mutex_unlock(&revmap_trees_mutex);
498 break;
499 }
500
501 irq_free_desc(virq);
502}
503EXPORT_SYMBOL_GPL(irq_dispose_mapping);
504
505/**
506 * irq_find_mapping() - Find a linux irq from an hw irq number.
507 * @domain: domain owning this hardware interrupt
508 * @hwirq: hardware irq number in that domain space
509 *
510 * This is a slow path, for use by generic code. It's expected that an
511 * irq controller implementation directly calls the appropriate low level
512 * mapping function.
513 */
514unsigned int irq_find_mapping(struct irq_domain *domain,
515 irq_hw_number_t hwirq)
516{
517 unsigned int i;
518 unsigned int hint = hwirq % irq_virq_count;
519
 520 /* Look for default domain if necessary */
521 if (domain == NULL)
522 domain = irq_default_domain;
523 if (domain == NULL)
524 return 0;
525
526 /* legacy -> bail early */
527 if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY)
528 return irq_domain_legacy_revmap(domain, hwirq);
529
530 /* Slow path does a linear search of the map */
531 if (hint == 0)
532 hint = 1;
533 i = hint;
534 do {
535 struct irq_data *data = irq_get_irq_data(i);
536 if (data && (data->domain == domain) && (data->hwirq == hwirq))
537 return i;
538 i++;
539 if (i >= irq_virq_count)
540 i = 1;
541 } while(i != hint);
542 return 0;
543}
544EXPORT_SYMBOL_GPL(irq_find_mapping);
545
546/**
547 * irq_radix_revmap_lookup() - Find a linux irq from a hw irq number.
548 * @domain: domain owning this hardware interrupt
549 * @hwirq: hardware irq number in that domain space
116 * 550 *
117 * Calling this function indicates the caller no longer needs a reference to 551 * This is a fast path, for use by irq controller code that uses radix tree
118 * the linux irq number returned by a prior call to irq_create_of_mapping(). 552 * revmaps
119 */ 553 */
120void irq_dispose_mapping(unsigned int irq) 554unsigned int irq_radix_revmap_lookup(struct irq_domain *domain,
555 irq_hw_number_t hwirq)
121{ 556{
557 struct irq_data *irq_data;
558
559 if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_TREE))
560 return irq_find_mapping(domain, hwirq);
561
562 /*
563 * Freeing an irq can delete nodes along the path to
564 * do the lookup via call_rcu.
565 */
566 rcu_read_lock();
567 irq_data = radix_tree_lookup(&domain->revmap_data.tree, hwirq);
568 rcu_read_unlock();
569
122 /* 570 /*
123 * nothing yet; will be filled when support for dynamic allocation of 571 * If found in radix tree, then fine.
124 * irq_descs is added to irq_domain 572 * Else fallback to linear lookup - this should not happen in practice
573 * as it means that we failed to insert the node in the radix tree.
125 */ 574 */
575 return irq_data ? irq_data->irq : irq_find_mapping(domain, hwirq);
126} 576}
127EXPORT_SYMBOL_GPL(irq_dispose_mapping);
128 577
129int irq_domain_simple_dt_translate(struct irq_domain *d, 578/**
130 struct device_node *controller, 579 * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping.
131 const u32 *intspec, unsigned int intsize, 580 * @domain: domain owning this hardware interrupt
132 unsigned long *out_hwirq, unsigned int *out_type) 581 * @virq: linux irq number
582 * @hwirq: hardware irq number in that domain space
583 *
584 * This is for use by irq controllers that use a radix tree reverse
585 * mapping for fast lookup.
586 */
587void irq_radix_revmap_insert(struct irq_domain *domain, unsigned int virq,
588 irq_hw_number_t hwirq)
133{ 589{
134 if (d->of_node != controller) 590 struct irq_data *irq_data = irq_get_irq_data(virq);
135 return -EINVAL; 591
136 if (intsize < 1) 592 if (WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_TREE))
137 return -EINVAL; 593 return;
138 if (d->nr_irq && ((intspec[0] < d->hwirq_base) || 594
139 (intspec[0] >= d->hwirq_base + d->nr_irq))) 595 if (virq) {
140 return -EINVAL; 596 mutex_lock(&revmap_trees_mutex);
597 radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data);
598 mutex_unlock(&revmap_trees_mutex);
599 }
600}
601
602/**
603 * irq_linear_revmap() - Find a linux irq from a hw irq number.
604 * @domain: domain owning this hardware interrupt
605 * @hwirq: hardware irq number in that domain space
606 *
607 * This is a fast path, for use by irq controller code that uses linear
 608 * revmaps. It falls back to the slow path if the revmap doesn't exist
609 * yet and will create the revmap entry with appropriate locking
610 */
611unsigned int irq_linear_revmap(struct irq_domain *domain,
612 irq_hw_number_t hwirq)
613{
614 unsigned int *revmap;
615
616 if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR))
617 return irq_find_mapping(domain, hwirq);
618
619 /* Check revmap bounds */
620 if (unlikely(hwirq >= domain->revmap_data.linear.size))
621 return irq_find_mapping(domain, hwirq);
622
623 /* Check if revmap was allocated */
624 revmap = domain->revmap_data.linear.revmap;
625 if (unlikely(revmap == NULL))
626 return irq_find_mapping(domain, hwirq);
627
628 /* Fill up revmap with slow path if no mapping found */
629 if (unlikely(!revmap[hwirq]))
630 revmap[hwirq] = irq_find_mapping(domain, hwirq);
631
632 return revmap[hwirq];
633}
634
635#ifdef CONFIG_IRQ_DOMAIN_DEBUG
636static int virq_debug_show(struct seq_file *m, void *private)
637{
638 unsigned long flags;
639 struct irq_desc *desc;
640 const char *p;
641 static const char none[] = "none";
642 void *data;
643 int i;
644
645 seq_printf(m, "%-5s %-7s %-15s %-18s %s\n", "virq", "hwirq",
646 "chip name", "chip data", "domain name");
647
648 for (i = 1; i < nr_irqs; i++) {
649 desc = irq_to_desc(i);
650 if (!desc)
651 continue;
652
653 raw_spin_lock_irqsave(&desc->lock, flags);
654
655 if (desc->action && desc->action->handler) {
656 struct irq_chip *chip;
657
658 seq_printf(m, "%5d ", i);
659 seq_printf(m, "0x%05lx ", desc->irq_data.hwirq);
660
661 chip = irq_desc_get_chip(desc);
662 if (chip && chip->name)
663 p = chip->name;
664 else
665 p = none;
666 seq_printf(m, "%-15s ", p);
667
668 data = irq_desc_get_chip_data(desc);
669 seq_printf(m, "0x%16p ", data);
670
671 if (desc->irq_data.domain && desc->irq_data.domain->of_node)
672 p = desc->irq_data.domain->of_node->full_name;
673 else
674 p = none;
675 seq_printf(m, "%s\n", p);
676 }
677
678 raw_spin_unlock_irqrestore(&desc->lock, flags);
679 }
680
681 return 0;
682}
141 683
684static int virq_debug_open(struct inode *inode, struct file *file)
685{
686 return single_open(file, virq_debug_show, inode->i_private);
687}
688
689static const struct file_operations virq_debug_fops = {
690 .open = virq_debug_open,
691 .read = seq_read,
692 .llseek = seq_lseek,
693 .release = single_release,
694};
695
696static int __init irq_debugfs_init(void)
697{
698 if (debugfs_create_file("irq_domain_mapping", S_IRUGO, NULL,
699 NULL, &virq_debug_fops) == NULL)
700 return -ENOMEM;
701
702 return 0;
703}
704__initcall(irq_debugfs_init);
705#endif /* CONFIG_IRQ_DOMAIN_DEBUG */
706
707int irq_domain_simple_map(struct irq_domain *d, unsigned int irq,
708 irq_hw_number_t hwirq)
709{
710 return 0;
711}
712
713/**
714 * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings
715 *
716 * Device Tree IRQ specifier translation function which works with one cell
717 * bindings where the cell value maps directly to the hwirq number.
718 */
719int irq_domain_xlate_onecell(struct irq_domain *d, struct device_node *ctrlr,
720 const u32 *intspec, unsigned int intsize,
721 unsigned long *out_hwirq, unsigned int *out_type)
722{
723 if (WARN_ON(intsize < 1))
724 return -EINVAL;
142 *out_hwirq = intspec[0]; 725 *out_hwirq = intspec[0];
143 *out_type = IRQ_TYPE_NONE; 726 *out_type = IRQ_TYPE_NONE;
144 if (intsize > 1)
145 *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
146 return 0; 727 return 0;
147} 728}
729EXPORT_SYMBOL_GPL(irq_domain_xlate_onecell);
148 730
149/** 731/**
150 * irq_domain_create_simple() - Set up a 'simple' translation range 732 * irq_domain_xlate_twocell() - Generic xlate for direct two cell bindings
733 *
734 * Device Tree IRQ specifier translation function which works with two cell
735 * bindings where the cell values map directly to the hwirq number
736 * and linux irq flags.
151 */ 737 */
152void irq_domain_add_simple(struct device_node *controller, int irq_base) 738int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr,
739 const u32 *intspec, unsigned int intsize,
740 irq_hw_number_t *out_hwirq, unsigned int *out_type)
153{ 741{
154 struct irq_domain *domain; 742 if (WARN_ON(intsize < 2))
155 743 return -EINVAL;
156 domain = kzalloc(sizeof(*domain), GFP_KERNEL); 744 *out_hwirq = intspec[0];
157 if (!domain) { 745 *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK;
158 WARN_ON(1); 746 return 0;
159 return; 747}
160 } 748EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell);
161 749
162 domain->irq_base = irq_base; 750/**
163 domain->of_node = of_node_get(controller); 751 * irq_domain_xlate_onetwocell() - Generic xlate for one or two cell bindings
164 domain->ops = &irq_domain_simple_ops; 752 *
165 irq_domain_add(domain); 753 * Device Tree IRQ specifier translation function which works with either one
754 * or two cell bindings where the cell values map directly to the hwirq number
755 * and linux irq flags.
756 *
757 * Note: don't use this function unless your interrupt controller explicitly
758 * supports both one and two cell bindings. For the majority of controllers
759 * the _onecell() or _twocell() variants above should be used.
760 */
761int irq_domain_xlate_onetwocell(struct irq_domain *d,
762 struct device_node *ctrlr,
763 const u32 *intspec, unsigned int intsize,
764 unsigned long *out_hwirq, unsigned int *out_type)
765{
766 if (WARN_ON(intsize < 1))
767 return -EINVAL;
768 *out_hwirq = intspec[0];
769 *out_type = (intsize > 1) ? intspec[1] : IRQ_TYPE_NONE;
770 return 0;
166} 771}
167EXPORT_SYMBOL_GPL(irq_domain_add_simple); 772EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell);
168 773
774const struct irq_domain_ops irq_domain_simple_ops = {
775 .map = irq_domain_simple_map,
776 .xlate = irq_domain_xlate_onetwocell,
777};
778EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
779
780#ifdef CONFIG_OF_IRQ
169void irq_domain_generate_simple(const struct of_device_id *match, 781void irq_domain_generate_simple(const struct of_device_id *match,
170 u64 phys_base, unsigned int irq_start) 782 u64 phys_base, unsigned int irq_start)
171{ 783{
172 struct device_node *node; 784 struct device_node *node;
173 pr_info("looking for phys_base=%llx, irq_start=%i\n", 785 pr_debug("looking for phys_base=%llx, irq_start=%i\n",
174 (unsigned long long) phys_base, (int) irq_start); 786 (unsigned long long) phys_base, (int) irq_start);
175 node = of_find_matching_node_by_address(NULL, match, phys_base); 787 node = of_find_matching_node_by_address(NULL, match, phys_base);
176 if (node) 788 if (node)
177 irq_domain_add_simple(node, irq_start); 789 irq_domain_add_legacy(node, 32, irq_start, 0,
178 else 790 &irq_domain_simple_ops, NULL);
179 pr_info("no node found\n");
180} 791}
181EXPORT_SYMBOL_GPL(irq_domain_generate_simple); 792EXPORT_SYMBOL_GPL(irq_domain_generate_simple);
182#endif /* CONFIG_OF_IRQ */ 793#endif
183
184struct irq_domain_ops irq_domain_simple_ops = {
185#ifdef CONFIG_OF_IRQ
186 .dt_translate = irq_domain_simple_dt_translate,
187#endif /* CONFIG_OF_IRQ */
188};
189EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
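Taken together, the new API lets an interrupt controller driver register a domain once and have the generic code handle virq allocation and reverse mapping. A rough sketch of a hypothetical driver using only the calls introduced above; the chip, controller size and probe glue are invented for the example, and a real driver would add mask/unmask callbacks and error handling.

#include <linux/errno.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/of.h>

#define FOO_NR_IRQS     32              /* hypothetical controller size */

static struct irq_chip foo_chip = {
        .name = "FOO",                  /* .irq_mask/.irq_unmask omitted */
};

static int foo_map(struct irq_domain *d, unsigned int virq,
                   irq_hw_number_t hwirq)
{
        irq_set_chip_and_handler(virq, &foo_chip, handle_level_irq);
        return 0;
}

static const struct irq_domain_ops foo_domain_ops = {
        .map    = foo_map,
        .xlate  = irq_domain_xlate_onecell,     /* one-cell DT specifier */
};

static struct irq_domain *foo_domain;

static int foo_init(struct device_node *np)
{
        unsigned int virq;

        foo_domain = irq_domain_add_linear(np, FOO_NR_IRQS,
                                           &foo_domain_ops, NULL);
        if (!foo_domain)
                return -ENOMEM;

        /* DT consumers reach this via irq_create_of_mapping(); a driver
         * can also create a mapping for a known hwirq directly. */
        virq = irq_create_mapping(foo_domain, 7);
        if (!virq)
                return -ENOMEM;

        /* In a chained flow handler the fast reverse map resolves hwirq->virq. */
        if (irq_linear_revmap(foo_domain, 7) != virq)
                return -EINVAL;

        return 0;
}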
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index a9a9dbe49fea..b0ccd1ac2d6a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -759,6 +759,13 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc,
759 return ret; 759 return ret;
760} 760}
761 761
762static void wake_threads_waitq(struct irq_desc *desc)
763{
764 if (atomic_dec_and_test(&desc->threads_active) &&
765 waitqueue_active(&desc->wait_for_threads))
766 wake_up(&desc->wait_for_threads);
767}
768
762/* 769/*
763 * Interrupt handler thread 770 * Interrupt handler thread
764 */ 771 */
@@ -771,57 +778,41 @@ static int irq_thread(void *data)
771 struct irq_desc *desc = irq_to_desc(action->irq); 778 struct irq_desc *desc = irq_to_desc(action->irq);
772 irqreturn_t (*handler_fn)(struct irq_desc *desc, 779 irqreturn_t (*handler_fn)(struct irq_desc *desc,
773 struct irqaction *action); 780 struct irqaction *action);
774 int wake;
775 781
776 if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD, 782 if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD,
777 &action->thread_flags)) 783 &action->thread_flags))
778 handler_fn = irq_forced_thread_fn; 784 handler_fn = irq_forced_thread_fn;
779 else 785 else
780 handler_fn = irq_thread_fn; 786 handler_fn = irq_thread_fn;
781 787
782 sched_setscheduler(current, SCHED_FIFO, &param); 788 sched_setscheduler(current, SCHED_FIFO, &param);
783 current->irqaction = action; 789 current->irq_thread = 1;
784 790
785 while (!irq_wait_for_interrupt(action)) { 791 while (!irq_wait_for_interrupt(action)) {
792 irqreturn_t action_ret;
786 793
787 irq_thread_check_affinity(desc, action); 794 irq_thread_check_affinity(desc, action);
788 795
789 atomic_inc(&desc->threads_active); 796 action_ret = handler_fn(desc, action);
790 797 if (!noirqdebug)
791 raw_spin_lock_irq(&desc->lock); 798 note_interrupt(action->irq, desc, action_ret);
792 if (unlikely(irqd_irq_disabled(&desc->irq_data))) {
793 /*
794 * CHECKME: We might need a dedicated
795 * IRQ_THREAD_PENDING flag here, which
796 * retriggers the thread in check_irq_resend()
797 * but AFAICT IRQS_PENDING should be fine as it
798 * retriggers the interrupt itself --- tglx
799 */
800 desc->istate |= IRQS_PENDING;
801 raw_spin_unlock_irq(&desc->lock);
802 } else {
803 irqreturn_t action_ret;
804
805 raw_spin_unlock_irq(&desc->lock);
806 action_ret = handler_fn(desc, action);
807 if (!noirqdebug)
808 note_interrupt(action->irq, desc, action_ret);
809 }
810 799
811 wake = atomic_dec_and_test(&desc->threads_active); 800 wake_threads_waitq(desc);
812
813 if (wake && waitqueue_active(&desc->wait_for_threads))
814 wake_up(&desc->wait_for_threads);
815 } 801 }
816 802
817 /* Prevent a stale desc->threads_oneshot */
818 irq_finalize_oneshot(desc, action, true);
819
820 /* 803 /*
821 * Clear irqaction. Otherwise exit_irq_thread() would make 804 * This is the regular exit path. __free_irq() is stopping the
805 * thread via kthread_stop() after calling
806 * synchronize_irq(). So neither IRQTF_RUNTHREAD nor the
807 * oneshot mask bit can be set. We cannot verify that as we
808 * cannot touch the oneshot mask at this point anymore as
809 * __setup_irq() might have given out currents thread_mask
810 * again.
811 *
812 * Clear irq_thread. Otherwise exit_irq_thread() would make
822 * fuzz about an active irq thread going into nirvana. 813 * fuzz about an active irq thread going into nirvana.
823 */ 814 */
824 current->irqaction = NULL; 815 current->irq_thread = 0;
825 return 0; 816 return 0;
826} 817}
827 818
@@ -832,27 +823,28 @@ void exit_irq_thread(void)
832{ 823{
833 struct task_struct *tsk = current; 824 struct task_struct *tsk = current;
834 struct irq_desc *desc; 825 struct irq_desc *desc;
826 struct irqaction *action;
835 827
836 if (!tsk->irqaction) 828 if (!tsk->irq_thread)
837 return; 829 return;
838 830
831 action = kthread_data(tsk);
832
839 printk(KERN_ERR 833 printk(KERN_ERR
840 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", 834 "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
841 tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); 835 tsk->comm ? tsk->comm : "", tsk->pid, action->irq);
842 836
843 desc = irq_to_desc(tsk->irqaction->irq); 837 desc = irq_to_desc(action->irq);
844 838
845 /* 839 /*
846 * Prevent a stale desc->threads_oneshot. Must be called 840 * If IRQTF_RUNTHREAD is set, we need to decrement
847 * before setting the IRQTF_DIED flag. 841 * desc->threads_active and wake possible waiters.
848 */ 842 */
849 irq_finalize_oneshot(desc, tsk->irqaction, true); 843 if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags))
844 wake_threads_waitq(desc);
850 845
851 /* 846 /* Prevent a stale desc->threads_oneshot */
852 * Set the THREAD DIED flag to prevent further wakeups of the 847 irq_finalize_oneshot(desc, action, true);
853 * soon to be gone threaded handler.
854 */
855 set_bit(IRQTF_DIED, &tsk->irqaction->flags);
856} 848}
857 849
858static void irq_setup_forced_threading(struct irqaction *new) 850static void irq_setup_forced_threading(struct irqaction *new)
@@ -985,6 +977,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
985 977
986 /* add new interrupt at end of irq queue */ 978 /* add new interrupt at end of irq queue */
987 do { 979 do {
980 /*
981 * Or all existing action->thread_mask bits,
982 * so we can find the next zero bit for this
983 * new action.
984 */
988 thread_mask |= old->thread_mask; 985 thread_mask |= old->thread_mask;
989 old_ptr = &old->next; 986 old_ptr = &old->next;
990 old = *old_ptr; 987 old = *old_ptr;
@@ -993,14 +990,41 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
993 } 990 }
994 991
995 /* 992 /*
996 * Setup the thread mask for this irqaction. Unlikely to have 993 * Setup the thread mask for this irqaction for ONESHOT. For
997 * 32 resp 64 irqs sharing one line, but who knows. 994 * !ONESHOT irqs the thread mask is 0 so we can avoid a
995 * conditional in irq_wake_thread().
998 */ 996 */
999 if (new->flags & IRQF_ONESHOT && thread_mask == ~0UL) { 997 if (new->flags & IRQF_ONESHOT) {
1000 ret = -EBUSY; 998 /*
1001 goto out_mask; 999 * Unlikely to have 32 resp 64 irqs sharing one line,
1000 * but who knows.
1001 */
1002 if (thread_mask == ~0UL) {
1003 ret = -EBUSY;
1004 goto out_mask;
1005 }
1006 /*
1007 * The thread_mask for the action is or'ed to
 1008 * desc->threads_oneshot to indicate that the
1009 * IRQF_ONESHOT thread handler has been woken, but not
1010 * yet finished. The bit is cleared when a thread
1011 * completes. When all threads of a shared interrupt
 1012 * line have completed desc->threads_oneshot becomes
1013 * zero and the interrupt line is unmasked. See
1014 * handle.c:irq_wake_thread() for further information.
1015 *
1016 * If no thread is woken by primary (hard irq context)
 1017 * interrupt handlers, then desc->threads_oneshot is
1018 * also checked for zero to unmask the irq line in the
1019 * affected hard irq flow handlers
1020 * (handle_[fasteoi|level]_irq).
1021 *
1022 * The new action gets the first zero bit of
1023 * thread_mask assigned. See the loop above which or's
1024 * all existing action->thread_mask bits.
1025 */
1026 new->thread_mask = 1 << ffz(thread_mask);
1002 } 1027 }
1003 new->thread_mask = 1 << ffz(thread_mask);
1004 1028
1005 if (!shared) { 1029 if (!shared) {
1006 init_waitqueue_head(&desc->wait_for_threads); 1030 init_waitqueue_head(&desc->wait_for_threads);
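
The long comment above boils down to a per-line bit allocator: each IRQF_ONESHOT action sharing the line claims the first free bit of the accumulated mask, which caps sharers at BITS_PER_LONG and explains the -EBUSY check. A compressed illustration (not part of the patch):

    unsigned long thread_mask = 0;            /* OR of existing actions' masks */

    /* 1st ONESHOT action: ffz(0x0) == 0  ->  thread_mask bit 0x1 */
    /* 2nd ONESHOT action: ffz(0x1) == 1  ->  thread_mask bit 0x2 */
    /* 3rd ONESHOT action: ffz(0x3) == 2  ->  thread_mask bit 0x4 */
    new->thread_mask = 1 << ffz(thread_mask);
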
@@ -1027,7 +1051,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1027 desc->istate |= IRQS_ONESHOT; 1051 desc->istate |= IRQS_ONESHOT;
1028 1052
1029 if (irq_settings_can_autoenable(desc)) 1053 if (irq_settings_can_autoenable(desc))
1030 irq_startup(desc); 1054 irq_startup(desc, true);
1031 else 1055 else
1032 /* Undo nested disables: */ 1056 /* Undo nested disables: */
1033 desc->depth = 1; 1057 desc->depth = 1;
@@ -1103,8 +1127,7 @@ out_thread:
1103 struct task_struct *t = new->thread; 1127 struct task_struct *t = new->thread;
1104 1128
1105 new->thread = NULL; 1129 new->thread = NULL;
1106 if (likely(!test_bit(IRQTF_DIED, &new->thread_flags))) 1130 kthread_stop(t);
1107 kthread_stop(t);
1108 put_task_struct(t); 1131 put_task_struct(t);
1109 } 1132 }
1110out_mput: 1133out_mput:
@@ -1214,8 +1237,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1214#endif 1237#endif
1215 1238
1216 if (action->thread) { 1239 if (action->thread) {
1217 if (!test_bit(IRQTF_DIED, &action->thread_flags)) 1240 kthread_stop(action->thread);
1218 kthread_stop(action->thread);
1219 put_task_struct(action->thread); 1241 put_task_struct(action->thread);
1220 } 1242 }
1221 1243
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 01d3b70fc98a..43049192b5ec 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -12,7 +12,7 @@
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/sort.h> 13#include <linux/sort.h>
14#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/jump_label.h> 15#include <linux/static_key.h>
16 16
17#ifdef HAVE_JUMP_LABEL 17#ifdef HAVE_JUMP_LABEL
18 18
@@ -29,11 +29,6 @@ void jump_label_unlock(void)
29 mutex_unlock(&jump_label_mutex); 29 mutex_unlock(&jump_label_mutex);
30} 30}
31 31
32bool jump_label_enabled(struct jump_label_key *key)
33{
34 return !!atomic_read(&key->enabled);
35}
36
37static int jump_label_cmp(const void *a, const void *b) 32static int jump_label_cmp(const void *a, const void *b)
38{ 33{
39 const struct jump_entry *jea = a; 34 const struct jump_entry *jea = a;
@@ -58,56 +53,66 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
58 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); 53 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
59} 54}
60 55
61static void jump_label_update(struct jump_label_key *key, int enable); 56static void jump_label_update(struct static_key *key, int enable);
62 57
63void jump_label_inc(struct jump_label_key *key) 58void static_key_slow_inc(struct static_key *key)
64{ 59{
65 if (atomic_inc_not_zero(&key->enabled)) 60 if (atomic_inc_not_zero(&key->enabled))
66 return; 61 return;
67 62
68 jump_label_lock(); 63 jump_label_lock();
69 if (atomic_read(&key->enabled) == 0) 64 if (atomic_read(&key->enabled) == 0) {
70 jump_label_update(key, JUMP_LABEL_ENABLE); 65 if (!jump_label_get_branch_default(key))
66 jump_label_update(key, JUMP_LABEL_ENABLE);
67 else
68 jump_label_update(key, JUMP_LABEL_DISABLE);
69 }
71 atomic_inc(&key->enabled); 70 atomic_inc(&key->enabled);
72 jump_label_unlock(); 71 jump_label_unlock();
73} 72}
74EXPORT_SYMBOL_GPL(jump_label_inc); 73EXPORT_SYMBOL_GPL(static_key_slow_inc);
75 74
76static void __jump_label_dec(struct jump_label_key *key, 75static void __static_key_slow_dec(struct static_key *key,
77 unsigned long rate_limit, struct delayed_work *work) 76 unsigned long rate_limit, struct delayed_work *work)
78{ 77{
79 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) 78 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {
79 WARN(atomic_read(&key->enabled) < 0,
80 "jump label: negative count!\n");
80 return; 81 return;
82 }
81 83
82 if (rate_limit) { 84 if (rate_limit) {
83 atomic_inc(&key->enabled); 85 atomic_inc(&key->enabled);
84 schedule_delayed_work(work, rate_limit); 86 schedule_delayed_work(work, rate_limit);
85 } else 87 } else {
86 jump_label_update(key, JUMP_LABEL_DISABLE); 88 if (!jump_label_get_branch_default(key))
87 89 jump_label_update(key, JUMP_LABEL_DISABLE);
90 else
91 jump_label_update(key, JUMP_LABEL_ENABLE);
92 }
88 jump_label_unlock(); 93 jump_label_unlock();
89} 94}
90EXPORT_SYMBOL_GPL(jump_label_dec);
91 95
92static void jump_label_update_timeout(struct work_struct *work) 96static void jump_label_update_timeout(struct work_struct *work)
93{ 97{
94 struct jump_label_key_deferred *key = 98 struct static_key_deferred *key =
95 container_of(work, struct jump_label_key_deferred, work.work); 99 container_of(work, struct static_key_deferred, work.work);
96 __jump_label_dec(&key->key, 0, NULL); 100 __static_key_slow_dec(&key->key, 0, NULL);
97} 101}
98 102
99void jump_label_dec(struct jump_label_key *key) 103void static_key_slow_dec(struct static_key *key)
100{ 104{
101 __jump_label_dec(key, 0, NULL); 105 __static_key_slow_dec(key, 0, NULL);
102} 106}
107EXPORT_SYMBOL_GPL(static_key_slow_dec);
103 108
104void jump_label_dec_deferred(struct jump_label_key_deferred *key) 109void static_key_slow_dec_deferred(struct static_key_deferred *key)
105{ 110{
106 __jump_label_dec(&key->key, key->timeout, &key->work); 111 __static_key_slow_dec(&key->key, key->timeout, &key->work);
107} 112}
113EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
108 114
109 115void jump_label_rate_limit(struct static_key_deferred *key,
110void jump_label_rate_limit(struct jump_label_key_deferred *key,
111 unsigned long rl) 116 unsigned long rl)
112{ 117{
113 key->timeout = rl; 118 key->timeout = rl;
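
For reference, the renamed interface is consumed roughly as below; STATIC_KEY_INIT_FALSE and static_key_false() are assumed here to come from the same <linux/static_key.h> rework and are shown only as a usage sketch:

    static struct static_key my_key = STATIC_KEY_INIT_FALSE;

    if (static_key_false(&my_key))      /* default false: fast path stays a nop */
            do_rare_thing();            /* hypothetical slow-path call          */

    static_key_slow_inc(&my_key);       /* first increment patches the sites live */
    static_key_slow_dec(&my_key);       /* last decrement patches them back       */
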
@@ -150,7 +155,7 @@ void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry
150 arch_jump_label_transform(entry, type); 155 arch_jump_label_transform(entry, type);
151} 156}
152 157
153static void __jump_label_update(struct jump_label_key *key, 158static void __jump_label_update(struct static_key *key,
154 struct jump_entry *entry, 159 struct jump_entry *entry,
155 struct jump_entry *stop, int enable) 160 struct jump_entry *stop, int enable)
156{ 161{
@@ -167,27 +172,40 @@ static void __jump_label_update(struct jump_label_key *key,
167 } 172 }
168} 173}
169 174
175static enum jump_label_type jump_label_type(struct static_key *key)
176{
177 bool true_branch = jump_label_get_branch_default(key);
178 bool state = static_key_enabled(key);
179
180 if ((!true_branch && state) || (true_branch && !state))
181 return JUMP_LABEL_ENABLE;
182
183 return JUMP_LABEL_DISABLE;
184}
185
170void __init jump_label_init(void) 186void __init jump_label_init(void)
171{ 187{
172 struct jump_entry *iter_start = __start___jump_table; 188 struct jump_entry *iter_start = __start___jump_table;
173 struct jump_entry *iter_stop = __stop___jump_table; 189 struct jump_entry *iter_stop = __stop___jump_table;
174 struct jump_label_key *key = NULL; 190 struct static_key *key = NULL;
175 struct jump_entry *iter; 191 struct jump_entry *iter;
176 192
177 jump_label_lock(); 193 jump_label_lock();
178 jump_label_sort_entries(iter_start, iter_stop); 194 jump_label_sort_entries(iter_start, iter_stop);
179 195
180 for (iter = iter_start; iter < iter_stop; iter++) { 196 for (iter = iter_start; iter < iter_stop; iter++) {
181 struct jump_label_key *iterk; 197 struct static_key *iterk;
182 198
183 iterk = (struct jump_label_key *)(unsigned long)iter->key; 199 iterk = (struct static_key *)(unsigned long)iter->key;
184 arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? 200 arch_jump_label_transform_static(iter, jump_label_type(iterk));
185 JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE);
186 if (iterk == key) 201 if (iterk == key)
187 continue; 202 continue;
188 203
189 key = iterk; 204 key = iterk;
190 key->entries = iter; 205 /*
206 * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
207 */
208 *((unsigned long *)&key->entries) += (unsigned long)iter;
191#ifdef CONFIG_MODULES 209#ifdef CONFIG_MODULES
192 key->next = NULL; 210 key->next = NULL;
193#endif 211#endif
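
Two details in the hunk above are easy to miss: jump_label_type() is just an XOR of the compile-time default branch with the runtime enabled state, and the "+=" on key->entries is there to keep the JUMP_LABEL_TRUE_BRANCH flag carried in the pointer's low bit. A one-line restatement of the first point (illustration only):

    return (true_branch ^ state) ? JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE;
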
@@ -197,8 +215,8 @@ void __init jump_label_init(void)
197 215
198#ifdef CONFIG_MODULES 216#ifdef CONFIG_MODULES
199 217
200struct jump_label_mod { 218struct static_key_mod {
201 struct jump_label_mod *next; 219 struct static_key_mod *next;
202 struct jump_entry *entries; 220 struct jump_entry *entries;
203 struct module *mod; 221 struct module *mod;
204}; 222};
@@ -218,9 +236,9 @@ static int __jump_label_mod_text_reserved(void *start, void *end)
218 start, end); 236 start, end);
219} 237}
220 238
221static void __jump_label_mod_update(struct jump_label_key *key, int enable) 239static void __jump_label_mod_update(struct static_key *key, int enable)
222{ 240{
223 struct jump_label_mod *mod = key->next; 241 struct static_key_mod *mod = key->next;
224 242
225 while (mod) { 243 while (mod) {
226 struct module *m = mod->mod; 244 struct module *m = mod->mod;
@@ -251,11 +269,7 @@ void jump_label_apply_nops(struct module *mod)
251 return; 269 return;
252 270
253 for (iter = iter_start; iter < iter_stop; iter++) { 271 for (iter = iter_start; iter < iter_stop; iter++) {
254 struct jump_label_key *iterk; 272 arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE);
255
256 iterk = (struct jump_label_key *)(unsigned long)iter->key;
257 arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ?
258 JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE);
259 } 273 }
260} 274}
261 275
@@ -264,8 +278,8 @@ static int jump_label_add_module(struct module *mod)
264 struct jump_entry *iter_start = mod->jump_entries; 278 struct jump_entry *iter_start = mod->jump_entries;
265 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; 279 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
266 struct jump_entry *iter; 280 struct jump_entry *iter;
267 struct jump_label_key *key = NULL; 281 struct static_key *key = NULL;
268 struct jump_label_mod *jlm; 282 struct static_key_mod *jlm;
269 283
270 /* if the module doesn't have jump label entries, just return */ 284 /* if the module doesn't have jump label entries, just return */
271 if (iter_start == iter_stop) 285 if (iter_start == iter_stop)
@@ -274,28 +288,30 @@ static int jump_label_add_module(struct module *mod)
274 jump_label_sort_entries(iter_start, iter_stop); 288 jump_label_sort_entries(iter_start, iter_stop);
275 289
276 for (iter = iter_start; iter < iter_stop; iter++) { 290 for (iter = iter_start; iter < iter_stop; iter++) {
277 if (iter->key == (jump_label_t)(unsigned long)key) 291 struct static_key *iterk;
278 continue;
279 292
280 key = (struct jump_label_key *)(unsigned long)iter->key; 293 iterk = (struct static_key *)(unsigned long)iter->key;
294 if (iterk == key)
295 continue;
281 296
297 key = iterk;
282 if (__module_address(iter->key) == mod) { 298 if (__module_address(iter->key) == mod) {
283 atomic_set(&key->enabled, 0); 299 /*
284 key->entries = iter; 300 * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH.
301 */
302 *((unsigned long *)&key->entries) += (unsigned long)iter;
285 key->next = NULL; 303 key->next = NULL;
286 continue; 304 continue;
287 } 305 }
288 306 jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL);
289 jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL);
290 if (!jlm) 307 if (!jlm)
291 return -ENOMEM; 308 return -ENOMEM;
292
293 jlm->mod = mod; 309 jlm->mod = mod;
294 jlm->entries = iter; 310 jlm->entries = iter;
295 jlm->next = key->next; 311 jlm->next = key->next;
296 key->next = jlm; 312 key->next = jlm;
297 313
298 if (jump_label_enabled(key)) 314 if (jump_label_type(key) == JUMP_LABEL_ENABLE)
299 __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE); 315 __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE);
300 } 316 }
301 317
@@ -307,14 +323,14 @@ static void jump_label_del_module(struct module *mod)
307 struct jump_entry *iter_start = mod->jump_entries; 323 struct jump_entry *iter_start = mod->jump_entries;
308 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; 324 struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
309 struct jump_entry *iter; 325 struct jump_entry *iter;
310 struct jump_label_key *key = NULL; 326 struct static_key *key = NULL;
311 struct jump_label_mod *jlm, **prev; 327 struct static_key_mod *jlm, **prev;
312 328
313 for (iter = iter_start; iter < iter_stop; iter++) { 329 for (iter = iter_start; iter < iter_stop; iter++) {
314 if (iter->key == (jump_label_t)(unsigned long)key) 330 if (iter->key == (jump_label_t)(unsigned long)key)
315 continue; 331 continue;
316 332
317 key = (struct jump_label_key *)(unsigned long)iter->key; 333 key = (struct static_key *)(unsigned long)iter->key;
318 334
319 if (__module_address(iter->key) == mod) 335 if (__module_address(iter->key) == mod)
320 continue; 336 continue;
@@ -416,12 +432,13 @@ int jump_label_text_reserved(void *start, void *end)
416 return ret; 432 return ret;
417} 433}
418 434
419static void jump_label_update(struct jump_label_key *key, int enable) 435static void jump_label_update(struct static_key *key, int enable)
420{ 436{
421 struct jump_entry *entry = key->entries, *stop = __stop___jump_table; 437 struct jump_entry *stop = __stop___jump_table;
438 struct jump_entry *entry = jump_label_get_entries(key);
422 439
423#ifdef CONFIG_MODULES 440#ifdef CONFIG_MODULES
424 struct module *mod = __module_address((jump_label_t)key); 441 struct module *mod = __module_address((unsigned long)key);
425 442
426 __jump_label_mod_update(key, enable); 443 __jump_label_mod_update(key, enable);
427 444
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 7b0886786701..4e2e472f6aeb 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -37,7 +37,6 @@
37#include <asm/page.h> 37#include <asm/page.h>
38#include <asm/uaccess.h> 38#include <asm/uaccess.h>
39#include <asm/io.h> 39#include <asm/io.h>
40#include <asm/system.h>
41#include <asm/sections.h> 40#include <asm/sections.h>
42 41
43/* Per cpu memory for storing cpu states in case of system crash. */ 42/* Per cpu memory for storing cpu states in case of system crash. */
@@ -1359,6 +1358,10 @@ static int __init parse_crashkernel_simple(char *cmdline,
1359 1358
1360 if (*cur == '@') 1359 if (*cur == '@')
1361 *crash_base = memparse(cur+1, &cur); 1360 *crash_base = memparse(cur+1, &cur);
1361 else if (*cur != ' ' && *cur != '\0') {
1362 pr_warning("crashkernel: unrecognized char\n");
1363 return -EINVAL;
1364 }
1362 1365
1363 return 0; 1366 return 0;
1364} 1367}
@@ -1462,7 +1465,9 @@ static int __init crash_save_vmcoreinfo_init(void)
1462 1465
1463 VMCOREINFO_SYMBOL(init_uts_ns); 1466 VMCOREINFO_SYMBOL(init_uts_ns);
1464 VMCOREINFO_SYMBOL(node_online_map); 1467 VMCOREINFO_SYMBOL(node_online_map);
1468#ifdef CONFIG_MMU
1465 VMCOREINFO_SYMBOL(swapper_pg_dir); 1469 VMCOREINFO_SYMBOL(swapper_pg_dir);
1470#endif
1466 VMCOREINFO_SYMBOL(_stext); 1471 VMCOREINFO_SYMBOL(_stext);
1467 VMCOREINFO_SYMBOL(vmlist); 1472 VMCOREINFO_SYMBOL(vmlist);
1468 1473
@@ -1546,13 +1551,13 @@ int kernel_kexec(void)
1546 if (error) 1551 if (error)
1547 goto Resume_console; 1552 goto Resume_console;
1548 /* At this point, dpm_suspend_start() has been called, 1553 /* At this point, dpm_suspend_start() has been called,
1549 * but *not* dpm_suspend_noirq(). We *must* call 1554 * but *not* dpm_suspend_end(). We *must* call
1550 * dpm_suspend_noirq() now. Otherwise, drivers for 1555 * dpm_suspend_end() now. Otherwise, drivers for
1551 * some devices (e.g. interrupt controllers) become 1556 * some devices (e.g. interrupt controllers) become
1552 * desynchronized with the actual state of the 1557 * desynchronized with the actual state of the
1553 * hardware at resume time, and evil weirdness ensues. 1558 * hardware at resume time, and evil weirdness ensues.
1554 */ 1559 */
1555 error = dpm_suspend_noirq(PMSG_FREEZE); 1560 error = dpm_suspend_end(PMSG_FREEZE);
1556 if (error) 1561 if (error)
1557 goto Resume_devices; 1562 goto Resume_devices;
1558 error = disable_nonboot_cpus(); 1563 error = disable_nonboot_cpus();
@@ -1579,7 +1584,7 @@ int kernel_kexec(void)
1579 local_irq_enable(); 1584 local_irq_enable();
1580 Enable_cpus: 1585 Enable_cpus:
1581 enable_nonboot_cpus(); 1586 enable_nonboot_cpus();
1582 dpm_resume_noirq(PMSG_RESTORE); 1587 dpm_resume_start(PMSG_RESTORE);
1583 Resume_devices: 1588 Resume_devices:
1584 dpm_resume_end(PMSG_RESTORE); 1589 dpm_resume_end(PMSG_RESTORE);
1585 Resume_console: 1590 Resume_console:
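
The parse_crashkernel_simple() change above tightens the accepted grammar to crashkernel=size[@offset]; anything other than a space or end of string after the parsed values now fails. Illustrative command-line forms (not taken from the patch):

    crashkernel=128M@16M    -> reserve 128M at physical address 16M
    crashkernel=64M         -> size only, placement chosen by the kernel
    crashkernel=64Mx        -> now rejected: "crashkernel: unrecognized char"
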
diff --git a/kernel/kmod.c b/kernel/kmod.c
index a0a88543934e..957a7aab8ebc 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -60,6 +60,43 @@ static DECLARE_RWSEM(umhelper_sem);
60*/ 60*/
61char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; 61char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
62 62
63static void free_modprobe_argv(struct subprocess_info *info)
64{
65 kfree(info->argv[3]); /* check call_modprobe() */
66 kfree(info->argv);
67}
68
69static int call_modprobe(char *module_name, int wait)
70{
71 static char *envp[] = {
72 "HOME=/",
73 "TERM=linux",
74 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
75 NULL
76 };
77
78 char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL);
79 if (!argv)
80 goto out;
81
82 module_name = kstrdup(module_name, GFP_KERNEL);
83 if (!module_name)
84 goto free_argv;
85
86 argv[0] = modprobe_path;
87 argv[1] = "-q";
88 argv[2] = "--";
89 argv[3] = module_name; /* check free_modprobe_argv() */
90 argv[4] = NULL;
91
92 return call_usermodehelper_fns(modprobe_path, argv, envp,
93 wait | UMH_KILLABLE, NULL, free_modprobe_argv, NULL);
94free_argv:
95 kfree(argv);
96out:
97 return -ENOMEM;
98}
99
63/** 100/**
64 * __request_module - try to load a kernel module 101 * __request_module - try to load a kernel module
65 * @wait: wait (or not) for the operation to complete 102 * @wait: wait (or not) for the operation to complete
@@ -81,11 +118,6 @@ int __request_module(bool wait, const char *fmt, ...)
81 char module_name[MODULE_NAME_LEN]; 118 char module_name[MODULE_NAME_LEN];
82 unsigned int max_modprobes; 119 unsigned int max_modprobes;
83 int ret; 120 int ret;
84 char *argv[] = { modprobe_path, "-q", "--", module_name, NULL };
85 static char *envp[] = { "HOME=/",
86 "TERM=linux",
87 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
88 NULL };
89 static atomic_t kmod_concurrent = ATOMIC_INIT(0); 121 static atomic_t kmod_concurrent = ATOMIC_INIT(0);
90#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ 122#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
91 static int kmod_loop_msg; 123 static int kmod_loop_msg;
@@ -128,9 +160,7 @@ int __request_module(bool wait, const char *fmt, ...)
128 160
129 trace_module_request(module_name, wait, _RET_IP_); 161 trace_module_request(module_name, wait, _RET_IP_);
130 162
131 ret = call_usermodehelper_fns(modprobe_path, argv, envp, 163 ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
132 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC,
133 NULL, NULL, NULL);
134 164
135 atomic_dec(&kmod_concurrent); 165 atomic_dec(&kmod_concurrent);
136 return ret; 166 return ret;
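
call_modprobe() moves argv off the stack because, with UMH_KILLABLE, __request_module() may return before the helper has run, so the vector and the duplicated module name must be released by the cleanup callback rather than by the caller. A sketch of the same pattern for a hypothetical helper (every name below is illustrative, not from the patch):

    static void my_cleanup(struct subprocess_info *info)
    {
            kfree(info->argv[1]);               /* the kstrdup'ed argument */
            kfree(info->argv);
    }

    static int run_my_helper(const char *arg, int wait)
    {
            static char *envp[] = { "HOME=/",
                    "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
            char **argv = kmalloc(sizeof(char *[3]), GFP_KERNEL);

            if (!argv)
                    return -ENOMEM;
            argv[1] = kstrdup(arg, GFP_KERNEL);
            if (!argv[1]) {
                    kfree(argv);
                    return -ENOMEM;
            }
            argv[0] = "/sbin/my-helper";
            argv[2] = NULL;

            return call_usermodehelper_fns("/sbin/my-helper", argv, envp,
                                           wait | UMH_KILLABLE,
                                           NULL, my_cleanup, NULL);
    }
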
@@ -188,7 +218,7 @@ static int ____call_usermodehelper(void *data)
188 /* Exec failed? */ 218 /* Exec failed? */
189fail: 219fail:
190 sub_info->retval = retval; 220 sub_info->retval = retval;
191 do_exit(0); 221 return 0;
192} 222}
193 223
194void call_usermodehelper_freeinfo(struct subprocess_info *info) 224void call_usermodehelper_freeinfo(struct subprocess_info *info)
@@ -199,6 +229,19 @@ void call_usermodehelper_freeinfo(struct subprocess_info *info)
199} 229}
200EXPORT_SYMBOL(call_usermodehelper_freeinfo); 230EXPORT_SYMBOL(call_usermodehelper_freeinfo);
201 231
232static void umh_complete(struct subprocess_info *sub_info)
233{
234 struct completion *comp = xchg(&sub_info->complete, NULL);
235 /*
236 * See call_usermodehelper_exec(). If xchg() returns NULL
237 * we own sub_info, the UMH_KILLABLE caller has gone away.
238 */
239 if (comp)
240 complete(comp);
241 else
242 call_usermodehelper_freeinfo(sub_info);
243}
244
202/* Keventd can't block, but this (a child) can. */ 245/* Keventd can't block, but this (a child) can. */
203static int wait_for_helper(void *data) 246static int wait_for_helper(void *data)
204{ 247{
@@ -235,7 +278,7 @@ static int wait_for_helper(void *data)
235 sub_info->retval = ret; 278 sub_info->retval = ret;
236 } 279 }
237 280
238 complete(sub_info->complete); 281 umh_complete(sub_info);
239 return 0; 282 return 0;
240} 283}
241 284
@@ -244,7 +287,7 @@ static void __call_usermodehelper(struct work_struct *work)
244{ 287{
245 struct subprocess_info *sub_info = 288 struct subprocess_info *sub_info =
246 container_of(work, struct subprocess_info, work); 289 container_of(work, struct subprocess_info, work);
247 enum umh_wait wait = sub_info->wait; 290 int wait = sub_info->wait & ~UMH_KILLABLE;
248 pid_t pid; 291 pid_t pid;
249 292
250 /* CLONE_VFORK: wait until the usermode helper has execve'd 293 /* CLONE_VFORK: wait until the usermode helper has execve'd
@@ -269,7 +312,7 @@ static void __call_usermodehelper(struct work_struct *work)
269 case UMH_WAIT_EXEC: 312 case UMH_WAIT_EXEC:
270 if (pid < 0) 313 if (pid < 0)
271 sub_info->retval = pid; 314 sub_info->retval = pid;
272 complete(sub_info->complete); 315 umh_complete(sub_info);
273 } 316 }
274} 317}
275 318
@@ -435,8 +478,7 @@ EXPORT_SYMBOL(call_usermodehelper_setfns);
435 * asynchronously if wait is not set, and runs as a child of keventd. 478 * asynchronously if wait is not set, and runs as a child of keventd.
436 * (ie. it runs with full root capabilities). 479 * (ie. it runs with full root capabilities).
437 */ 480 */
438int call_usermodehelper_exec(struct subprocess_info *sub_info, 481int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
439 enum umh_wait wait)
440{ 482{
441 DECLARE_COMPLETION_ONSTACK(done); 483 DECLARE_COMPLETION_ONSTACK(done);
442 int retval = 0; 484 int retval = 0;
@@ -456,9 +498,21 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
456 queue_work(khelper_wq, &sub_info->work); 498 queue_work(khelper_wq, &sub_info->work);
457 if (wait == UMH_NO_WAIT) /* task has freed sub_info */ 499 if (wait == UMH_NO_WAIT) /* task has freed sub_info */
458 goto unlock; 500 goto unlock;
501
502 if (wait & UMH_KILLABLE) {
503 retval = wait_for_completion_killable(&done);
504 if (!retval)
505 goto wait_done;
506
507 /* umh_complete() will see NULL and free sub_info */
508 if (xchg(&sub_info->complete, NULL))
509 goto unlock;
510 /* fallthrough, umh_complete() was already called */
511 }
512
459 wait_for_completion(&done); 513 wait_for_completion(&done);
514wait_done:
460 retval = sub_info->retval; 515 retval = sub_info->retval;
461
462out: 516out:
463 call_usermodehelper_freeinfo(sub_info); 517 call_usermodehelper_freeinfo(sub_info);
464unlock: 518unlock:
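
Taken together, umh_complete() above and this killable wait form a single-winner handoff on sub_info->complete: both sides race to xchg() the pointer to NULL, and exactly one of them sees the old non-NULL value. In outline (annotation only, the code itself is in the hunks above):

    /* helper wins the xchg(): it calls complete(); the caller then reads
     * retval and frees sub_info on the normal path.
     * killed caller wins: it returns the interrupted-wait error right away,
     * and the helper, seeing NULL, frees sub_info itself via
     * call_usermodehelper_freeinfo(). */
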
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 95dd7212e610..c62b8546cc90 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1077,6 +1077,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
1077 /* Early boot. kretprobe_table_locks not yet initialized. */ 1077 /* Early boot. kretprobe_table_locks not yet initialized. */
1078 return; 1078 return;
1079 1079
1080 INIT_HLIST_HEAD(&empty_rp);
1080 hash = hash_ptr(tk, KPROBE_HASH_BITS); 1081 hash = hash_ptr(tk, KPROBE_HASH_BITS);
1081 head = &kretprobe_inst_table[hash]; 1082 head = &kretprobe_inst_table[hash];
1082 kretprobe_table_lock(hash, &flags); 1083 kretprobe_table_lock(hash, &flags);
@@ -1085,7 +1086,6 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
1085 recycle_rp_inst(ri, &empty_rp); 1086 recycle_rp_inst(ri, &empty_rp);
1086 } 1087 }
1087 kretprobe_table_unlock(hash, &flags); 1088 kretprobe_table_unlock(hash, &flags);
1088 INIT_HLIST_HEAD(&empty_rp);
1089 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { 1089 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
1090 hlist_del(&ri->hlist); 1090 hlist_del(&ri->hlist);
1091 kfree(ri); 1091 kfree(ri);
@@ -1334,8 +1334,10 @@ int __kprobes register_kprobe(struct kprobe *p)
1334 if (!kernel_text_address((unsigned long) p->addr) || 1334 if (!kernel_text_address((unsigned long) p->addr) ||
1335 in_kprobes_functions((unsigned long) p->addr) || 1335 in_kprobes_functions((unsigned long) p->addr) ||
1336 ftrace_text_reserved(p->addr, p->addr) || 1336 ftrace_text_reserved(p->addr, p->addr) ||
1337 jump_label_text_reserved(p->addr, p->addr)) 1337 jump_label_text_reserved(p->addr, p->addr)) {
1338 goto fail_with_jump_label; 1338 ret = -EINVAL;
1339 goto cannot_probe;
1340 }
1339 1341
1340 /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ 1342 /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
1341 p->flags &= KPROBE_FLAG_DISABLED; 1343 p->flags &= KPROBE_FLAG_DISABLED;
@@ -1352,7 +1354,7 @@ int __kprobes register_kprobe(struct kprobe *p)
1352 * its code to prohibit unexpected unloading. 1354 * its code to prohibit unexpected unloading.
1353 */ 1355 */
1354 if (unlikely(!try_module_get(probed_mod))) 1356 if (unlikely(!try_module_get(probed_mod)))
1355 goto fail_with_jump_label; 1357 goto cannot_probe;
1356 1358
1357 /* 1359 /*
1358 * If the module freed .init.text, we couldn't insert 1360 * If the module freed .init.text, we couldn't insert
@@ -1361,7 +1363,7 @@ int __kprobes register_kprobe(struct kprobe *p)
1361 if (within_module_init((unsigned long)p->addr, probed_mod) && 1363 if (within_module_init((unsigned long)p->addr, probed_mod) &&
1362 probed_mod->state != MODULE_STATE_COMING) { 1364 probed_mod->state != MODULE_STATE_COMING) {
1363 module_put(probed_mod); 1365 module_put(probed_mod);
1364 goto fail_with_jump_label; 1366 goto cannot_probe;
1365 } 1367 }
1366 /* ret will be updated by following code */ 1368 /* ret will be updated by following code */
1367 } 1369 }
@@ -1409,7 +1411,7 @@ out:
1409 1411
1410 return ret; 1412 return ret;
1411 1413
1412fail_with_jump_label: 1414cannot_probe:
1413 preempt_enable(); 1415 preempt_enable();
1414 jump_label_unlock(); 1416 jump_label_unlock();
1415 return ret; 1417 return ret;
@@ -1673,8 +1675,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
1673 ri->rp = rp; 1675 ri->rp = rp;
1674 ri->task = current; 1676 ri->task = current;
1675 1677
1676 if (rp->entry_handler && rp->entry_handler(ri, regs)) 1678 if (rp->entry_handler && rp->entry_handler(ri, regs)) {
1679 raw_spin_lock_irqsave(&rp->lock, flags);
1680 hlist_add_head(&ri->hlist, &rp->free_instances);
1681 raw_spin_unlock_irqrestore(&rp->lock, flags);
1677 return 0; 1682 return 0;
1683 }
1678 1684
1679 arch_prepare_kretprobe(ri, regs); 1685 arch_prepare_kretprobe(ri, regs);
1680 1686
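
The hunk above plugs a leak: when a kretprobe's entry_handler declines an invocation by returning nonzero, the just-taken instance is now pushed back onto free_instances instead of being dropped. A hedged sketch of a probe that exercises that path (symbol and handler names are purely illustrative):

    #include <linux/kprobes.h>

    static int my_entry(struct kretprobe_instance *ri, struct pt_regs *regs)
    {
            return in_interrupt() ? 1 : 0;  /* nonzero: skip, instance is recycled */
    }

    static int my_ret(struct kretprobe_instance *ri, struct pt_regs *regs)
    {
            return 0;                       /* e.g. inspect regs_return_value(regs) */
    }

    static struct kretprobe my_rp = {
            .kp.symbol_name = "do_fork",    /* assumed example target */
            .entry_handler  = my_entry,
            .handler        = my_ret,
            .maxactive      = 16,
    };
    /* register_kretprobe(&my_rp) / unregister_kretprobe(&my_rp) as usual */
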
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 8889f7dd7c46..ea9ee4518c35 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -4176,7 +4176,13 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4176 printk("-------------------------------\n"); 4176 printk("-------------------------------\n");
4177 printk("%s:%d %s!\n", file, line, s); 4177 printk("%s:%d %s!\n", file, line, s);
4178 printk("\nother info that might help us debug this:\n\n"); 4178 printk("\nother info that might help us debug this:\n\n");
4179 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); 4179 printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
4180 !rcu_lockdep_current_cpu_online()
4181 ? "RCU used illegally from offline CPU!\n"
4182 : rcu_is_cpu_idle()
4183 ? "RCU used illegally from idle CPU!\n"
4184 : "",
4185 rcu_scheduler_active, debug_locks);
4180 4186
4181 /* 4187 /*
4182 * If a CPU is in the RCU-free window in idle (ie: in the section 4188 * If a CPU is in the RCU-free window in idle (ie: in the section
diff --git a/kernel/module.c b/kernel/module.c
index 2c932760fd33..78ac6ec1e425 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -105,6 +105,7 @@ struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
105 105
106/* Block module loading/unloading? */ 106/* Block module loading/unloading? */
107int modules_disabled = 0; 107int modules_disabled = 0;
108core_param(nomodule, modules_disabled, bint, 0);
108 109
109/* Waiting for a module to finish initializing? */ 110/* Waiting for a module to finish initializing? */
110static DECLARE_WAIT_QUEUE_HEAD(module_wq); 111static DECLARE_WAIT_QUEUE_HEAD(module_wq);
@@ -903,6 +904,36 @@ static ssize_t show_refcnt(struct module_attribute *mattr,
903static struct module_attribute modinfo_refcnt = 904static struct module_attribute modinfo_refcnt =
904 __ATTR(refcnt, 0444, show_refcnt, NULL); 905 __ATTR(refcnt, 0444, show_refcnt, NULL);
905 906
907void __module_get(struct module *module)
908{
909 if (module) {
910 preempt_disable();
911 __this_cpu_inc(module->refptr->incs);
912 trace_module_get(module, _RET_IP_);
913 preempt_enable();
914 }
915}
916EXPORT_SYMBOL(__module_get);
917
918bool try_module_get(struct module *module)
919{
920 bool ret = true;
921
922 if (module) {
923 preempt_disable();
924
925 if (likely(module_is_live(module))) {
926 __this_cpu_inc(module->refptr->incs);
927 trace_module_get(module, _RET_IP_);
928 } else
929 ret = false;
930
931 preempt_enable();
932 }
933 return ret;
934}
935EXPORT_SYMBOL(try_module_get);
936
906void module_put(struct module *module) 937void module_put(struct module *module)
907{ 938{
908 if (module) { 939 if (module) {
@@ -2380,8 +2411,7 @@ static int copy_and_check(struct load_info *info,
2380 return -ENOEXEC; 2411 return -ENOEXEC;
2381 2412
2382 /* Suck in entire file: we'll want most of it. */ 2413 /* Suck in entire file: we'll want most of it. */
2383 /* vmalloc barfs on "unusual" numbers. Check here */ 2414 if ((hdr = vmalloc(len)) == NULL)
2384 if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
2385 return -ENOMEM; 2415 return -ENOMEM;
2386 2416
2387 if (copy_from_user(hdr, umod, len) != 0) { 2417 if (copy_from_user(hdr, umod, len) != 0) {
@@ -2922,7 +2952,8 @@ static struct module *load_module(void __user *umod,
2922 mutex_unlock(&module_mutex); 2952 mutex_unlock(&module_mutex);
2923 2953
2924 /* Module is ready to execute: parsing args may do that. */ 2954 /* Module is ready to execute: parsing args may do that. */
2925 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); 2955 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
2956 -32768, 32767, NULL);
2926 if (err < 0) 2957 if (err < 0)
2927 goto unlink; 2958 goto unlink;
2928 2959
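
The now out-of-line __module_get()/try_module_get() above keep their usual calling convention; a typical caller looks roughly like this (sketch, not from the patch):

    static int grab_owner(struct module *owner)
    {
            if (!try_module_get(owner))     /* refuses once the module is unloading */
                    return -ENODEV;
            /* ... safely use code or data owned by the module ... */
            module_put(owner);
            return 0;
    }
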
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 89096dd8786f..a307cc9c9526 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -240,9 +240,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
240 240
241 /* didn't get the lock, go to sleep: */ 241 /* didn't get the lock, go to sleep: */
242 spin_unlock_mutex(&lock->wait_lock, flags); 242 spin_unlock_mutex(&lock->wait_lock, flags);
243 preempt_enable_no_resched(); 243 schedule_preempt_disabled();
244 schedule();
245 preempt_disable();
246 spin_lock_mutex(&lock->wait_lock, flags); 244 spin_lock_mutex(&lock->wait_lock, flags);
247 } 245 }
248 246
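
schedule_preempt_disabled() simply wraps the removed three-line sequence, so the mutex slow path keeps its semantics; roughly (sketch of the helper, which is defined in the scheduler outside this hunk):

    void __sched schedule_preempt_disabled(void)
    {
            preempt_enable_no_resched();
            schedule();
            preempt_disable();
    }
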
diff --git a/kernel/padata.c b/kernel/padata.c
index b45259931512..6f10eb285ece 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -29,7 +29,6 @@
29#include <linux/sysfs.h> 29#include <linux/sysfs.h>
30#include <linux/rcupdate.h> 30#include <linux/rcupdate.h>
31 31
32#define MAX_SEQ_NR (INT_MAX - NR_CPUS)
33#define MAX_OBJ_NUM 1000 32#define MAX_OBJ_NUM 1000
34 33
35static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) 34static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
@@ -43,18 +42,19 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
43 return target_cpu; 42 return target_cpu;
44} 43}
45 44
46static int padata_cpu_hash(struct padata_priv *padata) 45static int padata_cpu_hash(struct parallel_data *pd)
47{ 46{
48 int cpu_index; 47 int cpu_index;
49 struct parallel_data *pd;
50
51 pd = padata->pd;
52 48
53 /* 49 /*
54 * Hash the sequence numbers to the cpus by taking 50 * Hash the sequence numbers to the cpus by taking
55 * seq_nr mod. number of cpus in use. 51 * seq_nr mod. number of cpus in use.
56 */ 52 */
57 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask.pcpu); 53
54 spin_lock(&pd->seq_lock);
55 cpu_index = pd->seq_nr % cpumask_weight(pd->cpumask.pcpu);
56 pd->seq_nr++;
57 spin_unlock(&pd->seq_lock);
58 58
59 return padata_index_to_cpu(pd, cpu_index); 59 return padata_index_to_cpu(pd, cpu_index);
60} 60}
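
With the per-object atomic sequence number gone, padata_cpu_hash() derives the target from a pd-local counter taken under seq_lock, giving a plain round-robin over the parallel cpumask, for example:

    /* cpumask_weight(pd->cpumask.pcpu) == 4:
     *   seq_nr  0 1 2 3 4 5 ...
     *   index   0 1 2 3 0 1 ...
     */
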
@@ -132,12 +132,7 @@ int padata_do_parallel(struct padata_instance *pinst,
132 padata->pd = pd; 132 padata->pd = pd;
133 padata->cb_cpu = cb_cpu; 133 padata->cb_cpu = cb_cpu;
134 134
135 if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr)) 135 target_cpu = padata_cpu_hash(pd);
136 atomic_set(&pd->seq_nr, -1);
137
138 padata->seq_nr = atomic_inc_return(&pd->seq_nr);
139
140 target_cpu = padata_cpu_hash(padata);
141 queue = per_cpu_ptr(pd->pqueue, target_cpu); 136 queue = per_cpu_ptr(pd->pqueue, target_cpu);
142 137
143 spin_lock(&queue->parallel.lock); 138 spin_lock(&queue->parallel.lock);
@@ -173,7 +168,7 @@ EXPORT_SYMBOL(padata_do_parallel);
173static struct padata_priv *padata_get_next(struct parallel_data *pd) 168static struct padata_priv *padata_get_next(struct parallel_data *pd)
174{ 169{
175 int cpu, num_cpus; 170 int cpu, num_cpus;
176 int next_nr, next_index; 171 unsigned int next_nr, next_index;
177 struct padata_parallel_queue *queue, *next_queue; 172 struct padata_parallel_queue *queue, *next_queue;
178 struct padata_priv *padata; 173 struct padata_priv *padata;
179 struct padata_list *reorder; 174 struct padata_list *reorder;
@@ -189,14 +184,6 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
189 cpu = padata_index_to_cpu(pd, next_index); 184 cpu = padata_index_to_cpu(pd, next_index);
190 next_queue = per_cpu_ptr(pd->pqueue, cpu); 185 next_queue = per_cpu_ptr(pd->pqueue, cpu);
191 186
192 if (unlikely(next_nr > pd->max_seq_nr)) {
193 next_nr = next_nr - pd->max_seq_nr - 1;
194 next_index = next_nr % num_cpus;
195 cpu = padata_index_to_cpu(pd, next_index);
196 next_queue = per_cpu_ptr(pd->pqueue, cpu);
197 pd->processed = 0;
198 }
199
200 padata = NULL; 187 padata = NULL;
201 188
202 reorder = &next_queue->reorder; 189 reorder = &next_queue->reorder;
@@ -205,8 +192,6 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
205 padata = list_entry(reorder->list.next, 192 padata = list_entry(reorder->list.next,
206 struct padata_priv, list); 193 struct padata_priv, list);
207 194
208 BUG_ON(next_nr != padata->seq_nr);
209
210 spin_lock(&reorder->lock); 195 spin_lock(&reorder->lock);
211 list_del_init(&padata->list); 196 list_del_init(&padata->list);
212 atomic_dec(&pd->reorder_objects); 197 atomic_dec(&pd->reorder_objects);
@@ -230,6 +215,7 @@ out:
230 215
231static void padata_reorder(struct parallel_data *pd) 216static void padata_reorder(struct parallel_data *pd)
232{ 217{
218 int cb_cpu;
233 struct padata_priv *padata; 219 struct padata_priv *padata;
234 struct padata_serial_queue *squeue; 220 struct padata_serial_queue *squeue;
235 struct padata_instance *pinst = pd->pinst; 221 struct padata_instance *pinst = pd->pinst;
@@ -270,13 +256,14 @@ static void padata_reorder(struct parallel_data *pd)
270 return; 256 return;
271 } 257 }
272 258
273 squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu); 259 cb_cpu = padata->cb_cpu;
260 squeue = per_cpu_ptr(pd->squeue, cb_cpu);
274 261
275 spin_lock(&squeue->serial.lock); 262 spin_lock(&squeue->serial.lock);
276 list_add_tail(&padata->list, &squeue->serial.list); 263 list_add_tail(&padata->list, &squeue->serial.list);
277 spin_unlock(&squeue->serial.lock); 264 spin_unlock(&squeue->serial.lock);
278 265
279 queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work); 266 queue_work_on(cb_cpu, pinst->wq, &squeue->work);
280 } 267 }
281 268
282 spin_unlock_bh(&pd->lock); 269 spin_unlock_bh(&pd->lock);
@@ -400,7 +387,7 @@ static void padata_init_squeues(struct parallel_data *pd)
400/* Initialize all percpu queues used by parallel workers */ 387/* Initialize all percpu queues used by parallel workers */
401static void padata_init_pqueues(struct parallel_data *pd) 388static void padata_init_pqueues(struct parallel_data *pd)
402{ 389{
403 int cpu_index, num_cpus, cpu; 390 int cpu_index, cpu;
404 struct padata_parallel_queue *pqueue; 391 struct padata_parallel_queue *pqueue;
405 392
406 cpu_index = 0; 393 cpu_index = 0;
@@ -415,9 +402,6 @@ static void padata_init_pqueues(struct parallel_data *pd)
415 INIT_WORK(&pqueue->work, padata_parallel_worker); 402 INIT_WORK(&pqueue->work, padata_parallel_worker);
416 atomic_set(&pqueue->num_obj, 0); 403 atomic_set(&pqueue->num_obj, 0);
417 } 404 }
418
419 num_cpus = cpumask_weight(pd->cpumask.pcpu);
420 pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0;
421} 405}
422 406
423/* Allocate and initialize the internal cpumask dependend resources. */ 407/* Allocate and initialize the internal cpumask dependend resources. */
@@ -444,7 +428,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
444 padata_init_pqueues(pd); 428 padata_init_pqueues(pd);
445 padata_init_squeues(pd); 429 padata_init_squeues(pd);
446 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); 430 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
447 atomic_set(&pd->seq_nr, -1); 431 pd->seq_nr = 0;
448 atomic_set(&pd->reorder_objects, 0); 432 atomic_set(&pd->reorder_objects, 0);
449 atomic_set(&pd->refcnt, 0); 433 atomic_set(&pd->refcnt, 0);
450 pd->pinst = pinst; 434 pd->pinst = pinst;
diff --git a/kernel/params.c b/kernel/params.c
index 32ee04308285..f37d82631347 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -15,7 +15,6 @@
15 along with this program; if not, write to the Free Software 15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/ 17*/
18#include <linux/module.h>
19#include <linux/kernel.h> 18#include <linux/kernel.h>
20#include <linux/string.h> 19#include <linux/string.h>
21#include <linux/errno.h> 20#include <linux/errno.h>
@@ -88,6 +87,8 @@ static int parse_one(char *param,
88 char *val, 87 char *val,
89 const struct kernel_param *params, 88 const struct kernel_param *params,
90 unsigned num_params, 89 unsigned num_params,
90 s16 min_level,
91 s16 max_level,
91 int (*handle_unknown)(char *param, char *val)) 92 int (*handle_unknown)(char *param, char *val))
92{ 93{
93 unsigned int i; 94 unsigned int i;
@@ -96,8 +97,12 @@ static int parse_one(char *param,
96 /* Find parameter */ 97 /* Find parameter */
97 for (i = 0; i < num_params; i++) { 98 for (i = 0; i < num_params; i++) {
98 if (parameq(param, params[i].name)) { 99 if (parameq(param, params[i].name)) {
100 if (params[i].level < min_level
101 || params[i].level > max_level)
102 return 0;
99 /* No one handled NULL, so do it here. */ 103 /* No one handled NULL, so do it here. */
100 if (!val && params[i].ops->set != param_set_bool) 104 if (!val && params[i].ops->set != param_set_bool
105 && params[i].ops->set != param_set_bint)
101 return -EINVAL; 106 return -EINVAL;
102 pr_debug("They are equal! Calling %p\n", 107 pr_debug("They are equal! Calling %p\n",
103 params[i].ops->set); 108 params[i].ops->set);
@@ -174,6 +179,8 @@ int parse_args(const char *name,
174 char *args, 179 char *args,
175 const struct kernel_param *params, 180 const struct kernel_param *params,
176 unsigned num, 181 unsigned num,
182 s16 min_level,
183 s16 max_level,
177 int (*unknown)(char *param, char *val)) 184 int (*unknown)(char *param, char *val))
178{ 185{
179 char *param, *val; 186 char *param, *val;
@@ -189,7 +196,8 @@ int parse_args(const char *name,
189 196
190 args = next_arg(args, &param, &val); 197 args = next_arg(args, &param, &val);
191 irq_was_disabled = irqs_disabled(); 198 irq_was_disabled = irqs_disabled();
192 ret = parse_one(param, val, params, num, unknown); 199 ret = parse_one(param, val, params, num,
200 min_level, max_level, unknown);
193 if (irq_was_disabled && !irqs_disabled()) { 201 if (irq_was_disabled && !irqs_disabled()) {
194 printk(KERN_WARNING "parse_args(): option '%s' enabled " 202 printk(KERN_WARNING "parse_args(): option '%s' enabled "
195 "irq's!\n", param); 203 "irq's!\n", param);
@@ -297,35 +305,18 @@ EXPORT_SYMBOL(param_ops_charp);
297/* Actually could be a bool or an int, for historical reasons. */ 305/* Actually could be a bool or an int, for historical reasons. */
298int param_set_bool(const char *val, const struct kernel_param *kp) 306int param_set_bool(const char *val, const struct kernel_param *kp)
299{ 307{
300 bool v;
301 int ret;
302
303 /* No equals means "set"... */ 308 /* No equals means "set"... */
304 if (!val) val = "1"; 309 if (!val) val = "1";
305 310
306 /* One of =[yYnN01] */ 311 /* One of =[yYnN01] */
307 ret = strtobool(val, &v); 312 return strtobool(val, kp->arg);
308 if (ret)
309 return ret;
310
311 if (kp->flags & KPARAM_ISBOOL)
312 *(bool *)kp->arg = v;
313 else
314 *(int *)kp->arg = v;
315 return 0;
316} 313}
317EXPORT_SYMBOL(param_set_bool); 314EXPORT_SYMBOL(param_set_bool);
318 315
319int param_get_bool(char *buffer, const struct kernel_param *kp) 316int param_get_bool(char *buffer, const struct kernel_param *kp)
320{ 317{
321 bool val;
322 if (kp->flags & KPARAM_ISBOOL)
323 val = *(bool *)kp->arg;
324 else
325 val = *(int *)kp->arg;
326
327 /* Y and N chosen as being relatively non-coder friendly */ 318 /* Y and N chosen as being relatively non-coder friendly */
328 return sprintf(buffer, "%c", val ? 'Y' : 'N'); 319 return sprintf(buffer, "%c", *(bool *)kp->arg ? 'Y' : 'N');
329} 320}
330EXPORT_SYMBOL(param_get_bool); 321EXPORT_SYMBOL(param_get_bool);
331 322
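
With KPARAM_ISBOOL gone, a bool parameter's kp->arg must now point at a real bool, and param_set_bool() defers entirely to strtobool(), which accepts y/Y/1 and n/N/0. A minimal declaration matching the new expectation (illustrative name):

    static bool enable_foo = true;
    module_param(enable_foo, bool, 0644);   /* kp->arg is a genuine bool */
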
@@ -343,7 +334,6 @@ int param_set_invbool(const char *val, const struct kernel_param *kp)
343 struct kernel_param dummy; 334 struct kernel_param dummy;
344 335
345 dummy.arg = &boolval; 336 dummy.arg = &boolval;
346 dummy.flags = KPARAM_ISBOOL;
347 ret = param_set_bool(val, &dummy); 337 ret = param_set_bool(val, &dummy);
348 if (ret == 0) 338 if (ret == 0)
349 *(bool *)kp->arg = !boolval; 339 *(bool *)kp->arg = !boolval;
@@ -372,7 +362,6 @@ int param_set_bint(const char *val, const struct kernel_param *kp)
372 /* Match bool exactly, by re-using it. */ 362 /* Match bool exactly, by re-using it. */
373 boolkp = *kp; 363 boolkp = *kp;
374 boolkp.arg = &v; 364 boolkp.arg = &v;
375 boolkp.flags |= KPARAM_ISBOOL;
376 365
377 ret = param_set_bool(val, &boolkp); 366 ret = param_set_bool(val, &boolkp);
378 if (ret == 0) 367 if (ret == 0)
@@ -393,7 +382,7 @@ static int param_array(const char *name,
393 unsigned int min, unsigned int max, 382 unsigned int min, unsigned int max,
394 void *elem, int elemsize, 383 void *elem, int elemsize,
395 int (*set)(const char *, const struct kernel_param *kp), 384 int (*set)(const char *, const struct kernel_param *kp),
396 u16 flags, 385 s16 level,
397 unsigned int *num) 386 unsigned int *num)
398{ 387{
399 int ret; 388 int ret;
@@ -403,7 +392,7 @@ static int param_array(const char *name,
403 /* Get the name right for errors. */ 392 /* Get the name right for errors. */
404 kp.name = name; 393 kp.name = name;
405 kp.arg = elem; 394 kp.arg = elem;
406 kp.flags = flags; 395 kp.level = level;
407 396
408 *num = 0; 397 *num = 0;
409 /* We expect a comma-separated list of values. */ 398 /* We expect a comma-separated list of values. */
@@ -444,7 +433,7 @@ static int param_array_set(const char *val, const struct kernel_param *kp)
444 unsigned int temp_num; 433 unsigned int temp_num;
445 434
446 return param_array(kp->name, val, 1, arr->max, arr->elem, 435 return param_array(kp->name, val, 1, arr->max, arr->elem,
447 arr->elemsize, arr->ops->set, kp->flags, 436 arr->elemsize, arr->ops->set, kp->level,
448 arr->num ?: &temp_num); 437 arr->num ?: &temp_num);
449} 438}
450 439
diff --git a/kernel/pid.c b/kernel/pid.c
index ce8e00deaccb..9f08dfabaf13 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -543,12 +543,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
543 */ 543 */
544void __init pidhash_init(void) 544void __init pidhash_init(void)
545{ 545{
546 int i, pidhash_size; 546 unsigned int i, pidhash_size;
547 547
548 pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, 548 pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
549 HASH_EARLY | HASH_SMALL, 549 HASH_EARLY | HASH_SMALL,
550 &pidhash_shift, NULL, 4096); 550 &pidhash_shift, NULL, 4096);
551 pidhash_size = 1 << pidhash_shift; 551 pidhash_size = 1U << pidhash_shift;
552 552
553 for (i = 0; i < pidhash_size; i++) 553 for (i = 0; i < pidhash_size; i++)
554 INIT_HLIST_HEAD(&pid_hash[i]); 554 INIT_HLIST_HEAD(&pid_hash[i]);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a8968396046d..57bc1fd35b3c 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -15,6 +15,7 @@
15#include <linux/acct.h> 15#include <linux/acct.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/proc_fs.h> 17#include <linux/proc_fs.h>
18#include <linux/reboot.h>
18 19
19#define BITS_PER_PAGE (PAGE_SIZE*8) 20#define BITS_PER_PAGE (PAGE_SIZE*8)
20 21
@@ -168,13 +169,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
168 while (nr > 0) { 169 while (nr > 0) {
169 rcu_read_lock(); 170 rcu_read_lock();
170 171
171 /*
172 * Any nested-container's init processes won't ignore the
173 * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser().
174 */
175 task = pid_task(find_vpid(nr), PIDTYPE_PID); 172 task = pid_task(find_vpid(nr), PIDTYPE_PID);
176 if (task) 173 if (task && !__fatal_signal_pending(task))
177 send_sig_info(SIGKILL, SEND_SIG_NOINFO, task); 174 send_sig_info(SIGKILL, SEND_SIG_FORCED, task);
178 175
179 rcu_read_unlock(); 176 rcu_read_unlock();
180 177
@@ -187,6 +184,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
187 rc = sys_wait4(-1, NULL, __WALL, NULL); 184 rc = sys_wait4(-1, NULL, __WALL, NULL);
188 } while (rc != -ECHILD); 185 } while (rc != -ECHILD);
189 186
187 if (pid_ns->reboot)
188 current->signal->group_exit_code = pid_ns->reboot;
189
190 acct_exit_ns(pid_ns); 190 acct_exit_ns(pid_ns);
191 return; 191 return;
192} 192}
@@ -221,6 +221,35 @@ static struct ctl_table pid_ns_ctl_table[] = {
221 221
222static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; 222static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
223 223
224int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
225{
226 if (pid_ns == &init_pid_ns)
227 return 0;
228
229 switch (cmd) {
230 case LINUX_REBOOT_CMD_RESTART2:
231 case LINUX_REBOOT_CMD_RESTART:
232 pid_ns->reboot = SIGHUP;
233 break;
234
235 case LINUX_REBOOT_CMD_POWER_OFF:
236 case LINUX_REBOOT_CMD_HALT:
237 pid_ns->reboot = SIGINT;
238 break;
239 default:
240 return -EINVAL;
241 }
242
243 read_lock(&tasklist_lock);
244 force_sig(SIGKILL, pid_ns->child_reaper);
245 read_unlock(&tasklist_lock);
246
247 do_exit(0);
248
249 /* Not reached */
250 return 0;
251}
252
224static __init int pid_namespaces_init(void) 253static __init int pid_namespaces_init(void)
225{ 254{
226 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); 255 pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
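
reboot_pid_ns() above turns a reboot(2) issued by a container's init into a namespace-local event: restart maps to a SIGHUP-style exit code, halt/power-off to SIGINT, the caller is killed, and the host is untouched, while the init namespace passes straight through. Presumably it is wired into sys_reboot() along these lines (the actual hook is outside this diff, so treat this as an assumption):

    ret = reboot_pid_ns(task_active_pid_ns(current), cmd);
    if (ret)
            return ret;     /* -EINVAL for an unknown cmd inside a pid namespace */
    /* for handled commands inside a namespace, reboot_pid_ns() does not return */
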
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 07e0e28ffba7..66d808ec5252 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,7 +1,8 @@
1 1
2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG 2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
3 3
4obj-$(CONFIG_PM) += main.o qos.o 4obj-y += qos.o
5obj-$(CONFIG_PM) += main.o
5obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o 6obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o
6obj-$(CONFIG_FREEZER) += process.o 7obj-$(CONFIG_FREEZER) += process.o
7obj-$(CONFIG_SUSPEND) += suspend.o 8obj-$(CONFIG_SUSPEND) += suspend.o
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 6d6d28870335..0a186cfde788 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -245,8 +245,8 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop,
245 * create_image - Create a hibernation image. 245 * create_image - Create a hibernation image.
246 * @platform_mode: Whether or not to use the platform driver. 246 * @platform_mode: Whether or not to use the platform driver.
247 * 247 *
248 * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image 248 * Execute device drivers' "late" and "noirq" freeze callbacks, create a
249 * and execute the drivers' .thaw_noirq() callbacks. 249 * hibernation image and run the drivers' "noirq" and "early" thaw callbacks.
250 * 250 *
251 * Control reappears in this routine after the subsequent restore. 251 * Control reappears in this routine after the subsequent restore.
252 */ 252 */
@@ -254,7 +254,7 @@ static int create_image(int platform_mode)
254{ 254{
255 int error; 255 int error;
256 256
257 error = dpm_suspend_noirq(PMSG_FREEZE); 257 error = dpm_suspend_end(PMSG_FREEZE);
258 if (error) { 258 if (error) {
259 printk(KERN_ERR "PM: Some devices failed to power down, " 259 printk(KERN_ERR "PM: Some devices failed to power down, "
260 "aborting hibernation\n"); 260 "aborting hibernation\n");
@@ -306,7 +306,7 @@ static int create_image(int platform_mode)
306 Platform_finish: 306 Platform_finish:
307 platform_finish(platform_mode); 307 platform_finish(platform_mode);
308 308
309 dpm_resume_noirq(in_suspend ? 309 dpm_resume_start(in_suspend ?
310 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 310 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
311 311
312 return error; 312 return error;
@@ -343,13 +343,13 @@ int hibernation_snapshot(int platform_mode)
343 * successful freezer test. 343 * successful freezer test.
344 */ 344 */
345 freezer_test_done = true; 345 freezer_test_done = true;
346 goto Cleanup; 346 goto Thaw;
347 } 347 }
348 348
349 error = dpm_prepare(PMSG_FREEZE); 349 error = dpm_prepare(PMSG_FREEZE);
350 if (error) { 350 if (error) {
351 dpm_complete(PMSG_RECOVER); 351 dpm_complete(PMSG_RECOVER);
352 goto Cleanup; 352 goto Thaw;
353 } 353 }
354 354
355 suspend_console(); 355 suspend_console();
@@ -385,6 +385,8 @@ int hibernation_snapshot(int platform_mode)
385 platform_end(platform_mode); 385 platform_end(platform_mode);
386 return error; 386 return error;
387 387
388 Thaw:
389 thaw_kernel_threads();
388 Cleanup: 390 Cleanup:
389 swsusp_free(); 391 swsusp_free();
390 goto Close; 392 goto Close;
@@ -394,16 +396,16 @@ int hibernation_snapshot(int platform_mode)
394 * resume_target_kernel - Restore system state from a hibernation image. 396 * resume_target_kernel - Restore system state from a hibernation image.
395 * @platform_mode: Whether or not to use the platform driver. 397 * @platform_mode: Whether or not to use the platform driver.
396 * 398 *
397 * Execute device drivers' .freeze_noirq() callbacks, restore the contents of 399 * Execute device drivers' "noirq" and "late" freeze callbacks, restore the
398 * highmem that have not been restored yet from the image and run the low-level 400 * contents of highmem that have not been restored yet from the image and run
399 * code that will restore the remaining contents of memory and switch to the 401 * the low-level code that will restore the remaining contents of memory and
400 * just restored target kernel. 402 * switch to the just restored target kernel.
401 */ 403 */
402static int resume_target_kernel(bool platform_mode) 404static int resume_target_kernel(bool platform_mode)
403{ 405{
404 int error; 406 int error;
405 407
406 error = dpm_suspend_noirq(PMSG_QUIESCE); 408 error = dpm_suspend_end(PMSG_QUIESCE);
407 if (error) { 409 if (error) {
408 printk(KERN_ERR "PM: Some devices failed to power down, " 410 printk(KERN_ERR "PM: Some devices failed to power down, "
409 "aborting resume\n"); 411 "aborting resume\n");
@@ -460,7 +462,7 @@ static int resume_target_kernel(bool platform_mode)
460 Cleanup: 462 Cleanup:
461 platform_restore_cleanup(platform_mode); 463 platform_restore_cleanup(platform_mode);
462 464
463 dpm_resume_noirq(PMSG_RECOVER); 465 dpm_resume_start(PMSG_RECOVER);
464 466
465 return error; 467 return error;
466} 468}
@@ -518,7 +520,7 @@ int hibernation_platform_enter(void)
518 goto Resume_devices; 520 goto Resume_devices;
519 } 521 }
520 522
521 error = dpm_suspend_noirq(PMSG_HIBERNATE); 523 error = dpm_suspend_end(PMSG_HIBERNATE);
522 if (error) 524 if (error)
523 goto Resume_devices; 525 goto Resume_devices;
524 526
@@ -549,7 +551,7 @@ int hibernation_platform_enter(void)
549 Platform_finish: 551 Platform_finish:
550 hibernation_ops->finish(); 552 hibernation_ops->finish();
551 553
552 dpm_resume_noirq(PMSG_RESTORE); 554 dpm_resume_start(PMSG_RESTORE);
553 555
554 Resume_devices: 556 Resume_devices:
555 entering_platform_hibernation = false; 557 entering_platform_hibernation = false;
@@ -616,7 +618,7 @@ int hibernate(void)
616 /* Allocate memory management structures */ 618 /* Allocate memory management structures */
617 error = create_basic_memory_bitmaps(); 619 error = create_basic_memory_bitmaps();
618 if (error) 620 if (error)
619 goto Exit; 621 goto Enable_umh;
620 622
621 printk(KERN_INFO "PM: Syncing filesystems ... "); 623 printk(KERN_INFO "PM: Syncing filesystems ... ");
622 sys_sync(); 624 sys_sync();
@@ -624,15 +626,11 @@ int hibernate(void)
624 626
625 error = freeze_processes(); 627 error = freeze_processes();
626 if (error) 628 if (error)
627 goto Finish; 629 goto Free_bitmaps;
628 630
629 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); 631 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
630 if (error) 632 if (error || freezer_test_done)
631 goto Thaw;
632 if (freezer_test_done) {
633 freezer_test_done = false;
634 goto Thaw; 633 goto Thaw;
635 }
636 634
637 if (in_suspend) { 635 if (in_suspend) {
638 unsigned int flags = 0; 636 unsigned int flags = 0;
@@ -657,8 +655,13 @@ int hibernate(void)
657 655
658 Thaw: 656 Thaw:
659 thaw_processes(); 657 thaw_processes();
660 Finish: 658
659 /* Don't bother checking whether freezer_test_done is true */
660 freezer_test_done = false;
661
662 Free_bitmaps:
661 free_basic_memory_bitmaps(); 663 free_basic_memory_bitmaps();
664 Enable_umh:
662 usermodehelper_enable(); 665 usermodehelper_enable();
663 Exit: 666 Exit:
664 pm_notifier_call_chain(PM_POST_HIBERNATION); 667 pm_notifier_call_chain(PM_POST_HIBERNATION);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 9824b41e5a18..1c12581f1c62 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -165,16 +165,20 @@ static int suspend_stats_show(struct seq_file *s, void *unused)
165 last_errno %= REC_FAILED_NUM; 165 last_errno %= REC_FAILED_NUM;
166 last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; 166 last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1;
167 last_step %= REC_FAILED_NUM; 167 last_step %= REC_FAILED_NUM;
168 seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n" 168 seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n"
169 "%s: %d\n%s: %d\n%s: %d\n%s: %d\n", 169 "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n",
170 "success", suspend_stats.success, 170 "success", suspend_stats.success,
171 "fail", suspend_stats.fail, 171 "fail", suspend_stats.fail,
172 "failed_freeze", suspend_stats.failed_freeze, 172 "failed_freeze", suspend_stats.failed_freeze,
173 "failed_prepare", suspend_stats.failed_prepare, 173 "failed_prepare", suspend_stats.failed_prepare,
174 "failed_suspend", suspend_stats.failed_suspend, 174 "failed_suspend", suspend_stats.failed_suspend,
175 "failed_suspend_late",
176 suspend_stats.failed_suspend_late,
175 "failed_suspend_noirq", 177 "failed_suspend_noirq",
176 suspend_stats.failed_suspend_noirq, 178 suspend_stats.failed_suspend_noirq,
177 "failed_resume", suspend_stats.failed_resume, 179 "failed_resume", suspend_stats.failed_resume,
180 "failed_resume_early",
181 suspend_stats.failed_resume_early,
178 "failed_resume_noirq", 182 "failed_resume_noirq",
179 suspend_stats.failed_resume_noirq); 183 suspend_stats.failed_resume_noirq);
180 seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", 184 seq_printf(s, "failures:\n last_failed_dev:\t%-s\n",
@@ -287,16 +291,10 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
287 291
288#ifdef CONFIG_SUSPEND 292#ifdef CONFIG_SUSPEND
289 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { 293 for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {
290 if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) 294 if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) {
295 error = pm_suspend(state);
291 break; 296 break;
292 } 297 }
293 if (state < PM_SUSPEND_MAX && *s) {
294 error = enter_state(state);
295 if (error) {
296 suspend_stats.fail++;
297 dpm_save_failed_errno(error);
298 } else
299 suspend_stats.success++;
300 } 298 }
301#endif 299#endif
302 300
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 0c4defe6d3b8..98f3622d7407 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -177,13 +177,11 @@ extern const char *const pm_states[];
177 177
178extern bool valid_state(suspend_state_t state); 178extern bool valid_state(suspend_state_t state);
179extern int suspend_devices_and_enter(suspend_state_t state); 179extern int suspend_devices_and_enter(suspend_state_t state);
180extern int enter_state(suspend_state_t state);
181#else /* !CONFIG_SUSPEND */ 180#else /* !CONFIG_SUSPEND */
182static inline int suspend_devices_and_enter(suspend_state_t state) 181static inline int suspend_devices_and_enter(suspend_state_t state)
183{ 182{
184 return -ENOSYS; 183 return -ENOSYS;
185} 184}
186static inline int enter_state(suspend_state_t state) { return -ENOSYS; }
187static inline bool valid_state(suspend_state_t state) { return false; } 185static inline bool valid_state(suspend_state_t state) { return false; }
188#endif /* !CONFIG_SUSPEND */ 186#endif /* !CONFIG_SUSPEND */
189 187
@@ -231,8 +229,25 @@ extern int pm_test_level;
231#ifdef CONFIG_SUSPEND_FREEZER 229#ifdef CONFIG_SUSPEND_FREEZER
232static inline int suspend_freeze_processes(void) 230static inline int suspend_freeze_processes(void)
233{ 231{
234 int error = freeze_processes(); 232 int error;
235 return error ? : freeze_kernel_threads(); 233
234 error = freeze_processes();
235 /*
236 * freeze_processes() automatically thaws every task if freezing
237 * fails. So we need not do anything extra upon error.
238 */
239 if (error)
240 return error;
241
242 error = freeze_kernel_threads();
243 /*
244 * freeze_kernel_threads() thaws only kernel threads upon freezing
245 * failure. So we have to thaw the userspace tasks ourselves.
246 */
247 if (error)
248 thaw_processes();
249
250 return error;
236} 251}
237 252
238static inline void suspend_thaw_processes(void) 253static inline void suspend_thaw_processes(void)
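The new comments in suspend_freeze_processes() spell out an asymmetric rollback contract: freeze_processes() undoes its own work completely on failure, while freeze_kernel_threads() thaws only kernel threads and leaves userspace to the caller. A plain C sketch of that contract, with hypothetical stage_one()/stage_two() helpers in place of the freezer calls:

#include <stdio.h>

/* Stage one cleans up after itself on failure. */
static int stage_one(void) { puts("freeze user tasks");     return 0; }
static void undo_one(void) { puts("thaw user tasks"); }
/* Stage two only undoes its own work on failure. */
static int stage_two(void) { puts("freeze kernel threads"); return -1; }

static int freeze_both(void)
{
        int error;

        error = stage_one();
        if (error)
                return error;   /* stage one already rolled back */

        error = stage_two();
        if (error)
                undo_one();     /* stage two did not, so roll back stage one */

        return error;
}

int main(void)
{
        return freeze_both() ? 1 : 0;
}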
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 77274c9ba2f1..0d2aeb226108 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -53,11 +53,9 @@ static int try_to_freeze_tasks(bool user_only)
53 * It is "frozen enough". If the task does wake 53 * It is "frozen enough". If the task does wake
54 * up, it will immediately call try_to_freeze. 54 * up, it will immediately call try_to_freeze.
55 * 55 *
56 * Because freeze_task() goes through p's 56 * Because freeze_task() goes through p's scheduler lock, it's
57 * scheduler lock after setting TIF_FREEZE, it's 57 * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING
58 * guaranteed that either we see TASK_RUNNING or 58 * transition can't race with task state testing here.
59 * try_to_stop() after schedule() in ptrace/signal
60 * stop sees TIF_FREEZE.
61 */ 59 */
62 if (!task_is_stopped_or_traced(p) && 60 if (!task_is_stopped_or_traced(p) &&
63 !freezer_should_skip(p)) 61 !freezer_should_skip(p))
@@ -98,13 +96,15 @@ static int try_to_freeze_tasks(bool user_only)
98 elapsed_csecs / 100, elapsed_csecs % 100, 96 elapsed_csecs / 100, elapsed_csecs % 100,
99 todo - wq_busy, wq_busy); 97 todo - wq_busy, wq_busy);
100 98
101 read_lock(&tasklist_lock); 99 if (!wakeup) {
102 do_each_thread(g, p) { 100 read_lock(&tasklist_lock);
103 if (!wakeup && !freezer_should_skip(p) && 101 do_each_thread(g, p) {
104 p != current && freezing(p) && !frozen(p)) 102 if (p != current && !freezer_should_skip(p)
105 sched_show_task(p); 103 && freezing(p) && !frozen(p))
106 } while_each_thread(g, p); 104 sched_show_task(p);
107 read_unlock(&tasklist_lock); 105 } while_each_thread(g, p);
106 read_unlock(&tasklist_lock);
107 }
108 } else { 108 } else {
109 printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, 109 printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100,
110 elapsed_csecs % 100); 110 elapsed_csecs % 100);
@@ -143,7 +143,10 @@ int freeze_processes(void)
143/** 143/**
144 * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. 144 * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator.
145 * 145 *
146 * On success, returns 0. On failure, -errno and system is fully thawed. 146 * On success, returns 0. On failure, -errno and only the kernel threads are
147 * thawed, so as to give a chance to the caller to do additional cleanups
148 * (if any) before thawing the userspace tasks. So, it is the responsibility
149 * of the caller to thaw the userspace tasks, when the time is right.
147 */ 150 */
148int freeze_kernel_threads(void) 151int freeze_kernel_threads(void)
149{ 152{
@@ -159,7 +162,7 @@ int freeze_kernel_threads(void)
159 BUG_ON(in_atomic()); 162 BUG_ON(in_atomic());
160 163
161 if (error) 164 if (error)
162 thaw_processes(); 165 thaw_kernel_threads();
163 return error; 166 return error;
164} 167}
165 168
@@ -188,3 +191,22 @@ void thaw_processes(void)
188 printk("done.\n"); 191 printk("done.\n");
189} 192}
190 193
194void thaw_kernel_threads(void)
195{
196 struct task_struct *g, *p;
197
198 pm_nosig_freezing = false;
199 printk("Restarting kernel threads ... ");
200
201 thaw_workqueues();
202
203 read_lock(&tasklist_lock);
204 do_each_thread(g, p) {
205 if (p->flags & (PF_KTHREAD | PF_WQ_WORKER))
206 __thaw_task(p);
207 } while_each_thread(g, p);
208 read_unlock(&tasklist_lock);
209
210 schedule();
211 printk("done.\n");
212}
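thaw_kernel_threads() above walks every thread but wakes only those whose flags carry PF_KTHREAD or PF_WQ_WORKER. A small sketch of the same select-by-flag-mask iteration over a plain array; the flag values and the task struct here exist only for the example:

#include <stdio.h>

#define PF_KTHREAD   0x1        /* example values, not the kernel's */
#define PF_WQ_WORKER 0x2

struct task {
        const char *name;
        unsigned int flags;
};

static void wake(const struct task *t)
{
        printf("thawing %s\n", t->name);
}

int main(void)
{
        struct task tasks[] = {
                { "kworker/0:1", PF_KTHREAD | PF_WQ_WORKER },
                { "ksoftirqd/0", PF_KTHREAD },
                { "bash",        0 },           /* user task: skipped */
        };
        size_t i;

        for (i = 0; i < sizeof(tasks) / sizeof(tasks[0]); i++)
                if (tasks[i].flags & (PF_KTHREAD | PF_WQ_WORKER))
                        wake(&tasks[i]);
        return 0;
}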
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 995e3bd3417b..d6d6dbd1ecc0 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -469,21 +469,18 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
469static int __init pm_qos_power_init(void) 469static int __init pm_qos_power_init(void)
470{ 470{
471 int ret = 0; 471 int ret = 0;
472 int i;
472 473
473 ret = register_pm_qos_misc(&cpu_dma_pm_qos); 474 BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES);
474 if (ret < 0) { 475
475 printk(KERN_ERR "pm_qos_param: cpu_dma_latency setup failed\n"); 476 for (i = 1; i < PM_QOS_NUM_CLASSES; i++) {
476 return ret; 477 ret = register_pm_qos_misc(pm_qos_array[i]);
477 } 478 if (ret < 0) {
478 ret = register_pm_qos_misc(&network_lat_pm_qos); 479 printk(KERN_ERR "pm_qos_param: %s setup failed\n",
479 if (ret < 0) { 480 pm_qos_array[i]->name);
480 printk(KERN_ERR "pm_qos_param: network_latency setup failed\n"); 481 return ret;
481 return ret; 482 }
482 } 483 }
483 ret = register_pm_qos_misc(&network_throughput_pm_qos);
484 if (ret < 0)
485 printk(KERN_ERR
486 "pm_qos_param: network_throughput setup failed\n");
487 484
488 return ret; 485 return ret;
489} 486}
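The pm_qos change above replaces three hand-written register calls with a loop over pm_qos_array[], guarded by BUILD_BUG_ON() so the table and the class enum cannot drift apart. A userspace sketch of the same table-driven init, using C11 _Static_assert in place of BUILD_BUG_ON and hypothetical descriptor and register names:

#include <stdio.h>

enum { CLASS_NONE, CLASS_CPU_DMA, CLASS_NET_LAT, CLASS_NET_TPUT, NUM_CLASSES };

struct qos_class {
        const char *name;
};

static const struct qos_class qos_table[] = {
        [CLASS_NONE]     = { "unused" },        /* slot 0 unused, as in pm_qos_array */
        [CLASS_CPU_DMA]  = { "cpu_dma_latency" },
        [CLASS_NET_LAT]  = { "network_latency" },
        [CLASS_NET_TPUT] = { "network_throughput" },
};

/* Fail the build if the table and the enum ever disagree. */
_Static_assert(sizeof(qos_table) / sizeof(qos_table[0]) == NUM_CLASSES,
               "qos_table must cover every class");

static int register_one(const struct qos_class *c)
{
        printf("registering %s\n", c->name);
        return 0;
}

int main(void)
{
        int i, ret;

        for (i = 1; i < NUM_CLASSES; i++) {     /* skip the unused slot 0 */
                ret = register_one(&qos_table[i]);
                if (ret < 0) {
                        fprintf(stderr, "%s setup failed\n", qos_table[i].name);
                        return 1;
                }
        }
        return 0;
}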
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 1cf88900ec4f..0de28576807d 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -711,9 +711,10 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
711 list_for_each_entry(region, &nosave_regions, list) { 711 list_for_each_entry(region, &nosave_regions, list) {
712 unsigned long pfn; 712 unsigned long pfn;
713 713
714 pr_debug("PM: Marking nosave pages: %016lx - %016lx\n", 714 pr_debug("PM: Marking nosave pages: [mem %#010llx-%#010llx]\n",
715 region->start_pfn << PAGE_SHIFT, 715 (unsigned long long) region->start_pfn << PAGE_SHIFT,
716 region->end_pfn << PAGE_SHIFT); 716 ((unsigned long long) region->end_pfn << PAGE_SHIFT)
717 - 1);
717 718
718 for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) 719 for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
719 if (pfn_valid(pfn)) { 720 if (pfn_valid(pfn)) {
@@ -812,7 +813,8 @@ unsigned int snapshot_additional_pages(struct zone *zone)
812 unsigned int res; 813 unsigned int res;
813 814
814 res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); 815 res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
815 res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE); 816 res += DIV_ROUND_UP(res * sizeof(struct bm_block),
817 LINKED_PAGE_DATA_SIZE);
816 return 2 * res; 818 return 2 * res;
817} 819}
818 820
@@ -999,20 +1001,20 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
999 s_page = pfn_to_page(src_pfn); 1001 s_page = pfn_to_page(src_pfn);
1000 d_page = pfn_to_page(dst_pfn); 1002 d_page = pfn_to_page(dst_pfn);
1001 if (PageHighMem(s_page)) { 1003 if (PageHighMem(s_page)) {
1002 src = kmap_atomic(s_page, KM_USER0); 1004 src = kmap_atomic(s_page);
1003 dst = kmap_atomic(d_page, KM_USER1); 1005 dst = kmap_atomic(d_page);
1004 do_copy_page(dst, src); 1006 do_copy_page(dst, src);
1005 kunmap_atomic(dst, KM_USER1); 1007 kunmap_atomic(dst);
1006 kunmap_atomic(src, KM_USER0); 1008 kunmap_atomic(src);
1007 } else { 1009 } else {
1008 if (PageHighMem(d_page)) { 1010 if (PageHighMem(d_page)) {
1009 /* Page pointed to by src may contain some kernel 1011 /* Page pointed to by src may contain some kernel
1010 * data modified by kmap_atomic() 1012 * data modified by kmap_atomic()
1011 */ 1013 */
1012 safe_copy_page(buffer, s_page); 1014 safe_copy_page(buffer, s_page);
1013 dst = kmap_atomic(d_page, KM_USER0); 1015 dst = kmap_atomic(d_page);
1014 copy_page(dst, buffer); 1016 copy_page(dst, buffer);
1015 kunmap_atomic(dst, KM_USER0); 1017 kunmap_atomic(dst);
1016 } else { 1018 } else {
1017 safe_copy_page(page_address(d_page), s_page); 1019 safe_copy_page(page_address(d_page), s_page);
1018 } 1020 }
@@ -1727,9 +1729,9 @@ int snapshot_read_next(struct snapshot_handle *handle)
1727 */ 1729 */
1728 void *kaddr; 1730 void *kaddr;
1729 1731
1730 kaddr = kmap_atomic(page, KM_USER0); 1732 kaddr = kmap_atomic(page);
1731 copy_page(buffer, kaddr); 1733 copy_page(buffer, kaddr);
1732 kunmap_atomic(kaddr, KM_USER0); 1734 kunmap_atomic(kaddr);
1733 handle->buffer = buffer; 1735 handle->buffer = buffer;
1734 } else { 1736 } else {
1735 handle->buffer = page_address(page); 1737 handle->buffer = page_address(page);
@@ -2013,9 +2015,9 @@ static void copy_last_highmem_page(void)
2013 if (last_highmem_page) { 2015 if (last_highmem_page) {
2014 void *dst; 2016 void *dst;
2015 2017
2016 dst = kmap_atomic(last_highmem_page, KM_USER0); 2018 dst = kmap_atomic(last_highmem_page);
2017 copy_page(dst, buffer); 2019 copy_page(dst, buffer);
2018 kunmap_atomic(dst, KM_USER0); 2020 kunmap_atomic(dst);
2019 last_highmem_page = NULL; 2021 last_highmem_page = NULL;
2020 } 2022 }
2021} 2023}
@@ -2308,13 +2310,13 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
2308{ 2310{
2309 void *kaddr1, *kaddr2; 2311 void *kaddr1, *kaddr2;
2310 2312
2311 kaddr1 = kmap_atomic(p1, KM_USER0); 2313 kaddr1 = kmap_atomic(p1);
2312 kaddr2 = kmap_atomic(p2, KM_USER1); 2314 kaddr2 = kmap_atomic(p2);
2313 copy_page(buf, kaddr1); 2315 copy_page(buf, kaddr1);
2314 copy_page(kaddr1, kaddr2); 2316 copy_page(kaddr1, kaddr2);
2315 copy_page(kaddr2, buf); 2317 copy_page(kaddr2, buf);
2316 kunmap_atomic(kaddr2, KM_USER1); 2318 kunmap_atomic(kaddr2);
2317 kunmap_atomic(kaddr1, KM_USER0); 2319 kunmap_atomic(kaddr1);
2318} 2320}
2319 2321
2320/** 2322/**
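The nosave-pages message above switches to the "[mem %#010llx-%#010llx]" convention: the start is start_pfn << PAGE_SHIFT, the end is (end_pfn << PAGE_SHIFT) - 1 so the range reads as inclusive, and each pfn is cast to unsigned long long before shifting so the result cannot overflow a 32-bit long. A short sketch of that arithmetic, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SHIFT 12   /* 4 KiB pages assumed for the example */

static void print_nosave_range(unsigned long start_pfn, unsigned long end_pfn)
{
        /* Cast before shifting: on 32-bit, pfn << PAGE_SHIFT could overflow. */
        unsigned long long start = (unsigned long long)start_pfn << PAGE_SHIFT;
        unsigned long long end   = ((unsigned long long)end_pfn << PAGE_SHIFT) - 1;

        printf("Marking nosave pages: [mem %#010llx-%#010llx]\n", start, end);
}

int main(void)
{
        /* pfns 0x9f000..0xa0000 -> bytes 0x9f000000-0x9fffffff inclusive */
        print_nosave_range(0x9f000, 0xa0000);
        return 0;
}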
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4fd51beed879..88e5c967370d 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -37,8 +37,8 @@ const char *const pm_states[PM_SUSPEND_MAX] = {
37static const struct platform_suspend_ops *suspend_ops; 37static const struct platform_suspend_ops *suspend_ops;
38 38
39/** 39/**
40 * suspend_set_ops - Set the global suspend method table. 40 * suspend_set_ops - Set the global suspend method table.
41 * @ops: Pointer to ops structure. 41 * @ops: Suspend operations to use.
42 */ 42 */
43void suspend_set_ops(const struct platform_suspend_ops *ops) 43void suspend_set_ops(const struct platform_suspend_ops *ops)
44{ 44{
@@ -58,11 +58,11 @@ bool valid_state(suspend_state_t state)
58} 58}
59 59
60/** 60/**
61 * suspend_valid_only_mem - generic memory-only valid callback 61 * suspend_valid_only_mem - Generic memory-only valid callback.
62 * 62 *
63 * Platform drivers that implement mem suspend only and only need 63 * Platform drivers that implement mem suspend only and only need to check for
64 * to check for that in their .valid callback can use this instead 64 * that in their .valid() callback can use this instead of rolling their own
65 * of rolling their own .valid callback. 65 * .valid() callback.
66 */ 66 */
67int suspend_valid_only_mem(suspend_state_t state) 67int suspend_valid_only_mem(suspend_state_t state)
68{ 68{
@@ -83,10 +83,11 @@ static int suspend_test(int level)
83} 83}
84 84
85/** 85/**
86 * suspend_prepare - Do prep work before entering low-power state. 86 * suspend_prepare - Prepare for entering system sleep state.
87 * 87 *
88 * This is common code that is called for each state that we're entering. 88 * Common code run for every system sleep state that can be entered (except for
89 * Run suspend notifiers, allocate a console and stop all processes. 89 * hibernation). Run suspend notifiers, allocate the "suspend" console and
90 * freeze processes.
90 */ 91 */
91static int suspend_prepare(void) 92static int suspend_prepare(void)
92{ 93{
@@ -131,9 +132,9 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
131} 132}
132 133
133/** 134/**
134 * suspend_enter - enter the desired system sleep state. 135 * suspend_enter - Make the system enter the given sleep state.
135 * @state: State to enter 136 * @state: System sleep state to enter.
136 * @wakeup: Returns information that suspend should not be entered again. 137 * @wakeup: Returns information that the sleep state should not be re-entered.
137 * 138 *
138 * This function should be called after devices have been suspended. 139 * This function should be called after devices have been suspended.
139 */ 140 */
@@ -147,7 +148,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
147 goto Platform_finish; 148 goto Platform_finish;
148 } 149 }
149 150
150 error = dpm_suspend_noirq(PMSG_SUSPEND); 151 error = dpm_suspend_end(PMSG_SUSPEND);
151 if (error) { 152 if (error) {
152 printk(KERN_ERR "PM: Some devices failed to power down\n"); 153 printk(KERN_ERR "PM: Some devices failed to power down\n");
153 goto Platform_finish; 154 goto Platform_finish;
@@ -189,7 +190,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
189 if (suspend_ops->wake) 190 if (suspend_ops->wake)
190 suspend_ops->wake(); 191 suspend_ops->wake();
191 192
192 dpm_resume_noirq(PMSG_RESUME); 193 dpm_resume_start(PMSG_RESUME);
193 194
194 Platform_finish: 195 Platform_finish:
195 if (suspend_ops->finish) 196 if (suspend_ops->finish)
@@ -199,9 +200,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
199} 200}
200 201
201/** 202/**
202 * suspend_devices_and_enter - suspend devices and enter the desired system 203 * suspend_devices_and_enter - Suspend devices and enter system sleep state.
203 * sleep state. 204 * @state: System sleep state to enter.
204 * @state: state to enter
205 */ 205 */
206int suspend_devices_and_enter(suspend_state_t state) 206int suspend_devices_and_enter(suspend_state_t state)
207{ 207{
@@ -251,10 +251,10 @@ int suspend_devices_and_enter(suspend_state_t state)
251} 251}
252 252
253/** 253/**
254 * suspend_finish - Do final work before exiting suspend sequence. 254 * suspend_finish - Clean up before finishing the suspend sequence.
255 * 255 *
256 * Call platform code to clean up, restart processes, and free the 256 * Call platform code to clean up, restart processes, and free the console that
257 * console that we've allocated. This is not called for suspend-to-disk. 257 * we've allocated. This routine is not called for hibernation.
258 */ 258 */
259static void suspend_finish(void) 259static void suspend_finish(void)
260{ 260{
@@ -265,16 +265,14 @@ static void suspend_finish(void)
265} 265}
266 266
267/** 267/**
268 * enter_state - Do common work of entering low-power state. 268 * enter_state - Do common work needed to enter system sleep state.
269 * @state: pm_state structure for state we're entering. 269 * @state: System sleep state to enter.
270 * 270 *
271 * Make sure we're the only ones trying to enter a sleep state. Fail 271 * Make sure that no one else is trying to put the system into a sleep state.
272 * if someone has beat us to it, since we don't want anything weird to 272 * Fail if that's not the case. Otherwise, prepare for system suspend, make the
273 * happen when we wake up. 273 * system enter the given sleep state and clean up after wakeup.
274 * Then, do the setup for suspend, enter the state, and cleaup (after
275 * we've woken up).
276 */ 274 */
277int enter_state(suspend_state_t state) 275static int enter_state(suspend_state_t state)
278{ 276{
279 int error; 277 int error;
280 278
@@ -310,24 +308,26 @@ int enter_state(suspend_state_t state)
310} 308}
311 309
312/** 310/**
313 * pm_suspend - Externally visible function for suspending system. 311 * pm_suspend - Externally visible function for suspending the system.
314 * @state: Enumerated value of state to enter. 312 * @state: System sleep state to enter.
315 * 313 *
316 * Determine whether or not value is within range, get state 314 * Check if the value of @state represents one of the supported states,
317 * structure, and enter (above). 315 * execute enter_state() and update system suspend statistics.
318 */ 316 */
319int pm_suspend(suspend_state_t state) 317int pm_suspend(suspend_state_t state)
320{ 318{
321 int ret; 319 int error;
322 if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) { 320
323 ret = enter_state(state); 321 if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
324 if (ret) { 322 return -EINVAL;
325 suspend_stats.fail++; 323
326 dpm_save_failed_errno(ret); 324 error = enter_state(state);
327 } else 325 if (error) {
328 suspend_stats.success++; 326 suspend_stats.fail++;
329 return ret; 327 dpm_save_failed_errno(error);
328 } else {
329 suspend_stats.success++;
330 } 330 }
331 return -EINVAL; 331 return error;
332} 332}
333EXPORT_SYMBOL(pm_suspend); 333EXPORT_SYMBOL(pm_suspend);
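pm_suspend() now rejects out-of-range states up front with -EINVAL and keeps the success/fail bookkeeping in one place around enter_state(). A compact sketch of that validate-then-count shape, with a stand-in enter function and made-up errno values:

#include <stdio.h>

enum state { STATE_ON, STATE_STANDBY, STATE_MEM, STATE_MAX };

static struct { unsigned int success, fail; } stats;

static int enter(enum state s)
{
        return s == STATE_MEM ? 0 : -5; /* pretend only "mem" works */
}

static int do_suspend(int s)
{
        int error;

        if (s <= STATE_ON || s >= STATE_MAX)
                return -22;             /* -EINVAL: reject before touching anything */

        error = enter(s);
        if (error)
                stats.fail++;
        else
                stats.success++;
        return error;
}

int main(void)
{
        do_suspend(STATE_MEM);
        do_suspend(STATE_STANDBY);
        printf("success=%u fail=%u\n", stats.success, stats.fail);
        return 0;
}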
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 6b1ab7a88522..33c4329205af 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -251,12 +251,8 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
251 error = hibernation_snapshot(data->platform_support); 251 error = hibernation_snapshot(data->platform_support);
252 if (!error) { 252 if (!error) {
253 error = put_user(in_suspend, (int __user *)arg); 253 error = put_user(in_suspend, (int __user *)arg);
254 if (!error && !freezer_test_done) 254 data->ready = !freezer_test_done && !error;
255 data->ready = 1; 255 freezer_test_done = false;
256 if (freezer_test_done) {
257 freezer_test_done = false;
258 thaw_processes();
259 }
260 } 256 }
261 break; 257 break;
262 258
@@ -274,6 +270,15 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
274 swsusp_free(); 270 swsusp_free();
275 memset(&data->handle, 0, sizeof(struct snapshot_handle)); 271 memset(&data->handle, 0, sizeof(struct snapshot_handle));
276 data->ready = 0; 272 data->ready = 0;
273 /*
274 * It is necessary to thaw kernel threads here, because
275 * SNAPSHOT_CREATE_IMAGE may be invoked directly after
276 * SNAPSHOT_FREE. In that case, if kernel threads were not
277 * thawed, the preallocation of memory carried out by
278 * hibernation_snapshot() might run into problems (i.e. it
279 * might fail or even deadlock).
280 */
281 thaw_kernel_threads();
277 break; 282 break;
278 283
279 case SNAPSHOT_PREF_IMAGE_SIZE: 284 case SNAPSHOT_PREF_IMAGE_SIZE:
diff --git a/kernel/printk.c b/kernel/printk.c
index 13c0a1143f49..b663c2c95d39 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -44,6 +44,9 @@
44 44
45#include <asm/uaccess.h> 45#include <asm/uaccess.h>
46 46
47#define CREATE_TRACE_POINTS
48#include <trace/events/printk.h>
49
47/* 50/*
48 * Architectures can override it: 51 * Architectures can override it:
49 */ 52 */
@@ -542,6 +545,8 @@ MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
542static void _call_console_drivers(unsigned start, 545static void _call_console_drivers(unsigned start,
543 unsigned end, int msg_log_level) 546 unsigned end, int msg_log_level)
544{ 547{
548 trace_console(&LOG_BUF(0), start, end, log_buf_len);
549
545 if ((msg_log_level < console_loglevel || ignore_loglevel) && 550 if ((msg_log_level < console_loglevel || ignore_loglevel) &&
546 console_drivers && start != end) { 551 console_drivers && start != end) {
547 if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { 552 if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) {
@@ -702,6 +707,9 @@ static bool printk_time = 0;
702#endif 707#endif
703module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); 708module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
704 709
710static bool always_kmsg_dump;
711module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
712
705/* Check if we have any console registered that can be called early in boot. */ 713/* Check if we have any console registered that can be called early in boot. */
706static int have_callable_console(void) 714static int have_callable_console(void)
707{ 715{
@@ -1208,13 +1216,27 @@ int is_console_locked(void)
1208 return console_locked; 1216 return console_locked;
1209} 1217}
1210 1218
1219/*
1220 * Delayed printk facility, for scheduler-internal messages:
1221 */
1222#define PRINTK_BUF_SIZE 512
1223
1224#define PRINTK_PENDING_WAKEUP 0x01
1225#define PRINTK_PENDING_SCHED 0x02
1226
1211static DEFINE_PER_CPU(int, printk_pending); 1227static DEFINE_PER_CPU(int, printk_pending);
1228static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
1212 1229
1213void printk_tick(void) 1230void printk_tick(void)
1214{ 1231{
1215 if (__this_cpu_read(printk_pending)) { 1232 if (__this_cpu_read(printk_pending)) {
1216 __this_cpu_write(printk_pending, 0); 1233 int pending = __this_cpu_xchg(printk_pending, 0);
1217 wake_up_interruptible(&log_wait); 1234 if (pending & PRINTK_PENDING_SCHED) {
1235 char *buf = __get_cpu_var(printk_sched_buf);
1236 printk(KERN_WARNING "[sched_delayed] %s", buf);
1237 }
1238 if (pending & PRINTK_PENDING_WAKEUP)
1239 wake_up_interruptible(&log_wait);
1218 } 1240 }
1219} 1241}
1220 1242
@@ -1228,7 +1250,7 @@ int printk_needs_cpu(int cpu)
1228void wake_up_klogd(void) 1250void wake_up_klogd(void)
1229{ 1251{
1230 if (waitqueue_active(&log_wait)) 1252 if (waitqueue_active(&log_wait))
1231 this_cpu_write(printk_pending, 1); 1253 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
1232} 1254}
1233 1255
1234/** 1256/**
@@ -1621,6 +1643,26 @@ late_initcall(printk_late_init);
1621 1643
1622#if defined CONFIG_PRINTK 1644#if defined CONFIG_PRINTK
1623 1645
1646int printk_sched(const char *fmt, ...)
1647{
1648 unsigned long flags;
1649 va_list args;
1650 char *buf;
1651 int r;
1652
1653 local_irq_save(flags);
1654 buf = __get_cpu_var(printk_sched_buf);
1655
1656 va_start(args, fmt);
1657 r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args);
1658 va_end(args);
1659
1660 __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED);
1661 local_irq_restore(flags);
1662
1663 return r;
1664}
1665
1624/* 1666/*
1625 * printk rate limiting, lifted from the networking subsystem. 1667 * printk rate limiting, lifted from the networking subsystem.
1626 * 1668 *
@@ -1732,6 +1774,9 @@ void kmsg_dump(enum kmsg_dump_reason reason)
1732 unsigned long l1, l2; 1774 unsigned long l1, l2;
1733 unsigned long flags; 1775 unsigned long flags;
1734 1776
1777 if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump)
1778 return;
1779
1735 /* Theoretically, the log could move on after we do this, but 1780 /* Theoretically, the log could move on after we do this, but
1736 there's not a lot we can do about that. The new messages 1781 there's not a lot we can do about that. The new messages
1737 will overwrite the start of what we dump. */ 1782 will overwrite the start of what we dump. */
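The printk hunks above add a deferred path for scheduler-internal messages: printk_sched() only formats into a per-CPU buffer and sets PRINTK_PENDING_SCHED with interrupts off, and printk_tick() later exchanges the pending word for zero and emits whatever was queued. A single-CPU, single-threaded userspace sketch of that split; the pending bits and buffer size match the patch, everything else is illustrative:

#include <stdarg.h>
#include <stdio.h>

#define PRINTK_BUF_SIZE         512
#define PRINTK_PENDING_WAKEUP   0x01
#define PRINTK_PENDING_SCHED    0x02

static int  printk_pending;
static char printk_sched_buf[PRINTK_BUF_SIZE];

/* Called from a context that must not print directly: just queue. */
static int printk_sched_like(const char *fmt, ...)
{
        va_list args;
        int r;

        va_start(args, fmt);
        r = vsnprintf(printk_sched_buf, sizeof(printk_sched_buf), fmt, args);
        va_end(args);

        printk_pending |= PRINTK_PENDING_SCHED;
        return r;
}

/* Called later from a safe context: flush whatever is pending. */
static void printk_tick_like(void)
{
        int pending = printk_pending;

        printk_pending = 0;
        if (pending & PRINTK_PENDING_SCHED)
                printf("[sched_delayed] %s\n", printk_sched_buf);
        if (pending & PRINTK_PENDING_WAKEUP)
                puts("(would wake up log readers here)");
}

int main(void)
{
        printk_sched_like("runtime exceeded by %d ns", 1234);
        printk_tick_like();     /* prints the deferred message */
        return 0;
}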
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 00ab2ca5ed11..ee8d49b9c309 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -231,26 +231,22 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
231} 231}
232 232
233static int ptrace_attach(struct task_struct *task, long request, 233static int ptrace_attach(struct task_struct *task, long request,
234 unsigned long addr,
234 unsigned long flags) 235 unsigned long flags)
235{ 236{
236 bool seize = (request == PTRACE_SEIZE); 237 bool seize = (request == PTRACE_SEIZE);
237 int retval; 238 int retval;
238 239
239 /*
240 * SEIZE will enable new ptrace behaviors which will be implemented
241 * gradually. SEIZE_DEVEL is used to prevent applications
242 * expecting full SEIZE behaviors trapping on kernel commits which
243 * are still in the process of implementing them.
244 *
245 * Only test programs for new ptrace behaviors being implemented
246 * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO.
247 *
248 * Once SEIZE behaviors are completely implemented, this flag and
249 * the following test will be removed.
250 */
251 retval = -EIO; 240 retval = -EIO;
252 if (seize && !(flags & PTRACE_SEIZE_DEVEL)) 241 if (seize) {
253 goto out; 242 if (addr != 0)
243 goto out;
244 if (flags & ~(unsigned long)PTRACE_O_MASK)
245 goto out;
246 flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT);
247 } else {
248 flags = PT_PTRACED;
249 }
254 250
255 audit_ptrace(task); 251 audit_ptrace(task);
256 252
@@ -262,7 +258,7 @@ static int ptrace_attach(struct task_struct *task, long request,
262 258
263 /* 259 /*
264 * Protect exec's credential calculations against our interference; 260 * Protect exec's credential calculations against our interference;
265 * interference; SUID, SGID and LSM creds get determined differently 261 * SUID, SGID and LSM creds get determined differently
266 * under ptrace. 262 * under ptrace.
267 */ 263 */
268 retval = -ERESTARTNOINTR; 264 retval = -ERESTARTNOINTR;
@@ -282,11 +278,11 @@ static int ptrace_attach(struct task_struct *task, long request,
282 if (task->ptrace) 278 if (task->ptrace)
283 goto unlock_tasklist; 279 goto unlock_tasklist;
284 280
285 task->ptrace = PT_PTRACED;
286 if (seize) 281 if (seize)
287 task->ptrace |= PT_SEIZED; 282 flags |= PT_SEIZED;
288 if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) 283 if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE))
289 task->ptrace |= PT_PTRACE_CAP; 284 flags |= PT_PTRACE_CAP;
285 task->ptrace = flags;
290 286
291 __ptrace_link(task, current); 287 __ptrace_link(task, current);
292 288
@@ -528,30 +524,18 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
528 524
529static int ptrace_setoptions(struct task_struct *child, unsigned long data) 525static int ptrace_setoptions(struct task_struct *child, unsigned long data)
530{ 526{
531 child->ptrace &= ~PT_TRACE_MASK; 527 unsigned flags;
532 528
533 if (data & PTRACE_O_TRACESYSGOOD) 529 if (data & ~(unsigned long)PTRACE_O_MASK)
534 child->ptrace |= PT_TRACESYSGOOD; 530 return -EINVAL;
535
536 if (data & PTRACE_O_TRACEFORK)
537 child->ptrace |= PT_TRACE_FORK;
538
539 if (data & PTRACE_O_TRACEVFORK)
540 child->ptrace |= PT_TRACE_VFORK;
541
542 if (data & PTRACE_O_TRACECLONE)
543 child->ptrace |= PT_TRACE_CLONE;
544
545 if (data & PTRACE_O_TRACEEXEC)
546 child->ptrace |= PT_TRACE_EXEC;
547
548 if (data & PTRACE_O_TRACEVFORKDONE)
549 child->ptrace |= PT_TRACE_VFORK_DONE;
550 531
551 if (data & PTRACE_O_TRACEEXIT) 532 /* Avoid intermediate state when all opts are cleared */
552 child->ptrace |= PT_TRACE_EXIT; 533 flags = child->ptrace;
534 flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT);
535 flags |= (data << PT_OPT_FLAG_SHIFT);
536 child->ptrace = flags;
553 537
554 return (data & ~PTRACE_O_MASK) ? -EINVAL : 0; 538 return 0;
555} 539}
556 540
557static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) 541static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info)
@@ -891,7 +875,7 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
891 } 875 }
892 876
893 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { 877 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
894 ret = ptrace_attach(child, request, data); 878 ret = ptrace_attach(child, request, addr, data);
895 /* 879 /*
896 * Some architectures need to do book-keeping after 880 * Some architectures need to do book-keeping after
897 * a ptrace attach. 881 * a ptrace attach.
@@ -1034,7 +1018,7 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
1034 } 1018 }
1035 1019
1036 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { 1020 if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
1037 ret = ptrace_attach(child, request, data); 1021 ret = ptrace_attach(child, request, addr, data);
1038 /* 1022 /*
1039 * Some architectures need to do book-keeping after 1023 * Some architectures need to do book-keeping after
1040 * a ptrace attach. 1024 * a ptrace attach.
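Both ptrace hunks above pack the user-visible PTRACE_O_* bits into task->ptrace at PT_OPT_FLAG_SHIFT, reject unknown bits against PTRACE_O_MASK, and build the whole flags word before a single store so the task never briefly appears with all options cleared. A sketch of that compute-then-store update with illustrative constants (the real mask and shift live in the ptrace headers):

#include <stdio.h>

#define PT_PTRACED              0x0001
#define PT_SEIZED               0x0002          /* example bit positions */
#define PT_OPT_FLAG_SHIFT       3
#define PTRACE_O_MASK           0x7f            /* example: 7 supported option bits */

static unsigned long task_ptrace = PT_PTRACED;  /* pretend we're attached */

static int setoptions(unsigned long data)
{
        unsigned long flags;

        if (data & ~(unsigned long)PTRACE_O_MASK)
                return -22;                     /* -EINVAL: unknown option */

        /* Build the new word fully, then store once. */
        flags = task_ptrace;
        flags &= ~((unsigned long)PTRACE_O_MASK << PT_OPT_FLAG_SHIFT);
        flags |= data << PT_OPT_FLAG_SHIFT;
        task_ptrace = flags;
        return 0;
}

int main(void)
{
        setoptions(0x01 | 0x04);                /* two made-up options */
        printf("ptrace word: %#lx\n", task_ptrace);
        return 0;
}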
diff --git a/kernel/rcu.h b/kernel/rcu.h
index aa88baab5f78..8ba99cdc6515 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu.h
@@ -33,8 +33,27 @@
33 * Process-level increment to ->dynticks_nesting field. This allows for 33 * Process-level increment to ->dynticks_nesting field. This allows for
34 * architectures that use half-interrupts and half-exceptions from 34 * architectures that use half-interrupts and half-exceptions from
35 * process context. 35 * process context.
36 *
37 * DYNTICK_TASK_NEST_MASK defines a field of width DYNTICK_TASK_NEST_WIDTH
38 * that counts the number of process-based reasons why RCU cannot
39 * consider the corresponding CPU to be idle, and DYNTICK_TASK_NEST_VALUE
40 * is the value used to increment or decrement this field.
41 *
42 * The rest of the bits could in principle be used to count interrupts,
43 * but this would mean that a negative-one value in the interrupt
44 * field could incorrectly zero out the DYNTICK_TASK_NEST_MASK field.
45 * We therefore provide a two-bit guard field defined by DYNTICK_TASK_MASK
46 * that is set to DYNTICK_TASK_FLAG upon initial exit from idle.
47 * The DYNTICK_TASK_EXIT_IDLE value is thus the combined value used upon
48 * initial exit from idle.
36 */ 49 */
37#define DYNTICK_TASK_NESTING (LLONG_MAX / 2 - 1) 50#define DYNTICK_TASK_NEST_WIDTH 7
51#define DYNTICK_TASK_NEST_VALUE ((LLONG_MAX >> DYNTICK_TASK_NEST_WIDTH) + 1)
52#define DYNTICK_TASK_NEST_MASK (LLONG_MAX - DYNTICK_TASK_NEST_VALUE + 1)
53#define DYNTICK_TASK_FLAG ((DYNTICK_TASK_NEST_VALUE / 8) * 2)
54#define DYNTICK_TASK_MASK ((DYNTICK_TASK_NEST_VALUE / 8) * 3)
55#define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \
56 DYNTICK_TASK_FLAG)
38 57
39/* 58/*
40 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally 59 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
@@ -50,7 +69,6 @@ extern struct debug_obj_descr rcuhead_debug_descr;
50 69
51static inline void debug_rcu_head_queue(struct rcu_head *head) 70static inline void debug_rcu_head_queue(struct rcu_head *head)
52{ 71{
53 WARN_ON_ONCE((unsigned long)head & 0x3);
54 debug_object_activate(head, &rcuhead_debug_descr); 72 debug_object_activate(head, &rcuhead_debug_descr);
55 debug_object_active_state(head, &rcuhead_debug_descr, 73 debug_object_active_state(head, &rcuhead_debug_descr,
56 STATE_RCU_HEAD_READY, 74 STATE_RCU_HEAD_READY,
@@ -76,16 +94,18 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
76 94
77extern void kfree(const void *); 95extern void kfree(const void *);
78 96
79static inline void __rcu_reclaim(char *rn, struct rcu_head *head) 97static inline bool __rcu_reclaim(char *rn, struct rcu_head *head)
80{ 98{
81 unsigned long offset = (unsigned long)head->func; 99 unsigned long offset = (unsigned long)head->func;
82 100
83 if (__is_kfree_rcu_offset(offset)) { 101 if (__is_kfree_rcu_offset(offset)) {
84 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); 102 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
85 kfree((void *)head - offset); 103 kfree((void *)head - offset);
104 return 1;
86 } else { 105 } else {
87 RCU_TRACE(trace_rcu_invoke_callback(rn, head)); 106 RCU_TRACE(trace_rcu_invoke_callback(rn, head));
88 head->func(head); 107 head->func(head);
108 return 0;
89 } 109 }
90} 110}
91 111
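The new comment in rcu.h describes a split of the 64-bit dynticks nesting counter: a DYNTICK_TASK_NEST_WIDTH-bit field near the top counts process-level reasons the CPU is non-idle, the low bits count interrupts, and the two-bit DYNTICK_TASK_FLAG/DYNTICK_TASK_MASK guard keeps a negative interrupt count from making the task-nesting field read as zero. A small program that reuses the macros from the patch, prints the resulting layout, and walks through the enter/exit arithmetic rcutiny performs on it:

#include <limits.h>
#include <stdio.h>

#define DYNTICK_TASK_NEST_WIDTH 7
#define DYNTICK_TASK_NEST_VALUE ((LLONG_MAX >> DYNTICK_TASK_NEST_WIDTH) + 1)
#define DYNTICK_TASK_NEST_MASK  (LLONG_MAX - DYNTICK_TASK_NEST_VALUE + 1)
#define DYNTICK_TASK_FLAG       ((DYNTICK_TASK_NEST_VALUE / 8) * 2)
#define DYNTICK_TASK_MASK       ((DYNTICK_TASK_NEST_VALUE / 8) * 3)
#define DYNTICK_TASK_EXIT_IDLE  (DYNTICK_TASK_NEST_VALUE + DYNTICK_TASK_FLAG)

int main(void)
{
        long long nesting = DYNTICK_TASK_EXIT_IDLE;     /* boot-time value */

        printf("NEST_VALUE 0x%016llx\n", (unsigned long long)DYNTICK_TASK_NEST_VALUE);
        printf("NEST_MASK  0x%016llx\n", (unsigned long long)DYNTICK_TASK_NEST_MASK);
        printf("EXIT_IDLE  0x%016llx\n", (unsigned long long)DYNTICK_TASK_EXIT_IDLE);

        /* rcu_idle_enter(): drop one task-level reason, or go fully idle. */
        if ((nesting & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE)
                nesting = 0;
        else
                nesting -= DYNTICK_TASK_NEST_VALUE;
        printf("after enter 0x%016llx\n", (unsigned long long)nesting);

        /* rcu_idle_exit(): add one reason back, or leave idle entirely. */
        if (nesting & DYNTICK_TASK_NEST_MASK)
                nesting += DYNTICK_TASK_NEST_VALUE;
        else
                nesting = DYNTICK_TASK_EXIT_IDLE;
        printf("after exit  0x%016llx\n", (unsigned long long)nesting);
        return 0;
}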
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 2bc4e135ff23..a86f1741cc27 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -88,6 +88,9 @@ EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
88 * section. 88 * section.
89 * 89 *
90 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. 90 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
91 *
92 * Note that rcu_read_lock() is disallowed if the CPU is either idle or
93 * offline from an RCU perspective, so check for those as well.
91 */ 94 */
92int rcu_read_lock_bh_held(void) 95int rcu_read_lock_bh_held(void)
93{ 96{
@@ -95,6 +98,8 @@ int rcu_read_lock_bh_held(void)
95 return 1; 98 return 1;
96 if (rcu_is_cpu_idle()) 99 if (rcu_is_cpu_idle())
97 return 0; 100 return 0;
101 if (!rcu_lockdep_current_cpu_online())
102 return 0;
98 return in_softirq() || irqs_disabled(); 103 return in_softirq() || irqs_disabled();
99} 104}
100EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); 105EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 977296dca0a4..37a5444204d2 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -53,7 +53,7 @@ static void __call_rcu(struct rcu_head *head,
53 53
54#include "rcutiny_plugin.h" 54#include "rcutiny_plugin.h"
55 55
56static long long rcu_dynticks_nesting = DYNTICK_TASK_NESTING; 56static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
57 57
58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ 58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
59static void rcu_idle_enter_common(long long oldval) 59static void rcu_idle_enter_common(long long oldval)
@@ -88,10 +88,16 @@ void rcu_idle_enter(void)
88 88
89 local_irq_save(flags); 89 local_irq_save(flags);
90 oldval = rcu_dynticks_nesting; 90 oldval = rcu_dynticks_nesting;
91 rcu_dynticks_nesting = 0; 91 WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0);
92 if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) ==
93 DYNTICK_TASK_NEST_VALUE)
94 rcu_dynticks_nesting = 0;
95 else
96 rcu_dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
92 rcu_idle_enter_common(oldval); 97 rcu_idle_enter_common(oldval);
93 local_irq_restore(flags); 98 local_irq_restore(flags);
94} 99}
100EXPORT_SYMBOL_GPL(rcu_idle_enter);
95 101
96/* 102/*
97 * Exit an interrupt handler towards idle. 103 * Exit an interrupt handler towards idle.
@@ -140,11 +146,15 @@ void rcu_idle_exit(void)
140 146
141 local_irq_save(flags); 147 local_irq_save(flags);
142 oldval = rcu_dynticks_nesting; 148 oldval = rcu_dynticks_nesting;
143 WARN_ON_ONCE(oldval != 0); 149 WARN_ON_ONCE(rcu_dynticks_nesting < 0);
144 rcu_dynticks_nesting = DYNTICK_TASK_NESTING; 150 if (rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK)
151 rcu_dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
152 else
153 rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
145 rcu_idle_exit_common(oldval); 154 rcu_idle_exit_common(oldval);
146 local_irq_restore(flags); 155 local_irq_restore(flags);
147} 156}
157EXPORT_SYMBOL_GPL(rcu_idle_exit);
148 158
149/* 159/*
150 * Enter an interrupt handler, moving away from idle. 160 * Enter an interrupt handler, moving away from idle.
@@ -258,7 +268,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
258 268
259 /* If no RCU callbacks ready to invoke, just return. */ 269 /* If no RCU callbacks ready to invoke, just return. */
260 if (&rcp->rcucblist == rcp->donetail) { 270 if (&rcp->rcucblist == rcp->donetail) {
261 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); 271 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1));
262 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, 272 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,
263 ACCESS_ONCE(rcp->rcucblist), 273 ACCESS_ONCE(rcp->rcucblist),
264 need_resched(), 274 need_resched(),
@@ -269,7 +279,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
269 279
270 /* Move the ready-to-invoke callbacks to a local list. */ 280 /* Move the ready-to-invoke callbacks to a local list. */
271 local_irq_save(flags); 281 local_irq_save(flags);
272 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); 282 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1));
273 list = rcp->rcucblist; 283 list = rcp->rcucblist;
274 rcp->rcucblist = *rcp->donetail; 284 rcp->rcucblist = *rcp->donetail;
275 *rcp->donetail = NULL; 285 *rcp->donetail = NULL;
@@ -319,6 +329,10 @@ static void rcu_process_callbacks(struct softirq_action *unused)
319 */ 329 */
320void synchronize_sched(void) 330void synchronize_sched(void)
321{ 331{
332 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
333 !lock_is_held(&rcu_lock_map) &&
334 !lock_is_held(&rcu_sched_lock_map),
335 "Illegal synchronize_sched() in RCU read-side critical section");
322 cond_resched(); 336 cond_resched();
323} 337}
324EXPORT_SYMBOL_GPL(synchronize_sched); 338EXPORT_SYMBOL_GPL(synchronize_sched);
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 9cb1ae4aabdd..22ecea0dfb62 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -132,6 +132,7 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
132 RCU_TRACE(.rcb.name = "rcu_preempt") 132 RCU_TRACE(.rcb.name = "rcu_preempt")
133}; 133};
134 134
135static void rcu_read_unlock_special(struct task_struct *t);
135static int rcu_preempted_readers_exp(void); 136static int rcu_preempted_readers_exp(void);
136static void rcu_report_exp_done(void); 137static void rcu_report_exp_done(void);
137 138
@@ -146,6 +147,16 @@ static int rcu_cpu_blocking_cur_gp(void)
146/* 147/*
147 * Check for a running RCU reader. Because there is only one CPU, 148 * Check for a running RCU reader. Because there is only one CPU,
148 * there can be but one running RCU reader at a time. ;-) 149 * there can be but one running RCU reader at a time. ;-)
150 *
151 * Returns zero if there are no running readers. Returns a positive
152 * number if there is at least one reader within its RCU read-side
153 * critical section. Returns a negative number if an outermost reader
154 * is in the midst of exiting from its RCU read-side critical section
155 *
156 * Returns zero if there are no running readers. Returns a positive
157 * number if there is at least one reader within its RCU read-side
158 * critical section. Returns a negative number if an outermost reader
159 * is in the midst of exiting from its RCU read-side critical section.
149 */ 160 */
150static int rcu_preempt_running_reader(void) 161static int rcu_preempt_running_reader(void)
151{ 162{
@@ -307,7 +318,6 @@ static int rcu_boost(void)
307 t = container_of(tb, struct task_struct, rcu_node_entry); 318 t = container_of(tb, struct task_struct, rcu_node_entry);
308 rt_mutex_init_proxy_locked(&mtx, t); 319 rt_mutex_init_proxy_locked(&mtx, t);
309 t->rcu_boost_mutex = &mtx; 320 t->rcu_boost_mutex = &mtx;
310 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
311 raw_local_irq_restore(flags); 321 raw_local_irq_restore(flags);
312 rt_mutex_lock(&mtx); 322 rt_mutex_lock(&mtx);
313 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 323 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
@@ -475,7 +485,7 @@ void rcu_preempt_note_context_switch(void)
475 unsigned long flags; 485 unsigned long flags;
476 486
477 local_irq_save(flags); /* must exclude scheduler_tick(). */ 487 local_irq_save(flags); /* must exclude scheduler_tick(). */
478 if (rcu_preempt_running_reader() && 488 if (rcu_preempt_running_reader() > 0 &&
479 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 489 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
480 490
481 /* Possibly blocking in an RCU read-side critical section. */ 491 /* Possibly blocking in an RCU read-side critical section. */
@@ -494,6 +504,13 @@ void rcu_preempt_note_context_switch(void)
494 list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks); 504 list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks);
495 if (rcu_cpu_blocking_cur_gp()) 505 if (rcu_cpu_blocking_cur_gp())
496 rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry; 506 rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry;
507 } else if (rcu_preempt_running_reader() < 0 &&
508 t->rcu_read_unlock_special) {
509 /*
510 * Complete exit from RCU read-side critical section on
511 * behalf of preempted instance of __rcu_read_unlock().
512 */
513 rcu_read_unlock_special(t);
497 } 514 }
498 515
499 /* 516 /*
@@ -526,12 +543,15 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock);
526 * notify RCU core processing or task having blocked during the RCU 543 * notify RCU core processing or task having blocked during the RCU
527 * read-side critical section. 544 * read-side critical section.
528 */ 545 */
529static void rcu_read_unlock_special(struct task_struct *t) 546static noinline void rcu_read_unlock_special(struct task_struct *t)
530{ 547{
531 int empty; 548 int empty;
532 int empty_exp; 549 int empty_exp;
533 unsigned long flags; 550 unsigned long flags;
534 struct list_head *np; 551 struct list_head *np;
552#ifdef CONFIG_RCU_BOOST
553 struct rt_mutex *rbmp = NULL;
554#endif /* #ifdef CONFIG_RCU_BOOST */
535 int special; 555 int special;
536 556
537 /* 557 /*
@@ -552,7 +572,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
552 rcu_preempt_cpu_qs(); 572 rcu_preempt_cpu_qs();
553 573
554 /* Hardware IRQ handlers cannot block. */ 574 /* Hardware IRQ handlers cannot block. */
555 if (in_irq()) { 575 if (in_irq() || in_serving_softirq()) {
556 local_irq_restore(flags); 576 local_irq_restore(flags);
557 return; 577 return;
558 } 578 }
@@ -597,10 +617,10 @@ static void rcu_read_unlock_special(struct task_struct *t)
597 } 617 }
598#ifdef CONFIG_RCU_BOOST 618#ifdef CONFIG_RCU_BOOST
599 /* Unboost self if was boosted. */ 619 /* Unboost self if was boosted. */
600 if (special & RCU_READ_UNLOCK_BOOSTED) { 620 if (t->rcu_boost_mutex != NULL) {
601 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED; 621 rbmp = t->rcu_boost_mutex;
602 rt_mutex_unlock(t->rcu_boost_mutex);
603 t->rcu_boost_mutex = NULL; 622 t->rcu_boost_mutex = NULL;
623 rt_mutex_unlock(rbmp);
604 } 624 }
605#endif /* #ifdef CONFIG_RCU_BOOST */ 625#endif /* #ifdef CONFIG_RCU_BOOST */
606 local_irq_restore(flags); 626 local_irq_restore(flags);
@@ -618,13 +638,22 @@ void __rcu_read_unlock(void)
618 struct task_struct *t = current; 638 struct task_struct *t = current;
619 639
620 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */ 640 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */
621 --t->rcu_read_lock_nesting; 641 if (t->rcu_read_lock_nesting != 1)
622 barrier(); /* decrement before load of ->rcu_read_unlock_special */ 642 --t->rcu_read_lock_nesting;
623 if (t->rcu_read_lock_nesting == 0 && 643 else {
624 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 644 t->rcu_read_lock_nesting = INT_MIN;
625 rcu_read_unlock_special(t); 645 barrier(); /* assign before ->rcu_read_unlock_special load */
646 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
647 rcu_read_unlock_special(t);
648 barrier(); /* ->rcu_read_unlock_special load before assign */
649 t->rcu_read_lock_nesting = 0;
650 }
626#ifdef CONFIG_PROVE_LOCKING 651#ifdef CONFIG_PROVE_LOCKING
627 WARN_ON_ONCE(t->rcu_read_lock_nesting < 0); 652 {
653 int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting);
654
655 WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2);
656 }
628#endif /* #ifdef CONFIG_PROVE_LOCKING */ 657#endif /* #ifdef CONFIG_PROVE_LOCKING */
629} 658}
630EXPORT_SYMBOL_GPL(__rcu_read_unlock); 659EXPORT_SYMBOL_GPL(__rcu_read_unlock);
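The __rcu_read_unlock() rewrite above parks ->rcu_read_lock_nesting at INT_MIN while the outermost unlock does its special work, so an interrupting context (see the new negative-value branch in rcu_preempt_note_context_switch() earlier in this file) can tell "outermost unlock in progress" from "inside a reader" just by the sign. A stripped-down, single-threaded sketch of that sentinel protocol, purely illustrative and not the kernel's locking-correct version:

#include <limits.h>
#include <stdio.h>

static int nesting;             /* stand-in for t->rcu_read_lock_nesting */

static void unlock_special(void)
{
        puts("deferred unlock work runs here");
}

/* What an interrupting context checks. */
static void context_switch_check(void)
{
        if (nesting > 0)
                puts("still inside a reader");
        else if (nesting < 0)
                unlock_special();       /* finish on the reader's behalf */
}

static void read_unlock(void)
{
        if (nesting != 1) {
                --nesting;              /* nested unlock: just count down */
                return;
        }
        nesting = INT_MIN;              /* sentinel: outermost unlock in flight */
        context_switch_check();         /* pretend we were preempted here */
        nesting = 0;
}

int main(void)
{
        nesting = 1;                    /* outermost rcu_read_lock() */
        read_unlock();
        return 0;
}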
@@ -649,7 +678,7 @@ static void rcu_preempt_check_callbacks(void)
649 invoke_rcu_callbacks(); 678 invoke_rcu_callbacks();
650 if (rcu_preempt_gp_in_progress() && 679 if (rcu_preempt_gp_in_progress() &&
651 rcu_cpu_blocking_cur_gp() && 680 rcu_cpu_blocking_cur_gp() &&
652 rcu_preempt_running_reader()) 681 rcu_preempt_running_reader() > 0)
653 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; 682 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
654} 683}
655 684
@@ -706,6 +735,11 @@ EXPORT_SYMBOL_GPL(call_rcu);
706 */ 735 */
707void synchronize_rcu(void) 736void synchronize_rcu(void)
708{ 737{
738 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
739 !lock_is_held(&rcu_lock_map) &&
740 !lock_is_held(&rcu_sched_lock_map),
741 "Illegal synchronize_rcu() in RCU read-side critical section");
742
709#ifdef CONFIG_DEBUG_LOCK_ALLOC 743#ifdef CONFIG_DEBUG_LOCK_ALLOC
710 if (!rcu_scheduler_active) 744 if (!rcu_scheduler_active)
711 return; 745 return;
@@ -882,7 +916,8 @@ static void rcu_preempt_process_callbacks(void)
882static void invoke_rcu_callbacks(void) 916static void invoke_rcu_callbacks(void)
883{ 917{
884 have_rcu_kthread_work = 1; 918 have_rcu_kthread_work = 1;
885 wake_up(&rcu_kthread_wq); 919 if (rcu_kthread_task != NULL)
920 wake_up(&rcu_kthread_wq);
886} 921}
887 922
888#ifdef CONFIG_RCU_TRACE 923#ifdef CONFIG_RCU_TRACE
@@ -943,12 +978,16 @@ early_initcall(rcu_spawn_kthreads);
943 978
944#else /* #ifdef CONFIG_RCU_BOOST */ 979#else /* #ifdef CONFIG_RCU_BOOST */
945 980
981/* Hold off callback invocation until early_initcall() time. */
982static int rcu_scheduler_fully_active __read_mostly;
983
946/* 984/*
947 * Start up softirq processing of callbacks. 985 * Start up softirq processing of callbacks.
948 */ 986 */
949void invoke_rcu_callbacks(void) 987void invoke_rcu_callbacks(void)
950{ 988{
951 raise_softirq(RCU_SOFTIRQ); 989 if (rcu_scheduler_fully_active)
990 raise_softirq(RCU_SOFTIRQ);
952} 991}
953 992
954#ifdef CONFIG_RCU_TRACE 993#ifdef CONFIG_RCU_TRACE
@@ -963,10 +1002,14 @@ static bool rcu_is_callbacks_kthread(void)
963 1002
964#endif /* #ifdef CONFIG_RCU_TRACE */ 1003#endif /* #ifdef CONFIG_RCU_TRACE */
965 1004
966void rcu_init(void) 1005static int __init rcu_scheduler_really_started(void)
967{ 1006{
1007 rcu_scheduler_fully_active = 1;
968 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 1008 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1009 raise_softirq(RCU_SOFTIRQ); /* Invoke any callbacks from early boot. */
1010 return 0;
969} 1011}
1012early_initcall(rcu_scheduler_really_started);
970 1013
971#endif /* #else #ifdef CONFIG_RCU_BOOST */ 1014#endif /* #else #ifdef CONFIG_RCU_BOOST */
972 1015
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 88f17b8a3b1d..a89b381a8c6e 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -56,8 +56,8 @@ static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
56static int nfakewriters = 4; /* # fake writer threads */ 56static int nfakewriters = 4; /* # fake writer threads */
57static int stat_interval; /* Interval between stats, in seconds. */ 57static int stat_interval; /* Interval between stats, in seconds. */
58 /* Defaults to "only at end of test". */ 58 /* Defaults to "only at end of test". */
59static int verbose; /* Print more debug info. */ 59static bool verbose; /* Print more debug info. */
60static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ 60static bool test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ 61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
62static int stutter = 5; /* Start/stop testing interval (in sec) */ 62static int stutter = 5; /* Start/stop testing interval (in sec) */
63static int irqreader = 1; /* RCU readers from irq (timers). */ 63static int irqreader = 1; /* RCU readers from irq (timers). */
@@ -65,7 +65,10 @@ static int fqs_duration; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff; /* Hold time within burst (us). */ 65static int fqs_holdoff; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */ 66static int fqs_stutter = 3; /* Wait time between bursts (s). */
67static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ 67static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */
68static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */
68static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ 69static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */
70static int stall_cpu; /* CPU-stall duration (s). 0 for no stall. */
71static int stall_cpu_holdoff = 10; /* Time to wait until stall (s). */
69static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ 72static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
70static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ 73static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
71static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ 74static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
@@ -95,8 +98,14 @@ module_param(fqs_stutter, int, 0444);
95MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 98MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
96module_param(onoff_interval, int, 0444); 99module_param(onoff_interval, int, 0444);
97MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); 100MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
101module_param(onoff_holdoff, int, 0444);
102MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)");
98module_param(shutdown_secs, int, 0444); 103module_param(shutdown_secs, int, 0444);
99MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable."); 104MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable.");
105module_param(stall_cpu, int, 0444);
106MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable.");
107module_param(stall_cpu_holdoff, int, 0444);
108MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s).");
100module_param(test_boost, int, 0444); 109module_param(test_boost, int, 0444);
101MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); 110MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
102module_param(test_boost_interval, int, 0444); 111module_param(test_boost_interval, int, 0444);
@@ -129,6 +138,7 @@ static struct task_struct *shutdown_task;
129#ifdef CONFIG_HOTPLUG_CPU 138#ifdef CONFIG_HOTPLUG_CPU
130static struct task_struct *onoff_task; 139static struct task_struct *onoff_task;
131#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 140#endif /* #ifdef CONFIG_HOTPLUG_CPU */
141static struct task_struct *stall_task;
132 142
133#define RCU_TORTURE_PIPE_LEN 10 143#define RCU_TORTURE_PIPE_LEN 10
134 144
@@ -990,12 +1000,12 @@ static void rcu_torture_timer(unsigned long unused)
990 rcu_read_lock_bh_held() || 1000 rcu_read_lock_bh_held() ||
991 rcu_read_lock_sched_held() || 1001 rcu_read_lock_sched_held() ||
992 srcu_read_lock_held(&srcu_ctl)); 1002 srcu_read_lock_held(&srcu_ctl));
993 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
994 if (p == NULL) { 1003 if (p == NULL) {
995 /* Leave because rcu_torture_writer is not yet underway */ 1004 /* Leave because rcu_torture_writer is not yet underway */
996 cur_ops->readunlock(idx); 1005 cur_ops->readunlock(idx);
997 return; 1006 return;
998 } 1007 }
1008 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
999 if (p->rtort_mbtest == 0) 1009 if (p->rtort_mbtest == 0)
1000 atomic_inc(&n_rcu_torture_mberror); 1010 atomic_inc(&n_rcu_torture_mberror);
1001 spin_lock(&rand_lock); 1011 spin_lock(&rand_lock);
@@ -1053,13 +1063,13 @@ rcu_torture_reader(void *arg)
1053 rcu_read_lock_bh_held() || 1063 rcu_read_lock_bh_held() ||
1054 rcu_read_lock_sched_held() || 1064 rcu_read_lock_sched_held() ||
1055 srcu_read_lock_held(&srcu_ctl)); 1065 srcu_read_lock_held(&srcu_ctl));
1056 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
1057 if (p == NULL) { 1066 if (p == NULL) {
1058 /* Wait for rcu_torture_writer to get underway */ 1067 /* Wait for rcu_torture_writer to get underway */
1059 cur_ops->readunlock(idx); 1068 cur_ops->readunlock(idx);
1060 schedule_timeout_interruptible(HZ); 1069 schedule_timeout_interruptible(HZ);
1061 continue; 1070 continue;
1062 } 1071 }
1072 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
1063 if (p->rtort_mbtest == 0) 1073 if (p->rtort_mbtest == 0)
1064 atomic_inc(&n_rcu_torture_mberror); 1074 atomic_inc(&n_rcu_torture_mberror);
1065 cur_ops->read_delay(&rand); 1075 cur_ops->read_delay(&rand);
@@ -1300,13 +1310,13 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1300 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " 1310 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1301 "test_boost=%d/%d test_boost_interval=%d " 1311 "test_boost=%d/%d test_boost_interval=%d "
1302 "test_boost_duration=%d shutdown_secs=%d " 1312 "test_boost_duration=%d shutdown_secs=%d "
1303 "onoff_interval=%d\n", 1313 "onoff_interval=%d onoff_holdoff=%d\n",
1304 torture_type, tag, nrealreaders, nfakewriters, 1314 torture_type, tag, nrealreaders, nfakewriters,
1305 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1315 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1306 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, 1316 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1307 test_boost, cur_ops->can_boost, 1317 test_boost, cur_ops->can_boost,
1308 test_boost_interval, test_boost_duration, shutdown_secs, 1318 test_boost_interval, test_boost_duration, shutdown_secs,
1309 onoff_interval); 1319 onoff_interval, onoff_holdoff);
1310} 1320}
1311 1321
1312static struct notifier_block rcutorture_shutdown_nb = { 1322static struct notifier_block rcutorture_shutdown_nb = {
@@ -1399,7 +1409,7 @@ rcu_torture_shutdown(void *arg)
1399 * Execute random CPU-hotplug operations at the interval specified 1409 * Execute random CPU-hotplug operations at the interval specified
1400 * by the onoff_interval. 1410 * by the onoff_interval.
1401 */ 1411 */
1402static int 1412static int __cpuinit
1403rcu_torture_onoff(void *arg) 1413rcu_torture_onoff(void *arg)
1404{ 1414{
1405 int cpu; 1415 int cpu;
@@ -1410,6 +1420,11 @@ rcu_torture_onoff(void *arg)
1410 for_each_online_cpu(cpu) 1420 for_each_online_cpu(cpu)
1411 maxcpu = cpu; 1421 maxcpu = cpu;
1412 WARN_ON(maxcpu < 0); 1422 WARN_ON(maxcpu < 0);
1423 if (onoff_holdoff > 0) {
1424 VERBOSE_PRINTK_STRING("rcu_torture_onoff begin holdoff");
1425 schedule_timeout_interruptible(onoff_holdoff * HZ);
1426 VERBOSE_PRINTK_STRING("rcu_torture_onoff end holdoff");
1427 }
1413 while (!kthread_should_stop()) { 1428 while (!kthread_should_stop()) {
1414 cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); 1429 cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1);
1415 if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { 1430 if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
@@ -1447,15 +1462,18 @@ rcu_torture_onoff(void *arg)
1447 return 0; 1462 return 0;
1448} 1463}
1449 1464
1450static int 1465static int __cpuinit
1451rcu_torture_onoff_init(void) 1466rcu_torture_onoff_init(void)
1452{ 1467{
1468 int ret;
1469
1453 if (onoff_interval <= 0) 1470 if (onoff_interval <= 0)
1454 return 0; 1471 return 0;
1455 onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff"); 1472 onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff");
1456 if (IS_ERR(onoff_task)) { 1473 if (IS_ERR(onoff_task)) {
1474 ret = PTR_ERR(onoff_task);
1457 onoff_task = NULL; 1475 onoff_task = NULL;
1458 return PTR_ERR(onoff_task); 1476 return ret;
1459 } 1477 }
1460 return 0; 1478 return 0;
1461} 1479}
@@ -1481,6 +1499,63 @@ static void rcu_torture_onoff_cleanup(void)
1481 1499
1482#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ 1500#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
1483 1501
1502/*
1503 * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then
1504 * induces a CPU stall for the time specified by stall_cpu.
1505 */
1506static int __cpuinit rcu_torture_stall(void *args)
1507{
1508 unsigned long stop_at;
1509
1510 VERBOSE_PRINTK_STRING("rcu_torture_stall task started");
1511 if (stall_cpu_holdoff > 0) {
1512 VERBOSE_PRINTK_STRING("rcu_torture_stall begin holdoff");
1513 schedule_timeout_interruptible(stall_cpu_holdoff * HZ);
1514 VERBOSE_PRINTK_STRING("rcu_torture_stall end holdoff");
1515 }
1516 if (!kthread_should_stop()) {
1517 stop_at = get_seconds() + stall_cpu;
1518 /* RCU CPU stall is expected behavior in following code. */
1519 printk(KERN_ALERT "rcu_torture_stall start.\n");
1520 rcu_read_lock();
1521 preempt_disable();
1522 while (ULONG_CMP_LT(get_seconds(), stop_at))
1523 continue; /* Induce RCU CPU stall warning. */
1524 preempt_enable();
1525 rcu_read_unlock();
1526 printk(KERN_ALERT "rcu_torture_stall end.\n");
1527 }
1528 rcutorture_shutdown_absorb("rcu_torture_stall");
1529 while (!kthread_should_stop())
1530 schedule_timeout_interruptible(10 * HZ);
1531 return 0;
1532}
1533
1534/* Spawn CPU-stall kthread, if stall_cpu specified. */
1535static int __init rcu_torture_stall_init(void)
1536{
1537 int ret;
1538
1539 if (stall_cpu <= 0)
1540 return 0;
1541 stall_task = kthread_run(rcu_torture_stall, NULL, "rcu_torture_stall");
1542 if (IS_ERR(stall_task)) {
1543 ret = PTR_ERR(stall_task);
1544 stall_task = NULL;
1545 return ret;
1546 }
1547 return 0;
1548}
1549
1550/* Clean up after the CPU-stall kthread, if one was spawned. */
1551static void rcu_torture_stall_cleanup(void)
1552{
1553 if (stall_task == NULL)
1554 return;
1555 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task.");
1556 kthread_stop(stall_task);
1557}
1558
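rcu_torture_stall() above has a simple shape: sleep out the optional stall_cpu_holdoff, compute a deadline stall_cpu seconds ahead, spin with preemption disabled inside an RCU read-side critical section so the stall-warning code has something to report, then park until the module is removed. A rough userspace analogue of that shape, with sleep() and time() standing in for schedule_timeout_interruptible() and get_seconds() (illustrative only):

#include <stdio.h>
#include <time.h>
#include <unistd.h>

static const unsigned int holdoff_secs = 2;	/* models stall_cpu_holdoff */
static const unsigned int stall_secs = 5;	/* models stall_cpu */

int main(void)
{
	time_t stop_at;

	if (holdoff_secs > 0)
		sleep(holdoff_secs);		/* let the system settle first */

	stop_at = time(NULL) + stall_secs;
	printf("stall start\n");
	while (time(NULL) < stop_at)
		;				/* burn CPU until the deadline passes */
	printf("stall end\n");
	return 0;
}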
1484static int rcutorture_cpu_notify(struct notifier_block *self, 1559static int rcutorture_cpu_notify(struct notifier_block *self,
1485 unsigned long action, void *hcpu) 1560 unsigned long action, void *hcpu)
1486{ 1561{
@@ -1523,6 +1598,7 @@ rcu_torture_cleanup(void)
1523 fullstop = FULLSTOP_RMMOD; 1598 fullstop = FULLSTOP_RMMOD;
1524 mutex_unlock(&fullstop_mutex); 1599 mutex_unlock(&fullstop_mutex);
1525 unregister_reboot_notifier(&rcutorture_shutdown_nb); 1600 unregister_reboot_notifier(&rcutorture_shutdown_nb);
1601 rcu_torture_stall_cleanup();
1526 if (stutter_task) { 1602 if (stutter_task) {
1527 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); 1603 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
1528 kthread_stop(stutter_task); 1604 kthread_stop(stutter_task);
@@ -1602,6 +1678,10 @@ rcu_torture_cleanup(void)
1602 cur_ops->cleanup(); 1678 cur_ops->cleanup();
1603 if (atomic_read(&n_rcu_torture_error)) 1679 if (atomic_read(&n_rcu_torture_error))
1604 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); 1680 rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
1681 else if (n_online_successes != n_online_attempts ||
1682 n_offline_successes != n_offline_attempts)
1683 rcu_torture_print_module_parms(cur_ops,
1684 "End of test: RCU_HOTPLUG");
1605 else 1685 else
1606 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); 1686 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
1607} 1687}
@@ -1819,6 +1899,7 @@ rcu_torture_init(void)
1819 } 1899 }
1820 rcu_torture_onoff_init(); 1900 rcu_torture_onoff_init();
1821 register_reboot_notifier(&rcutorture_shutdown_nb); 1901 register_reboot_notifier(&rcutorture_shutdown_nb);
1902 rcu_torture_stall_init();
1822 rcutorture_record_test_transition(); 1903 rcutorture_record_test_transition();
1823 mutex_unlock(&fullstop_mutex); 1904 mutex_unlock(&fullstop_mutex);
1824 return 0; 1905 return 0;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 6c4a6722abfd..1050d6d3922c 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -50,6 +50,8 @@
50#include <linux/wait.h> 50#include <linux/wait.h>
51#include <linux/kthread.h> 51#include <linux/kthread.h>
52#include <linux/prefetch.h> 52#include <linux/prefetch.h>
53#include <linux/delay.h>
54#include <linux/stop_machine.h>
53 55
54#include "rcutree.h" 56#include "rcutree.h"
55#include <trace/events/rcu.h> 57#include <trace/events/rcu.h>
@@ -196,7 +198,7 @@ void rcu_note_context_switch(int cpu)
196EXPORT_SYMBOL_GPL(rcu_note_context_switch); 198EXPORT_SYMBOL_GPL(rcu_note_context_switch);
197 199
198DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 200DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
199 .dynticks_nesting = DYNTICK_TASK_NESTING, 201 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
200 .dynticks = ATOMIC_INIT(1), 202 .dynticks = ATOMIC_INIT(1),
201}; 203};
202 204
@@ -208,8 +210,11 @@ module_param(blimit, int, 0);
208module_param(qhimark, int, 0); 210module_param(qhimark, int, 0);
209module_param(qlowmark, int, 0); 211module_param(qlowmark, int, 0);
210 212
211int rcu_cpu_stall_suppress __read_mostly; 213int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
214int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
215
212module_param(rcu_cpu_stall_suppress, int, 0644); 216module_param(rcu_cpu_stall_suppress, int, 0644);
217module_param(rcu_cpu_stall_timeout, int, 0644);
213 218
214static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 219static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
215static int rcu_pending(int cpu); 220static int rcu_pending(int cpu);
@@ -301,8 +306,6 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
301 return &rsp->node[0]; 306 return &rsp->node[0];
302} 307}
303 308
304#ifdef CONFIG_SMP
305
306/* 309/*
307 * If the specified CPU is offline, tell the caller that it is in 310 * If the specified CPU is offline, tell the caller that it is in
308 * a quiescent state. Otherwise, whack it with a reschedule IPI. 311 * a quiescent state. Otherwise, whack it with a reschedule IPI.
@@ -317,30 +320,21 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
317static int rcu_implicit_offline_qs(struct rcu_data *rdp) 320static int rcu_implicit_offline_qs(struct rcu_data *rdp)
318{ 321{
319 /* 322 /*
320 * If the CPU is offline, it is in a quiescent state. We can 323 * If the CPU is offline for more than a jiffy, it is in a quiescent
321 * trust its state not to change because interrupts are disabled. 324 * state. We can trust its state not to change because interrupts
325 * are disabled. The reason for the jiffy's worth of slack is to
326 * handle CPUs initializing on the way up and finding their way
327 * to the idle loop on the way down.
322 */ 328 */
323 if (cpu_is_offline(rdp->cpu)) { 329 if (cpu_is_offline(rdp->cpu) &&
330 ULONG_CMP_LT(rdp->rsp->gp_start + 2, jiffies)) {
324 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); 331 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl");
325 rdp->offline_fqs++; 332 rdp->offline_fqs++;
326 return 1; 333 return 1;
327 } 334 }
328
329 /*
330 * The CPU is online, so send it a reschedule IPI. This forces
331 * it through the scheduler, and (inefficiently) also handles cases
332 * where idle loops fail to inform RCU about the CPU being idle.
333 */
334 if (rdp->cpu != smp_processor_id())
335 smp_send_reschedule(rdp->cpu);
336 else
337 set_need_resched();
338 rdp->resched_ipi++;
339 return 0; 335 return 0;
340} 336}
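The reworked offline check gives a just-started grace period two jiffies of slack via ULONG_CMP_LT(rdp->rsp->gp_start + 2, jiffies), so CPUs that are still on their way up or down are not prematurely credited with a quiescent state. ULONG_CMP_LT() is the usual wraparound-safe ordering test on free-running counters; a self-contained demonstration of why the signed-difference form survives a jiffies wrap (macro definition shown here for illustration):

#include <stdio.h>
#include <limits.h>

/* Wraparound-safe "time a is before time b" for free-running unsigned counters. */
#define ULONG_CMP_LT(a, b)	((long)((a) - (b)) < 0)

int main(void)
{
	unsigned long gp_start = ULONG_MAX - 10;  /* grace period began just before wrap */
	unsigned long jiffies_now = 9;            /* 20 ticks later, the counter has wrapped */
	unsigned long deadline = gp_start + 2;    /* two jiffies of slack, as in the patch */

	/* The naive comparison is fooled by the wrap; the signed difference is not. */
	printf("naive    deadline < now   : %d\n", deadline < jiffies_now);
	printf("ULONG_CMP_LT(deadline, now): %d\n", ULONG_CMP_LT(deadline, jiffies_now));
	return 0;
}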
341 337
342#endif /* #ifdef CONFIG_SMP */
343
344/* 338/*
345 * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle 339 * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle
346 * 340 *
@@ -366,6 +360,17 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
366 atomic_inc(&rdtp->dynticks); 360 atomic_inc(&rdtp->dynticks);
367 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ 361 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
368 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 362 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
363
364 /*
365 * The idle task is not permitted to enter the idle loop while
366 * in an RCU read-side critical section.
367 */
368 rcu_lockdep_assert(!lock_is_held(&rcu_lock_map),
369 "Illegal idle entry in RCU read-side critical section.");
370 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map),
371 "Illegal idle entry in RCU-bh read-side critical section.");
372 rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map),
373 "Illegal idle entry in RCU-sched read-side critical section.");
369} 374}
370 375
371/** 376/**
@@ -389,10 +394,15 @@ void rcu_idle_enter(void)
389 local_irq_save(flags); 394 local_irq_save(flags);
390 rdtp = &__get_cpu_var(rcu_dynticks); 395 rdtp = &__get_cpu_var(rcu_dynticks);
391 oldval = rdtp->dynticks_nesting; 396 oldval = rdtp->dynticks_nesting;
392 rdtp->dynticks_nesting = 0; 397 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
398 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE)
399 rdtp->dynticks_nesting = 0;
400 else
401 rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
393 rcu_idle_enter_common(rdtp, oldval); 402 rcu_idle_enter_common(rdtp, oldval);
394 local_irq_restore(flags); 403 local_irq_restore(flags);
395} 404}
405EXPORT_SYMBOL_GPL(rcu_idle_enter);
396 406
397/** 407/**
398 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle 408 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
@@ -462,7 +472,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
462 * Exit idle mode, in other words, -enter- the mode in which RCU 472 * Exit idle mode, in other words, -enter- the mode in which RCU
463 * read-side critical sections can occur. 473 * read-side critical sections can occur.
464 * 474 *
465 * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NESTING to 475 * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to
466 * allow for the possibility of usermode upcalls messing up our count 476 * allow for the possibility of usermode upcalls messing up our count
467 * of interrupt nesting level during the busy period that is just 477 * of interrupt nesting level during the busy period that is just
468 * now starting. 478 * now starting.
@@ -476,11 +486,15 @@ void rcu_idle_exit(void)
476 local_irq_save(flags); 486 local_irq_save(flags);
477 rdtp = &__get_cpu_var(rcu_dynticks); 487 rdtp = &__get_cpu_var(rcu_dynticks);
478 oldval = rdtp->dynticks_nesting; 488 oldval = rdtp->dynticks_nesting;
479 WARN_ON_ONCE(oldval != 0); 489 WARN_ON_ONCE(oldval < 0);
480 rdtp->dynticks_nesting = DYNTICK_TASK_NESTING; 490 if (oldval & DYNTICK_TASK_NEST_MASK)
491 rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
492 else
493 rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
481 rcu_idle_exit_common(rdtp, oldval); 494 rcu_idle_exit_common(rdtp, oldval);
482 local_irq_restore(flags); 495 local_irq_restore(flags);
483} 496}
497EXPORT_SYMBOL_GPL(rcu_idle_exit);
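The DYNTICK_TASK_NEST_VALUE/DYNTICK_TASK_NEST_MASK arithmetic in rcu_idle_enter() and rcu_idle_exit() keeps process-level nesting in a high-order field of ->dynticks_nesting, so that nested idle exits (for example from usermode upcalls) balance with their matching enters while the low-order bits remain available for interrupt nesting. A toy userspace model of that bookkeeping, with made-up constants rather than the kernel's real ones:

#include <assert.h>
#include <stdio.h>

/* Illustrative constants only; the kernel derives its own elsewhere. */
#define TASK_NEST_VALUE	(1L << 16)		/* one unit of task-level nesting */
#define TASK_NEST_MASK	(~(TASK_NEST_VALUE - 1))
#define TASK_EXIT_IDLE	TASK_NEST_VALUE		/* leaving idle (kernel adds a flag too) */

static long nesting;				/* models rdtp->dynticks_nesting */

static void model_idle_exit(void)		/* cf. rcu_idle_exit() */
{
	if (nesting & TASK_NEST_MASK)
		nesting += TASK_NEST_VALUE;	/* nested: just add one unit */
	else
		nesting = TASK_EXIT_IDLE;	/* outermost exit from idle */
}

static void model_idle_enter(void)		/* cf. rcu_idle_enter() */
{
	assert(nesting & TASK_NEST_MASK);	/* entering idle while already idle is a bug */
	if ((nesting & TASK_NEST_MASK) == TASK_NEST_VALUE)
		nesting = 0;			/* outermost: really go idle */
	else
		nesting -= TASK_NEST_VALUE;	/* nested: peel off one unit */
}

int main(void)
{
	model_idle_exit();			/* leave idle */
	model_idle_exit();			/* nested exit, e.g. from an upcall */
	model_idle_enter();			/* matching nested enter */
	model_idle_enter();			/* outermost enter: back to idle */
	printf("nesting = %ld (0 means idle)\n", nesting);
	return 0;
}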
484 498
485/** 499/**
486 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle 500 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
@@ -581,6 +595,49 @@ int rcu_is_cpu_idle(void)
581} 595}
582EXPORT_SYMBOL(rcu_is_cpu_idle); 596EXPORT_SYMBOL(rcu_is_cpu_idle);
583 597
598#ifdef CONFIG_HOTPLUG_CPU
599
600/*
601 * Is the current CPU online? Disable preemption to avoid false positives
602 * that could otherwise happen due to the current CPU number being sampled,
603 * this task being preempted, its old CPU being taken offline, resuming
604 * on some other CPU, then determining that its old CPU is now offline.
605 * It is OK to use RCU on an offline processor during initial boot, hence
606 * the check for rcu_scheduler_fully_active. Note also that it is OK
607 * for a CPU coming online to use RCU for one jiffy prior to marking itself
608 * online in the cpu_online_mask. Similarly, it is OK for a CPU going
609 * offline to continue to use RCU for one jiffy after marking itself
610 * offline in the cpu_online_mask. This leniency is necessary given the
611 * non-atomic nature of the online and offline processing, for example,
612 * the fact that a CPU enters the scheduler after completing the CPU_DYING
613 * notifiers.
614 *
615 * This is also why RCU internally marks CPUs online during the
616 * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase.
617 *
618 * Disable checking if in an NMI handler because we cannot safely report
619 * errors from NMI handlers anyway.
620 */
621bool rcu_lockdep_current_cpu_online(void)
622{
623 struct rcu_data *rdp;
624 struct rcu_node *rnp;
625 bool ret;
626
627 if (in_nmi())
628 return 1;
629 preempt_disable();
630 rdp = &__get_cpu_var(rcu_sched_data);
631 rnp = rdp->mynode;
632 ret = (rdp->grpmask & rnp->qsmaskinit) ||
633 !rcu_scheduler_fully_active;
634 preempt_enable();
635 return ret;
636}
637EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
638
639#endif /* #ifdef CONFIG_HOTPLUG_CPU */
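rcu_lockdep_current_cpu_online() is meant for debug-time checks only; the preempt_disable()/preempt_enable() pair pins the check to one CPU so the sampled per-CPU state really belongs to the CPU doing the asking. A hypothetical assertion built on top of it might look like this (the helper name is invented for illustration):

#include <linux/rcupdate.h>

/* Hypothetical debug helper (name invented): complain if RCU is used on a CPU
 * that RCU itself currently considers offline. */
static inline void rcu_debug_assert_cpu_online(void)
{
	rcu_lockdep_assert(rcu_lockdep_current_cpu_online(),
			   "RCU used from an offline CPU");
}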
640
584#endif /* #ifdef CONFIG_PROVE_RCU */ 641#endif /* #ifdef CONFIG_PROVE_RCU */
585 642
586/** 643/**
@@ -595,8 +652,6 @@ int rcu_is_cpu_rrupt_from_idle(void)
595 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; 652 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
596} 653}
597 654
598#ifdef CONFIG_SMP
599
600/* 655/*
601 * Snapshot the specified CPU's dynticks counter so that we can later 656 * Snapshot the specified CPU's dynticks counter so that we can later
602 * credit them with an implicit quiescent state. Return 1 if this CPU 657 * credit them with an implicit quiescent state. Return 1 if this CPU
@@ -640,12 +695,28 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
640 return rcu_implicit_offline_qs(rdp); 695 return rcu_implicit_offline_qs(rdp);
641} 696}
642 697
643#endif /* #ifdef CONFIG_SMP */ 698static int jiffies_till_stall_check(void)
699{
700 int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout);
701
702 /*
703 * Limit check must be consistent with the Kconfig limits
704 * for CONFIG_RCU_CPU_STALL_TIMEOUT.
705 */
706 if (till_stall_check < 3) {
707 ACCESS_ONCE(rcu_cpu_stall_timeout) = 3;
708 till_stall_check = 3;
709 } else if (till_stall_check > 300) {
710 ACCESS_ONCE(rcu_cpu_stall_timeout) = 300;
711 till_stall_check = 300;
712 }
713 return till_stall_check * HZ + RCU_STALL_DELAY_DELTA;
714}
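jiffies_till_stall_check() converts the new rcu_cpu_stall_timeout parameter into a jiffies delay, clamping it to the 3..300 second range that the Kconfig option permits and writing the clamped value back so that readers of the parameter see what is actually in force. The same pattern in isolation, as a userspace sketch with plain variables in place of a module parameter:

#include <stdio.h>

#define HZ 100				/* illustrative tick rate */

static int stall_timeout_secs = 1000;	/* models rcu_cpu_stall_timeout */

/* Clamp to the supported range, publish the clamped value, convert to ticks. */
static int stall_timeout_ticks(void)
{
	int secs = stall_timeout_secs;

	if (secs < 3)
		secs = 3;
	else if (secs > 300)
		secs = 300;
	stall_timeout_secs = secs;	/* write back so readers see the real value */
	return secs * HZ;
}

int main(void)
{
	printf("%d ticks (parameter now %d)\n", stall_timeout_ticks(),
	       stall_timeout_secs);
	return 0;
}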
644 715
645static void record_gp_stall_check_time(struct rcu_state *rsp) 716static void record_gp_stall_check_time(struct rcu_state *rsp)
646{ 717{
647 rsp->gp_start = jiffies; 718 rsp->gp_start = jiffies;
648 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; 719 rsp->jiffies_stall = jiffies + jiffies_till_stall_check();
649} 720}
650 721
651static void print_other_cpu_stall(struct rcu_state *rsp) 722static void print_other_cpu_stall(struct rcu_state *rsp)
@@ -664,13 +735,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
664 raw_spin_unlock_irqrestore(&rnp->lock, flags); 735 raw_spin_unlock_irqrestore(&rnp->lock, flags);
665 return; 736 return;
666 } 737 }
667 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 738 rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3;
668
669 /*
670 * Now rat on any tasks that got kicked up to the root rcu_node
671 * due to CPU offlining.
672 */
673 ndetected = rcu_print_task_stall(rnp);
674 raw_spin_unlock_irqrestore(&rnp->lock, flags); 739 raw_spin_unlock_irqrestore(&rnp->lock, flags);
675 740
676 /* 741 /*
@@ -678,8 +743,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
678 * See Documentation/RCU/stallwarn.txt for info on how to debug 743 * See Documentation/RCU/stallwarn.txt for info on how to debug
679 * RCU CPU stall warnings. 744 * RCU CPU stall warnings.
680 */ 745 */
681 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", 746 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks:",
682 rsp->name); 747 rsp->name);
748 print_cpu_stall_info_begin();
683 rcu_for_each_leaf_node(rsp, rnp) { 749 rcu_for_each_leaf_node(rsp, rnp) {
684 raw_spin_lock_irqsave(&rnp->lock, flags); 750 raw_spin_lock_irqsave(&rnp->lock, flags);
685 ndetected += rcu_print_task_stall(rnp); 751 ndetected += rcu_print_task_stall(rnp);
@@ -688,11 +754,22 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
688 continue; 754 continue;
689 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) 755 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
690 if (rnp->qsmask & (1UL << cpu)) { 756 if (rnp->qsmask & (1UL << cpu)) {
691 printk(" %d", rnp->grplo + cpu); 757 print_cpu_stall_info(rsp, rnp->grplo + cpu);
692 ndetected++; 758 ndetected++;
693 } 759 }
694 } 760 }
695 printk("} (detected by %d, t=%ld jiffies)\n", 761
762 /*
763 * Now rat on any tasks that got kicked up to the root rcu_node
764 * due to CPU offlining.
765 */
766 rnp = rcu_get_root(rsp);
767 raw_spin_lock_irqsave(&rnp->lock, flags);
768 ndetected = rcu_print_task_stall(rnp);
769 raw_spin_unlock_irqrestore(&rnp->lock, flags);
770
771 print_cpu_stall_info_end();
772 printk(KERN_CONT "(detected by %d, t=%ld jiffies)\n",
696 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 773 smp_processor_id(), (long)(jiffies - rsp->gp_start));
697 if (ndetected == 0) 774 if (ndetected == 0)
698 printk(KERN_ERR "INFO: Stall ended before state dump start\n"); 775 printk(KERN_ERR "INFO: Stall ended before state dump start\n");
@@ -716,15 +793,18 @@ static void print_cpu_stall(struct rcu_state *rsp)
716 * See Documentation/RCU/stallwarn.txt for info on how to debug 793 * See Documentation/RCU/stallwarn.txt for info on how to debug
717 * RCU CPU stall warnings. 794 * RCU CPU stall warnings.
718 */ 795 */
719 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", 796 printk(KERN_ERR "INFO: %s self-detected stall on CPU", rsp->name);
720 rsp->name, smp_processor_id(), jiffies - rsp->gp_start); 797 print_cpu_stall_info_begin();
798 print_cpu_stall_info(rsp, smp_processor_id());
799 print_cpu_stall_info_end();
800 printk(KERN_CONT " (t=%lu jiffies)\n", jiffies - rsp->gp_start);
721 if (!trigger_all_cpu_backtrace()) 801 if (!trigger_all_cpu_backtrace())
722 dump_stack(); 802 dump_stack();
723 803
724 raw_spin_lock_irqsave(&rnp->lock, flags); 804 raw_spin_lock_irqsave(&rnp->lock, flags);
725 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) 805 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
726 rsp->jiffies_stall = 806 rsp->jiffies_stall = jiffies +
727 jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 807 3 * jiffies_till_stall_check() + 3;
728 raw_spin_unlock_irqrestore(&rnp->lock, flags); 808 raw_spin_unlock_irqrestore(&rnp->lock, flags);
729 809
730 set_need_resched(); /* kick ourselves to get things going. */ 810 set_need_resched(); /* kick ourselves to get things going. */
@@ -807,6 +887,7 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct
807 rdp->passed_quiesce = 0; 887 rdp->passed_quiesce = 0;
808 } else 888 } else
809 rdp->qs_pending = 0; 889 rdp->qs_pending = 0;
890 zero_cpu_stall_ticks(rdp);
810 } 891 }
811} 892}
812 893
@@ -943,6 +1024,10 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
943 * in preparation for detecting the next grace period. The caller must hold 1024 * in preparation for detecting the next grace period. The caller must hold
944 * the root node's ->lock, which is released before return. Hard irqs must 1025 * the root node's ->lock, which is released before return. Hard irqs must
945 * be disabled. 1026 * be disabled.
1027 *
1028 * Note that it is legal for a dying CPU (which is marked as offline) to
1029 * invoke this function. This can happen when the dying CPU reports its
1030 * quiescent state.
946 */ 1031 */
947static void 1032static void
948rcu_start_gp(struct rcu_state *rsp, unsigned long flags) 1033rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
@@ -980,26 +1065,8 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
980 rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */ 1065 rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */
981 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 1066 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
982 record_gp_stall_check_time(rsp); 1067 record_gp_stall_check_time(rsp);
983
984 /* Special-case the common single-level case. */
985 if (NUM_RCU_NODES == 1) {
986 rcu_preempt_check_blocked_tasks(rnp);
987 rnp->qsmask = rnp->qsmaskinit;
988 rnp->gpnum = rsp->gpnum;
989 rnp->completed = rsp->completed;
990 rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */
991 rcu_start_gp_per_cpu(rsp, rnp, rdp);
992 rcu_preempt_boost_start_gp(rnp);
993 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
994 rnp->level, rnp->grplo,
995 rnp->grphi, rnp->qsmask);
996 raw_spin_unlock_irqrestore(&rnp->lock, flags);
997 return;
998 }
999
1000 raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ 1068 raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */
1001 1069
1002
1003 /* Exclude any concurrent CPU-hotplug operations. */ 1070 /* Exclude any concurrent CPU-hotplug operations. */
1004 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ 1071 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
1005 1072
@@ -1245,53 +1312,115 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1245 1312
1246/* 1313/*
1247 * Move a dying CPU's RCU callbacks to online CPU's callback list. 1314 * Move a dying CPU's RCU callbacks to online CPU's callback list.
1248 * Synchronization is not required because this function executes 1315 * Also record a quiescent state for this CPU for the current grace period.
1249 * in stop_machine() context. 1316 * Synchronization and interrupt disabling are not required because
1317 * this function executes in stop_machine() context. Therefore, cleanup
1318 * operations that might block must be done later from the CPU_DEAD
1319 * notifier.
1320 *
1321 * Note that the outgoing CPU's bit has already been cleared in the
1322 * cpu_online_mask. This allows us to randomly pick a callback
1323 * destination from the bits set in that mask.
1250 */ 1324 */
1251static void rcu_send_cbs_to_online(struct rcu_state *rsp) 1325static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1252{ 1326{
1253 int i; 1327 int i;
1254 /* current DYING CPU is cleared in the cpu_online_mask */ 1328 unsigned long mask;
1255 int receive_cpu = cpumask_any(cpu_online_mask); 1329 int receive_cpu = cpumask_any(cpu_online_mask);
1256 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1330 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
1257 struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); 1331 struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
1332 RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */
1333
1334 /* First, adjust the counts. */
1335 if (rdp->nxtlist != NULL) {
1336 receive_rdp->qlen_lazy += rdp->qlen_lazy;
1337 receive_rdp->qlen += rdp->qlen;
1338 rdp->qlen_lazy = 0;
1339 rdp->qlen = 0;
1340 }
1258 1341
1259 if (rdp->nxtlist == NULL) 1342 /*
1260 return; /* irqs disabled, so comparison is stable. */ 1343 * Next, move ready-to-invoke callbacks to be invoked on some
1344 * other CPU. These will not be required to pass through another
1345 * grace period: They are done, regardless of CPU.
1346 */
1347 if (rdp->nxtlist != NULL &&
1348 rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) {
1349 struct rcu_head *oldhead;
1350 struct rcu_head **oldtail;
1351 struct rcu_head **newtail;
1352
1353 oldhead = rdp->nxtlist;
1354 oldtail = receive_rdp->nxttail[RCU_DONE_TAIL];
1355 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
1356 *rdp->nxttail[RCU_DONE_TAIL] = *oldtail;
1357 *receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead;
1358 newtail = rdp->nxttail[RCU_DONE_TAIL];
1359 for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) {
1360 if (receive_rdp->nxttail[i] == oldtail)
1361 receive_rdp->nxttail[i] = newtail;
1362 if (rdp->nxttail[i] == newtail)
1363 rdp->nxttail[i] = &rdp->nxtlist;
1364 }
1365 }
1261 1366
1262 *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; 1367 /*
1263 receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; 1368 * Finally, put the rest of the callbacks at the end of the list.
1264 receive_rdp->qlen += rdp->qlen; 1369 * The ones that made it partway through get to start over: We
1265 receive_rdp->n_cbs_adopted += rdp->qlen; 1370 * cannot assume that grace periods are synchronized across CPUs.
1266 rdp->n_cbs_orphaned += rdp->qlen; 1371 * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but
1372 * this does not seem compelling. Not yet, anyway.)
1373 */
1374 if (rdp->nxtlist != NULL) {
1375 *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
1376 receive_rdp->nxttail[RCU_NEXT_TAIL] =
1377 rdp->nxttail[RCU_NEXT_TAIL];
1378 receive_rdp->n_cbs_adopted += rdp->qlen;
1379 rdp->n_cbs_orphaned += rdp->qlen;
1380
1381 rdp->nxtlist = NULL;
1382 for (i = 0; i < RCU_NEXT_SIZE; i++)
1383 rdp->nxttail[i] = &rdp->nxtlist;
1384 }
1267 1385
1268 rdp->nxtlist = NULL; 1386 /*
1269 for (i = 0; i < RCU_NEXT_SIZE; i++) 1387 * Record a quiescent state for the dying CPU. This is safe
1270 rdp->nxttail[i] = &rdp->nxtlist; 1388 * only because we have already cleared out the callbacks.
1271 rdp->qlen = 0; 1389 * (Otherwise, the RCU core might try to schedule the invocation
1390 * of callbacks on this now-offline CPU, which would be bad.)
1391 */
1392 mask = rdp->grpmask; /* rnp->grplo is constant. */
1393 trace_rcu_grace_period(rsp->name,
1394 rnp->gpnum + 1 - !!(rnp->qsmask & mask),
1395 "cpuofl");
1396 rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum);
1397 /* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */
1272} 1398}
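Most of rcu_cleanup_dying_cpu() is list surgery on callback lists that are addressed by a head pointer plus an array of tail pointers (nxtlist and nxttail[]), which lets whole segments move between CPUs without being walked. The core idiom, reduced to one list with a single tail pointer, as a runnable sketch (not the kernel's data structures):

#include <stdio.h>

struct node {
	int val;
	struct node *next;
};

/* A list addressed by a head pointer plus a pointer to the last ->next slot. */
struct list {
	struct node *head;
	struct node **tail;
};

static void list_init(struct list *l)
{
	l->head = NULL;
	l->tail = &l->head;
}

static void list_add(struct list *l, struct node *n)
{
	n->next = NULL;
	*l->tail = n;		/* link after the current last element (or head) */
	l->tail = &n->next;	/* the new element's ->next is now the tail slot */
}

/* Move every element of @src onto the end of @dst in O(1), as the patch does
 * when handing a dying CPU's callbacks to a surviving CPU. */
static void list_splice_tail(struct list *dst, struct list *src)
{
	if (src->head == NULL)
		return;
	*dst->tail = src->head;
	dst->tail = src->tail;
	list_init(src);
}

int main(void)
{
	struct list a, b;
	struct node n1 = { .val = 1 }, n2 = { .val = 2 }, n3 = { .val = 3 };
	struct node *p;

	list_init(&a);
	list_init(&b);
	list_add(&a, &n1);
	list_add(&b, &n2);
	list_add(&b, &n3);
	list_splice_tail(&a, &b);
	for (p = a.head; p; p = p->next)
		printf("%d ", p->val);
	printf("\n");		/* prints: 1 2 3 */
	return 0;
}

In the patch the same splice is done per segment, so callbacks that are already past their grace period are not requeued behind a fresh grace period on the receiving CPU.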
1273 1399
1274/* 1400/*
1275 * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy 1401 * The CPU has been completely removed, and some other CPU is reporting
1276 * and move all callbacks from the outgoing CPU to the current one. 1402 * this fact from process context. Do the remainder of the cleanup.
1277 * There can only be one CPU hotplug operation at a time, so no other 1403 * There can only be one CPU hotplug operation at a time, so no other
1278 * CPU can be attempting to update rcu_cpu_kthread_task. 1404 * CPU can be attempting to update rcu_cpu_kthread_task.
1279 */ 1405 */
1280static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) 1406static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1281{ 1407{
1282 unsigned long flags; 1408 unsigned long flags;
1283 unsigned long mask; 1409 unsigned long mask;
1284 int need_report = 0; 1410 int need_report = 0;
1285 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 1411 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1286 struct rcu_node *rnp; 1412 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rnp. */
1287 1413
1414 /* Adjust any no-longer-needed kthreads. */
1288 rcu_stop_cpu_kthread(cpu); 1415 rcu_stop_cpu_kthread(cpu);
1416 rcu_node_kthread_setaffinity(rnp, -1);
1417
1418 /* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */
1289 1419
1290 /* Exclude any attempts to start a new grace period. */ 1420 /* Exclude any attempts to start a new grace period. */
1291 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1421 raw_spin_lock_irqsave(&rsp->onofflock, flags);
1292 1422
1293 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 1423 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
1294 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */
1295 mask = rdp->grpmask; /* rnp->grplo is constant. */ 1424 mask = rdp->grpmask; /* rnp->grplo is constant. */
1296 do { 1425 do {
1297 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1426 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
@@ -1299,20 +1428,11 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1299 if (rnp->qsmaskinit != 0) { 1428 if (rnp->qsmaskinit != 0) {
1300 if (rnp != rdp->mynode) 1429 if (rnp != rdp->mynode)
1301 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1430 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1302 else
1303 trace_rcu_grace_period(rsp->name,
1304 rnp->gpnum + 1 -
1305 !!(rnp->qsmask & mask),
1306 "cpuofl");
1307 break; 1431 break;
1308 } 1432 }
1309 if (rnp == rdp->mynode) { 1433 if (rnp == rdp->mynode)
1310 trace_rcu_grace_period(rsp->name,
1311 rnp->gpnum + 1 -
1312 !!(rnp->qsmask & mask),
1313 "cpuofl");
1314 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); 1434 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
1315 } else 1435 else
1316 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1436 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1317 mask = rnp->grpmask; 1437 mask = rnp->grpmask;
1318 rnp = rnp->parent; 1438 rnp = rnp->parent;
@@ -1332,29 +1452,15 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1332 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1452 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1333 if (need_report & RCU_OFL_TASKS_EXP_GP) 1453 if (need_report & RCU_OFL_TASKS_EXP_GP)
1334 rcu_report_exp_rnp(rsp, rnp, true); 1454 rcu_report_exp_rnp(rsp, rnp, true);
1335 rcu_node_kthread_setaffinity(rnp, -1);
1336}
1337
1338/*
1339 * Remove the specified CPU from the RCU hierarchy and move any pending
1340 * callbacks that it might have to the current CPU. This code assumes
1341 * that at least one CPU in the system will remain running at all times.
1342 * Any attempt to offline -all- CPUs is likely to strand RCU callbacks.
1343 */
1344static void rcu_offline_cpu(int cpu)
1345{
1346 __rcu_offline_cpu(cpu, &rcu_sched_state);
1347 __rcu_offline_cpu(cpu, &rcu_bh_state);
1348 rcu_preempt_offline_cpu(cpu);
1349} 1455}
1350 1456
1351#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1457#else /* #ifdef CONFIG_HOTPLUG_CPU */
1352 1458
1353static void rcu_send_cbs_to_online(struct rcu_state *rsp) 1459static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
1354{ 1460{
1355} 1461}
1356 1462
1357static void rcu_offline_cpu(int cpu) 1463static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
1358{ 1464{
1359} 1465}
1360 1466
@@ -1368,11 +1474,11 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1368{ 1474{
1369 unsigned long flags; 1475 unsigned long flags;
1370 struct rcu_head *next, *list, **tail; 1476 struct rcu_head *next, *list, **tail;
1371 int bl, count; 1477 int bl, count, count_lazy;
1372 1478
1373 /* If no callbacks are ready, just return.*/ 1479 /* If no callbacks are ready, just return.*/
1374 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 1480 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
1375 trace_rcu_batch_start(rsp->name, 0, 0); 1481 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);
1376 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), 1482 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
1377 need_resched(), is_idle_task(current), 1483 need_resched(), is_idle_task(current),
1378 rcu_is_callbacks_kthread()); 1484 rcu_is_callbacks_kthread());
@@ -1384,8 +1490,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1384 * races with call_rcu() from interrupt handlers. 1490 * races with call_rcu() from interrupt handlers.
1385 */ 1491 */
1386 local_irq_save(flags); 1492 local_irq_save(flags);
1493 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
1387 bl = rdp->blimit; 1494 bl = rdp->blimit;
1388 trace_rcu_batch_start(rsp->name, rdp->qlen, bl); 1495 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl);
1389 list = rdp->nxtlist; 1496 list = rdp->nxtlist;
1390 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; 1497 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
1391 *rdp->nxttail[RCU_DONE_TAIL] = NULL; 1498 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
@@ -1396,12 +1503,13 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1396 local_irq_restore(flags); 1503 local_irq_restore(flags);
1397 1504
1398 /* Invoke callbacks. */ 1505 /* Invoke callbacks. */
1399 count = 0; 1506 count = count_lazy = 0;
1400 while (list) { 1507 while (list) {
1401 next = list->next; 1508 next = list->next;
1402 prefetch(next); 1509 prefetch(next);
1403 debug_rcu_head_unqueue(list); 1510 debug_rcu_head_unqueue(list);
1404 __rcu_reclaim(rsp->name, list); 1511 if (__rcu_reclaim(rsp->name, list))
1512 count_lazy++;
1405 list = next; 1513 list = next;
1406 /* Stop only if limit reached and CPU has something to do. */ 1514 /* Stop only if limit reached and CPU has something to do. */
1407 if (++count >= bl && 1515 if (++count >= bl &&
@@ -1416,6 +1524,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1416 rcu_is_callbacks_kthread()); 1524 rcu_is_callbacks_kthread());
1417 1525
1418 /* Update count, and requeue any remaining callbacks. */ 1526 /* Update count, and requeue any remaining callbacks. */
1527 rdp->qlen_lazy -= count_lazy;
1419 rdp->qlen -= count; 1528 rdp->qlen -= count;
1420 rdp->n_cbs_invoked += count; 1529 rdp->n_cbs_invoked += count;
1421 if (list != NULL) { 1530 if (list != NULL) {
@@ -1458,6 +1567,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1458void rcu_check_callbacks(int cpu, int user) 1567void rcu_check_callbacks(int cpu, int user)
1459{ 1568{
1460 trace_rcu_utilization("Start scheduler-tick"); 1569 trace_rcu_utilization("Start scheduler-tick");
1570 increment_cpu_stall_ticks();
1461 if (user || rcu_is_cpu_rrupt_from_idle()) { 1571 if (user || rcu_is_cpu_rrupt_from_idle()) {
1462 1572
1463 /* 1573 /*
@@ -1492,8 +1602,6 @@ void rcu_check_callbacks(int cpu, int user)
1492 trace_rcu_utilization("End scheduler-tick"); 1602 trace_rcu_utilization("End scheduler-tick");
1493} 1603}
1494 1604
1495#ifdef CONFIG_SMP
1496
1497/* 1605/*
1498 * Scan the leaf rcu_node structures, processing dyntick state for any that 1606 * Scan the leaf rcu_node structures, processing dyntick state for any that
1499 * have not yet encountered a quiescent state, using the function specified. 1607 * have not yet encountered a quiescent state, using the function specified.
@@ -1616,15 +1724,6 @@ unlock_fqs_ret:
1616 trace_rcu_utilization("End fqs"); 1724 trace_rcu_utilization("End fqs");
1617} 1725}
1618 1726
1619#else /* #ifdef CONFIG_SMP */
1620
1621static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1622{
1623 set_need_resched();
1624}
1625
1626#endif /* #else #ifdef CONFIG_SMP */
1627
1628/* 1727/*
1629 * This does the RCU core processing work for the specified rcu_state 1728 * This does the RCU core processing work for the specified rcu_state
1630 * and rcu_data structures. This may be called only from the CPU to 1729 * and rcu_data structures. This may be called only from the CPU to
@@ -1702,11 +1801,12 @@ static void invoke_rcu_core(void)
1702 1801
1703static void 1802static void
1704__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), 1803__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1705 struct rcu_state *rsp) 1804 struct rcu_state *rsp, bool lazy)
1706{ 1805{
1707 unsigned long flags; 1806 unsigned long flags;
1708 struct rcu_data *rdp; 1807 struct rcu_data *rdp;
1709 1808
1809 WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */
1710 debug_rcu_head_queue(head); 1810 debug_rcu_head_queue(head);
1711 head->func = func; 1811 head->func = func;
1712 head->next = NULL; 1812 head->next = NULL;
@@ -1720,18 +1820,21 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1720 * a quiescent state betweentimes. 1820 * a quiescent state betweentimes.
1721 */ 1821 */
1722 local_irq_save(flags); 1822 local_irq_save(flags);
1823 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
1723 rdp = this_cpu_ptr(rsp->rda); 1824 rdp = this_cpu_ptr(rsp->rda);
1724 1825
1725 /* Add the callback to our list. */ 1826 /* Add the callback to our list. */
1726 *rdp->nxttail[RCU_NEXT_TAIL] = head; 1827 *rdp->nxttail[RCU_NEXT_TAIL] = head;
1727 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 1828 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1728 rdp->qlen++; 1829 rdp->qlen++;
1830 if (lazy)
1831 rdp->qlen_lazy++;
1729 1832
1730 if (__is_kfree_rcu_offset((unsigned long)func)) 1833 if (__is_kfree_rcu_offset((unsigned long)func))
1731 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, 1834 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
1732 rdp->qlen); 1835 rdp->qlen_lazy, rdp->qlen);
1733 else 1836 else
1734 trace_rcu_callback(rsp->name, head, rdp->qlen); 1837 trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen);
1735 1838
1736 /* If interrupts were disabled, don't dive into RCU core. */ 1839 /* If interrupts were disabled, don't dive into RCU core. */
1737 if (irqs_disabled_flags(flags)) { 1840 if (irqs_disabled_flags(flags)) {
@@ -1778,16 +1881,16 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1778 */ 1881 */
1779void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 1882void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1780{ 1883{
1781 __call_rcu(head, func, &rcu_sched_state); 1884 __call_rcu(head, func, &rcu_sched_state, 0);
1782} 1885}
1783EXPORT_SYMBOL_GPL(call_rcu_sched); 1886EXPORT_SYMBOL_GPL(call_rcu_sched);
1784 1887
1785/* 1888/*
1786 * Queue an RCU for invocation after a quicker grace period. 1889 * Queue an RCU callback for invocation after a quicker grace period.
1787 */ 1890 */
1788void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 1891void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1789{ 1892{
1790 __call_rcu(head, func, &rcu_bh_state); 1893 __call_rcu(head, func, &rcu_bh_state, 0);
1791} 1894}
1792EXPORT_SYMBOL_GPL(call_rcu_bh); 1895EXPORT_SYMBOL_GPL(call_rcu_bh);
1793 1896
@@ -1816,6 +1919,10 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
1816 */ 1919 */
1817void synchronize_sched(void) 1920void synchronize_sched(void)
1818{ 1921{
1922 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
1923 !lock_is_held(&rcu_lock_map) &&
1924 !lock_is_held(&rcu_sched_lock_map),
1925 "Illegal synchronize_sched() in RCU-sched read-side critical section");
1819 if (rcu_blocking_is_gp()) 1926 if (rcu_blocking_is_gp())
1820 return; 1927 return;
1821 wait_rcu_gp(call_rcu_sched); 1928 wait_rcu_gp(call_rcu_sched);
@@ -1833,12 +1940,137 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
1833 */ 1940 */
1834void synchronize_rcu_bh(void) 1941void synchronize_rcu_bh(void)
1835{ 1942{
1943 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
1944 !lock_is_held(&rcu_lock_map) &&
1945 !lock_is_held(&rcu_sched_lock_map),
1946 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
1836 if (rcu_blocking_is_gp()) 1947 if (rcu_blocking_is_gp())
1837 return; 1948 return;
1838 wait_rcu_gp(call_rcu_bh); 1949 wait_rcu_gp(call_rcu_bh);
1839} 1950}
1840EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 1951EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
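The rcu_lockdep_assert() calls added to synchronize_sched() and synchronize_rcu_bh() catch a self-deadlock that is otherwise silent: waiting for a grace period from inside a read-side critical section that is itself holding that grace period up. An illustrative fragment of the bug class (not code from this patch):

#include <linux/rcupdate.h>

/* BUGGY on purpose: the shape of the bug the new assertions now flag. */
static void buggy_wait_for_grace_period(void)
{
	rcu_read_lock_bh();
	/*
	 * synchronize_rcu_bh() cannot return until all pre-existing RCU-bh
	 * read-side critical sections have completed, including this one,
	 * so the call below can never make progress.  The added
	 * rcu_lockdep_assert() turns the silent hang into an immediate
	 * lockdep complaint.
	 */
	synchronize_rcu_bh();
	rcu_read_unlock_bh();
}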
1841 1952
1953static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
1954static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
1955
1956static int synchronize_sched_expedited_cpu_stop(void *data)
1957{
1958 /*
1959 * There must be a full memory barrier on each affected CPU
1960 * between the time that try_stop_cpus() is called and the
1961 * time that it returns.
1962 *
1963 * In the current initial implementation of cpu_stop, the
1964 * above condition is already met when the control reaches
1965 * this point and the following smp_mb() is not strictly
1966 * necessary. Do smp_mb() anyway for documentation and
1967 * robustness against future implementation changes.
1968 */
1969 smp_mb(); /* See above comment block. */
1970 return 0;
1971}
1972
1973/**
1974 * synchronize_sched_expedited - Brute-force RCU-sched grace period
1975 *
1976 * Wait for an RCU-sched grace period to elapse, but use a "big hammer"
1977 * approach to force the grace period to end quickly. This consumes
1978 * significant time on all CPUs and is unfriendly to real-time workloads,
1979 * so is thus not recommended for any sort of common-case code. In fact,
1980 * if you are using synchronize_sched_expedited() in a loop, please
1981 * restructure your code to batch your updates, and then use a single
1982 * synchronize_sched() instead.
1983 *
1984 * Note that it is illegal to call this function while holding any lock
1985 * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
1986 * to call this function from a CPU-hotplug notifier. Failing to observe
1987 * these restrictions will result in deadlock.
1988 *
1989 * This implementation can be thought of as an application of ticket
1990 * locking to RCU, with sync_sched_expedited_started and
1991 * sync_sched_expedited_done taking on the roles of the halves
1992 * of the ticket-lock word. Each task atomically increments
1993 * sync_sched_expedited_started upon entry, snapshotting the old value,
1994 * then attempts to stop all the CPUs. If this succeeds, then each
1995 * CPU will have executed a context switch, resulting in an RCU-sched
1996 * grace period. We are then done, so we use atomic_cmpxchg() to
1997 * update sync_sched_expedited_done to match our snapshot -- but
1998 * only if someone else has not already advanced past our snapshot.
1999 *
2000 * On the other hand, if try_stop_cpus() fails, we check the value
2001 * of sync_sched_expedited_done. If it has advanced past our
2002 * initial snapshot, then someone else must have forced a grace period
2003 * some time after we took our snapshot. In this case, our work is
2004 * done for us, and we can simply return. Otherwise, we try again,
2005 * but keep our initial snapshot for purposes of checking for someone
2006 * doing our work for us.
2007 *
2008 * If we fail too many times in a row, we fall back to synchronize_sched().
2009 */
2010void synchronize_sched_expedited(void)
2011{
2012 int firstsnap, s, snap, trycount = 0;
2013
2014 /* Note that atomic_inc_return() implies full memory barrier. */
2015 firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
2016 get_online_cpus();
2017 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
2018
2019 /*
2020 * Each pass through the following loop attempts to force a
2021 * context switch on each CPU.
2022 */
2023 while (try_stop_cpus(cpu_online_mask,
2024 synchronize_sched_expedited_cpu_stop,
2025 NULL) == -EAGAIN) {
2026 put_online_cpus();
2027
2028 /* No joy, try again later. Or just synchronize_sched(). */
2029 if (trycount++ < 10)
2030 udelay(trycount * num_online_cpus());
2031 else {
2032 synchronize_sched();
2033 return;
2034 }
2035
2036 /* Check to see if someone else did our work for us. */
2037 s = atomic_read(&sync_sched_expedited_done);
2038 if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
2039 smp_mb(); /* ensure test happens before caller kfree */
2040 return;
2041 }
2042
2043 /*
2044 * Refetching sync_sched_expedited_started allows later
2045 * callers to piggyback on our grace period. We subtract
2046 * 1 to get the same token that the last incrementer got.
2047 * We retry after they started, so our grace period works
2048 * for them, and they started after our first try, so their
2049 * grace period works for us.
2050 */
2051 get_online_cpus();
2052 snap = atomic_read(&sync_sched_expedited_started);
2053 smp_mb(); /* ensure read is before try_stop_cpus(). */
2054 }
2055
2056 /*
2057 * Everyone up to our most recent fetch is covered by our grace
2058 * period. Update the counter, but only if our work is still
2059 * relevant -- which it won't be if someone who started later
2060 * than we did beat us to the punch.
2061 */
2062 do {
2063 s = atomic_read(&sync_sched_expedited_done);
2064 if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
2065 smp_mb(); /* ensure test happens before caller kfree */
2066 break;
2067 }
2068 } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
2069
2070 put_online_cpus();
2071}
2072EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
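The started/done counter scheme described in the comment block is easier to see in miniature: each caller snapshots the 'started' ticket, tries the expensive operation, and either advances 'done' to its own ticket with a compare-and-swap or discovers that 'done' has already passed its ticket, meaning someone else's grace period covered it. A compressed userspace model using C11 atomics, with the try_stop_cpus() step replaced by a stub (illustrative only; the kernel version also refetches 'started' so that later arrivals can piggyback on the grace period it forces):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int started;
static atomic_int done;

/* Wraparound-safe "a >= b" on tickets, like the patch's UINT_CMP_GE(). */
static bool ticket_ge(unsigned int a, unsigned int b)
{
	return (int)(a - b) >= 0;
}

/* Stand-in for the expensive step (try_stop_cpus() in the patch). */
static bool force_grace_period(void)
{
	return true;
}

static void expedited(void)
{
	int snap = atomic_fetch_add(&started, 1) + 1;	/* take a ticket */

	while (!force_grace_period()) {
		/* Did someone else's grace period already cover our ticket? */
		if (ticket_ge(atomic_load(&done), snap))
			return;
		/* Real code backs off here and eventually takes the slow path. */
	}

	/* Publish our ticket, unless a later caller already advanced 'done'. */
	for (;;) {
		int s = atomic_load(&done);

		if (ticket_ge(s, snap))
			break;			/* a newer caller beat us to it */
		if (atomic_compare_exchange_weak(&done, &s, snap))
			break;
	}
}

int main(void)
{
	expedited();
	expedited();
	printf("started=%d done=%d\n", atomic_load(&started), atomic_load(&done));
	return 0;
}

The unsigned ticket comparison keeps the scheme working after the counters wrap, which is why the patch compares snapshots with UINT_CMP_GE() rather than with plain relational operators.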
2073
1842/* 2074/*
1843 * Check to see if there is any immediate RCU-related work to be done 2075 * Check to see if there is any immediate RCU-related work to be done
1844 * by the current CPU, for the specified type of RCU, returning 1 if so. 2076 * by the current CPU, for the specified type of RCU, returning 1 if so.
@@ -1932,7 +2164,7 @@ static int rcu_cpu_has_callbacks(int cpu)
1932 /* RCU callbacks either ready or pending? */ 2164 /* RCU callbacks either ready or pending? */
1933 return per_cpu(rcu_sched_data, cpu).nxtlist || 2165 return per_cpu(rcu_sched_data, cpu).nxtlist ||
1934 per_cpu(rcu_bh_data, cpu).nxtlist || 2166 per_cpu(rcu_bh_data, cpu).nxtlist ||
1935 rcu_preempt_needs_cpu(cpu); 2167 rcu_preempt_cpu_has_callbacks(cpu);
1936} 2168}
1937 2169
1938static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; 2170static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
@@ -2027,9 +2259,10 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
2027 rdp->nxtlist = NULL; 2259 rdp->nxtlist = NULL;
2028 for (i = 0; i < RCU_NEXT_SIZE; i++) 2260 for (i = 0; i < RCU_NEXT_SIZE; i++)
2029 rdp->nxttail[i] = &rdp->nxtlist; 2261 rdp->nxttail[i] = &rdp->nxtlist;
2262 rdp->qlen_lazy = 0;
2030 rdp->qlen = 0; 2263 rdp->qlen = 0;
2031 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 2264 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
2032 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING); 2265 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
2033 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); 2266 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
2034 rdp->cpu = cpu; 2267 rdp->cpu = cpu;
2035 rdp->rsp = rsp; 2268 rdp->rsp = rsp;
@@ -2057,7 +2290,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
2057 rdp->qlen_last_fqs_check = 0; 2290 rdp->qlen_last_fqs_check = 0;
2058 rdp->n_force_qs_snap = rsp->n_force_qs; 2291 rdp->n_force_qs_snap = rsp->n_force_qs;
2059 rdp->blimit = blimit; 2292 rdp->blimit = blimit;
2060 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING; 2293 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
2061 atomic_set(&rdp->dynticks->dynticks, 2294 atomic_set(&rdp->dynticks->dynticks,
2062 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); 2295 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
2063 rcu_prepare_for_idle_init(cpu); 2296 rcu_prepare_for_idle_init(cpu);
@@ -2139,16 +2372,18 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2139 * touch any data without introducing corruption. We send the 2372 * touch any data without introducing corruption. We send the
2140 * dying CPU's callbacks to an arbitrarily chosen online CPU. 2373 * dying CPU's callbacks to an arbitrarily chosen online CPU.
2141 */ 2374 */
2142 rcu_send_cbs_to_online(&rcu_bh_state); 2375 rcu_cleanup_dying_cpu(&rcu_bh_state);
2143 rcu_send_cbs_to_online(&rcu_sched_state); 2376 rcu_cleanup_dying_cpu(&rcu_sched_state);
2144 rcu_preempt_send_cbs_to_online(); 2377 rcu_preempt_cleanup_dying_cpu();
2145 rcu_cleanup_after_idle(cpu); 2378 rcu_cleanup_after_idle(cpu);
2146 break; 2379 break;
2147 case CPU_DEAD: 2380 case CPU_DEAD:
2148 case CPU_DEAD_FROZEN: 2381 case CPU_DEAD_FROZEN:
2149 case CPU_UP_CANCELED: 2382 case CPU_UP_CANCELED:
2150 case CPU_UP_CANCELED_FROZEN: 2383 case CPU_UP_CANCELED_FROZEN:
2151 rcu_offline_cpu(cpu); 2384 rcu_cleanup_dead_cpu(cpu, &rcu_bh_state);
2385 rcu_cleanup_dead_cpu(cpu, &rcu_sched_state);
2386 rcu_preempt_cleanup_dead_cpu(cpu);
2152 break; 2387 break;
2153 default: 2388 default:
2154 break; 2389 break;
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index fddff92d6676..cdd1be0a4072 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -239,6 +239,12 @@ struct rcu_data {
239 bool preemptible; /* Preemptible RCU? */ 239 bool preemptible; /* Preemptible RCU? */
240 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ 240 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
241 unsigned long grpmask; /* Mask to apply to leaf qsmask. */ 241 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
242#ifdef CONFIG_RCU_CPU_STALL_INFO
243 unsigned long ticks_this_gp; /* The number of scheduling-clock */
244 /* ticks this CPU has handled */
245 /* during and after the last grace */
246 /* period it is aware of. */
247#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
242 248
243 /* 2) batch handling */ 249 /* 2) batch handling */
244 /* 250 /*
@@ -265,7 +271,8 @@ struct rcu_data {
265 */ 271 */
266 struct rcu_head *nxtlist; 272 struct rcu_head *nxtlist;
267 struct rcu_head **nxttail[RCU_NEXT_SIZE]; 273 struct rcu_head **nxttail[RCU_NEXT_SIZE];
268 long qlen; /* # of queued callbacks */ 274 long qlen_lazy; /* # of lazy queued callbacks */
275 long qlen; /* # of queued callbacks, incl lazy */
269 long qlen_last_fqs_check; 276 long qlen_last_fqs_check;
270 /* qlen at last check for QS forcing */ 277 /* qlen at last check for QS forcing */
271 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ 278 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
@@ -282,7 +289,6 @@ struct rcu_data {
282 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ 289 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
283 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ 290 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
284 unsigned long offline_fqs; /* Kicked due to being offline. */ 291 unsigned long offline_fqs; /* Kicked due to being offline. */
285 unsigned long resched_ipi; /* Sent a resched IPI. */
286 292
287 /* 5) __rcu_pending() statistics. */ 293 /* 5) __rcu_pending() statistics. */
288 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ 294 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
@@ -313,12 +319,6 @@ struct rcu_data {
313#else 319#else
314#define RCU_STALL_DELAY_DELTA 0 320#define RCU_STALL_DELAY_DELTA 0
315#endif 321#endif
316
317#define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \
318 RCU_STALL_DELAY_DELTA)
319 /* for rsp->jiffies_stall */
320#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30)
321 /* for rsp->jiffies_stall */
322#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ 322#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
323 /* to take at least one */ 323 /* to take at least one */
324 /* scheduling clock irq */ 324 /* scheduling clock irq */
@@ -438,8 +438,8 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
438static int rcu_preempt_offline_tasks(struct rcu_state *rsp, 438static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
439 struct rcu_node *rnp, 439 struct rcu_node *rnp,
440 struct rcu_data *rdp); 440 struct rcu_data *rdp);
441static void rcu_preempt_offline_cpu(int cpu);
442#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 441#endif /* #ifdef CONFIG_HOTPLUG_CPU */
442static void rcu_preempt_cleanup_dead_cpu(int cpu);
443static void rcu_preempt_check_callbacks(int cpu); 443static void rcu_preempt_check_callbacks(int cpu);
444static void rcu_preempt_process_callbacks(void); 444static void rcu_preempt_process_callbacks(void);
445void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 445void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
@@ -448,9 +448,9 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
448 bool wake); 448 bool wake);
449#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ 449#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
450static int rcu_preempt_pending(int cpu); 450static int rcu_preempt_pending(int cpu);
451static int rcu_preempt_needs_cpu(int cpu); 451static int rcu_preempt_cpu_has_callbacks(int cpu);
452static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 452static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
453static void rcu_preempt_send_cbs_to_online(void); 453static void rcu_preempt_cleanup_dying_cpu(void);
454static void __init __rcu_init_preempt(void); 454static void __init __rcu_init_preempt(void);
455static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 455static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
456static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 456static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
@@ -471,5 +471,10 @@ static void __cpuinit rcu_prepare_kthreads(int cpu);
471static void rcu_prepare_for_idle_init(int cpu); 471static void rcu_prepare_for_idle_init(int cpu);
472static void rcu_cleanup_after_idle(int cpu); 472static void rcu_cleanup_after_idle(int cpu);
473static void rcu_prepare_for_idle(int cpu); 473static void rcu_prepare_for_idle(int cpu);
474static void print_cpu_stall_info_begin(void);
475static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
476static void print_cpu_stall_info_end(void);
477static void zero_cpu_stall_ticks(struct rcu_data *rdp);
478static void increment_cpu_stall_ticks(void);
474 479
475#endif /* #ifndef RCU_TREE_NONCORE */ 480#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 8bb35d73e1f9..c023464816be 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,7 +25,6 @@
25 */ 25 */
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28#include <linux/stop_machine.h>
29 28
30#define RCU_KTHREAD_PRIO 1 29#define RCU_KTHREAD_PRIO 1
31 30
@@ -63,7 +62,10 @@ static void __init rcu_bootup_announce_oddness(void)
63 printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); 62 printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
64#endif 63#endif
65#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) 64#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
66 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); 65 printk(KERN_INFO "\tDump stacks of tasks blocking RCU-preempt GP.\n");
66#endif
67#if defined(CONFIG_RCU_CPU_STALL_INFO)
68 printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n");
67#endif 69#endif
68#if NUM_RCU_LVL_4 != 0 70#if NUM_RCU_LVL_4 != 0
69 printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n"); 71 printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n");
@@ -490,6 +492,31 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
490 492
491#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ 493#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
492 494
495#ifdef CONFIG_RCU_CPU_STALL_INFO
496
497static void rcu_print_task_stall_begin(struct rcu_node *rnp)
498{
499 printk(KERN_ERR "\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
500 rnp->level, rnp->grplo, rnp->grphi);
501}
502
503static void rcu_print_task_stall_end(void)
504{
505 printk(KERN_CONT "\n");
506}
507
508#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
509
510static void rcu_print_task_stall_begin(struct rcu_node *rnp)
511{
512}
513
514static void rcu_print_task_stall_end(void)
515{
516}
517
518#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
519
493/* 520/*
494 * Scan the current list of tasks blocked within RCU read-side critical 521 * Scan the current list of tasks blocked within RCU read-side critical
495 * sections, printing out the tid of each. 522 * sections, printing out the tid of each.
@@ -501,12 +528,14 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
501 528
502 if (!rcu_preempt_blocked_readers_cgp(rnp)) 529 if (!rcu_preempt_blocked_readers_cgp(rnp))
503 return 0; 530 return 0;
531 rcu_print_task_stall_begin(rnp);
504 t = list_entry(rnp->gp_tasks, 532 t = list_entry(rnp->gp_tasks,
505 struct task_struct, rcu_node_entry); 533 struct task_struct, rcu_node_entry);
506 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { 534 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
507 printk(" P%d", t->pid); 535 printk(KERN_CONT " P%d", t->pid);
508 ndetected++; 536 ndetected++;
509 } 537 }
538 rcu_print_task_stall_end();
510 return ndetected; 539 return ndetected;
511} 540}
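Because the stall report is now assembled from several printk() calls, every piece after the first uses KERN_CONT so the output stays on one console line; the begin/end helpers exist only so the CONFIG_RCU_CPU_STALL_INFO and plain formats can share this function. The general continuation pattern, as a small kernel-style sketch with an invented helper name:

#include <linux/kernel.h>

/* Sketch: building one console line from several printk() calls. */
static void print_blocked_pids(const int *pids, int n)
{
	int i;

	printk(KERN_ERR "blocked tasks:");		/* starts the line, sets the level */
	for (i = 0; i < n; i++)
		printk(KERN_CONT " P%d", pids[i]);	/* appends to the same line */
	printk(KERN_CONT "\n");				/* terminates the line */
}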
512 541
@@ -581,7 +610,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
581 * absolutely necessary, but this is a good performance/complexity 610 * absolutely necessary, but this is a good performance/complexity
582 * tradeoff. 611 * tradeoff.
583 */ 612 */
584 if (rcu_preempt_blocked_readers_cgp(rnp)) 613 if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0)
585 retval |= RCU_OFL_TASKS_NORM_GP; 614 retval |= RCU_OFL_TASKS_NORM_GP;
586 if (rcu_preempted_readers_exp(rnp)) 615 if (rcu_preempted_readers_exp(rnp))
587 retval |= RCU_OFL_TASKS_EXP_GP; 616 retval |= RCU_OFL_TASKS_EXP_GP;
@@ -618,16 +647,16 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
618 return retval; 647 return retval;
619} 648}
620 649
650#endif /* #ifdef CONFIG_HOTPLUG_CPU */
651
621/* 652/*
622 * Do CPU-offline processing for preemptible RCU. 653 * Do CPU-offline processing for preemptible RCU.
623 */ 654 */
624static void rcu_preempt_offline_cpu(int cpu) 655static void rcu_preempt_cleanup_dead_cpu(int cpu)
625{ 656{
626 __rcu_offline_cpu(cpu, &rcu_preempt_state); 657 rcu_cleanup_dead_cpu(cpu, &rcu_preempt_state);
627} 658}
628 659
629#endif /* #ifdef CONFIG_HOTPLUG_CPU */
630
631/* 660/*
632 * Check for a quiescent state from the current CPU. When a task blocks, 661 * Check for a quiescent state from the current CPU. When a task blocks,
633 * the task is recorded in the corresponding CPU's rcu_node structure, 662 * the task is recorded in the corresponding CPU's rcu_node structure,
@@ -671,10 +700,24 @@ static void rcu_preempt_do_callbacks(void)
671 */ 700 */
672void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 701void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
673{ 702{
674 __call_rcu(head, func, &rcu_preempt_state); 703 __call_rcu(head, func, &rcu_preempt_state, 0);
675} 704}
676EXPORT_SYMBOL_GPL(call_rcu); 705EXPORT_SYMBOL_GPL(call_rcu);
677 706
707/*
708 * Queue an RCU callback for lazy invocation after a grace period.
709 * This will likely be later named something like "call_rcu_lazy()",
710 * but this change will require some way of tagging the lazy RCU
711 * callbacks in the list of pending callbacks. Until then, this
712 * function may only be called from __kfree_rcu().
713 */
714void kfree_call_rcu(struct rcu_head *head,
715 void (*func)(struct rcu_head *rcu))
716{
717 __call_rcu(head, func, &rcu_preempt_state, 1);
718}
719EXPORT_SYMBOL_GPL(kfree_call_rcu);
720
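For illustration, a minimal caller-side sketch of how this lazy path is normally reached through the kfree_rcu() wrapper; the structure and function names below are assumptions invented for the example, not part of the patch:

    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct foo {                            /* hypothetical structure */
            int data;
            struct rcu_head rcu;
    };

    static void drop_foo(struct foo *fp)
    {
            /* kfree_rcu() records the rcu_head offset and funnels into
             * kfree_call_rcu(), so the free is queued as a lazy callback. */
            kfree_rcu(fp, rcu);
    }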
678/** 721/**
679 * synchronize_rcu - wait until a grace period has elapsed. 722 * synchronize_rcu - wait until a grace period has elapsed.
680 * 723 *
@@ -688,6 +731,10 @@ EXPORT_SYMBOL_GPL(call_rcu);
688 */ 731 */
689void synchronize_rcu(void) 732void synchronize_rcu(void)
690{ 733{
734 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
735 !lock_is_held(&rcu_lock_map) &&
736 !lock_is_held(&rcu_sched_lock_map),
737 "Illegal synchronize_rcu() in RCU read-side critical section");
691 if (!rcu_scheduler_active) 738 if (!rcu_scheduler_active)
692 return; 739 return;
693 wait_rcu_gp(call_rcu); 740 wait_rcu_gp(call_rcu);
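For illustration, the self-deadlock pattern that this new lockdep assertion is meant to flag; a sketch, not code from the patch:

    #include <linux/rcupdate.h>

    static void broken_updater(void)        /* hypothetical function */
    {
            rcu_read_lock();
            synchronize_rcu();              /* deadlock: the grace period cannot
                                             * end while this reader is running */
            rcu_read_unlock();
    }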
@@ -788,10 +835,22 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
788 rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ 835 rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */
789} 836}
790 837
791/* 838/**
792 * Wait for an rcu-preempt grace period, but expedite it. The basic idea 839 * synchronize_rcu_expedited - Brute-force RCU grace period
793 * is to invoke synchronize_sched_expedited() to push all the tasks to 840 *
794 * the ->blkd_tasks lists and wait for this list to drain. 841 * Wait for an RCU-preempt grace period, but expedite it. The basic
842 * idea is to invoke synchronize_sched_expedited() to push all the tasks to
843 * the ->blkd_tasks lists and wait for this list to drain. This consumes
844 * significant time on all CPUs and is unfriendly to real-time workloads,
 845 * and is thus not recommended for any sort of common-case code.
846 * In fact, if you are using synchronize_rcu_expedited() in a loop,
 847 * please restructure your code to batch your updates, and then use a
848 * single synchronize_rcu() instead.
849 *
850 * Note that it is illegal to call this function while holding any lock
851 * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
852 * to call this function from a CPU-hotplug notifier. Failing to observe
 853 * these restrictions will result in deadlock.
795 */ 854 */
796void synchronize_rcu_expedited(void) 855void synchronize_rcu_expedited(void)
797{ 856{
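A sketch of the batching advice in the comment above; struct item and publish_new_version() are illustrative assumptions, not part of this patch:

    #include <linux/rcupdate.h>

    struct item {                           /* hypothetical */
            struct item __rcu *next;
            int value;
    };

    static void publish_new_version(struct item *it);  /* hypothetical updater */

    static void update_all(struct item *items, int n)
    {
            int i;

            /* Instead of calling synchronize_rcu_expedited() once per item,
             * publish every update first and wait for a single grace period. */
            for (i = 0; i < n; i++)
                    publish_new_version(&items[i]);
            synchronize_rcu();
    }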
@@ -869,9 +928,9 @@ static int rcu_preempt_pending(int cpu)
869} 928}
870 929
871/* 930/*
872 * Does preemptible RCU need the CPU to stay out of dynticks mode? 931 * Does preemptible RCU have callbacks on this CPU?
873 */ 932 */
874static int rcu_preempt_needs_cpu(int cpu) 933static int rcu_preempt_cpu_has_callbacks(int cpu)
875{ 934{
876 return !!per_cpu(rcu_preempt_data, cpu).nxtlist; 935 return !!per_cpu(rcu_preempt_data, cpu).nxtlist;
877} 936}
@@ -894,11 +953,12 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
894} 953}
895 954
896/* 955/*
897 * Move preemptible RCU's callbacks from dying CPU to other online CPU. 956 * Move preemptible RCU's callbacks from dying CPU to other online CPU
957 * and record a quiescent state.
898 */ 958 */
899static void rcu_preempt_send_cbs_to_online(void) 959static void rcu_preempt_cleanup_dying_cpu(void)
900{ 960{
901 rcu_send_cbs_to_online(&rcu_preempt_state); 961 rcu_cleanup_dying_cpu(&rcu_preempt_state);
902} 962}
903 963
904/* 964/*
@@ -1034,16 +1094,16 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
1034 return 0; 1094 return 0;
1035} 1095}
1036 1096
1097#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1098
1037/* 1099/*
1038 * Because preemptible RCU does not exist, it never needs CPU-offline 1100 * Because preemptible RCU does not exist, it never needs CPU-offline
1039 * processing. 1101 * processing.
1040 */ 1102 */
1041static void rcu_preempt_offline_cpu(int cpu) 1103static void rcu_preempt_cleanup_dead_cpu(int cpu)
1042{ 1104{
1043} 1105}
1044 1106
1045#endif /* #ifdef CONFIG_HOTPLUG_CPU */
1046
1047/* 1107/*
1048 * Because preemptible RCU does not exist, it never has any callbacks 1108 * Because preemptible RCU does not exist, it never has any callbacks
1049 * to check. 1109 * to check.
@@ -1061,6 +1121,22 @@ static void rcu_preempt_process_callbacks(void)
1061} 1121}
1062 1122
1063/* 1123/*
1124 * Queue an RCU callback for lazy invocation after a grace period.
1125 * This will likely be later named something like "call_rcu_lazy()",
1126 * but this change will require some way of tagging the lazy RCU
1127 * callbacks in the list of pending callbacks. Until then, this
1128 * function may only be called from __kfree_rcu().
1129 *
1130 * Because there is no preemptible RCU, we use RCU-sched instead.
1131 */
1132void kfree_call_rcu(struct rcu_head *head,
1133 void (*func)(struct rcu_head *rcu))
1134{
1135 __call_rcu(head, func, &rcu_sched_state, 1);
1136}
1137EXPORT_SYMBOL_GPL(kfree_call_rcu);
1138
1139/*
1064 * Wait for an rcu-preempt grace period, but make it happen quickly. 1140 * Wait for an rcu-preempt grace period, but make it happen quickly.
1065 * But because preemptible RCU does not exist, map to rcu-sched. 1141 * But because preemptible RCU does not exist, map to rcu-sched.
1066 */ 1142 */
@@ -1093,9 +1169,9 @@ static int rcu_preempt_pending(int cpu)
1093} 1169}
1094 1170
1095/* 1171/*
1096 * Because preemptible RCU does not exist, it never needs any CPU. 1172 * Because preemptible RCU does not exist, it never has callbacks.
1097 */ 1173 */
1098static int rcu_preempt_needs_cpu(int cpu) 1174static int rcu_preempt_cpu_has_callbacks(int cpu)
1099{ 1175{
1100 return 0; 1176 return 0;
1101} 1177}
@@ -1119,9 +1195,9 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
1119} 1195}
1120 1196
1121/* 1197/*
1122 * Because there is no preemptible RCU, there are no callbacks to move. 1198 * Because there is no preemptible RCU, there is no cleanup to do.
1123 */ 1199 */
1124static void rcu_preempt_send_cbs_to_online(void) 1200static void rcu_preempt_cleanup_dying_cpu(void)
1125{ 1201{
1126} 1202}
1127 1203
@@ -1823,132 +1899,6 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
1823 1899
1824#endif /* #else #ifdef CONFIG_RCU_BOOST */ 1900#endif /* #else #ifdef CONFIG_RCU_BOOST */
1825 1901
1826#ifndef CONFIG_SMP
1827
1828void synchronize_sched_expedited(void)
1829{
1830 cond_resched();
1831}
1832EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1833
1834#else /* #ifndef CONFIG_SMP */
1835
1836static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
1837static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
1838
1839static int synchronize_sched_expedited_cpu_stop(void *data)
1840{
1841 /*
1842 * There must be a full memory barrier on each affected CPU
1843 * between the time that try_stop_cpus() is called and the
1844 * time that it returns.
1845 *
1846 * In the current initial implementation of cpu_stop, the
1847 * above condition is already met when the control reaches
1848 * this point and the following smp_mb() is not strictly
1849 * necessary. Do smp_mb() anyway for documentation and
1850 * robustness against future implementation changes.
1851 */
1852 smp_mb(); /* See above comment block. */
1853 return 0;
1854}
1855
1856/*
1857 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
1858 * approach to force grace period to end quickly. This consumes
1859 * significant time on all CPUs, and is thus not recommended for
1860 * any sort of common-case code.
1861 *
1862 * Note that it is illegal to call this function while holding any
1863 * lock that is acquired by a CPU-hotplug notifier. Failing to
1864 * observe this restriction will result in deadlock.
1865 *
1866 * This implementation can be thought of as an application of ticket
1867 * locking to RCU, with sync_sched_expedited_started and
1868 * sync_sched_expedited_done taking on the roles of the halves
1869 * of the ticket-lock word. Each task atomically increments
1870 * sync_sched_expedited_started upon entry, snapshotting the old value,
1871 * then attempts to stop all the CPUs. If this succeeds, then each
1872 * CPU will have executed a context switch, resulting in an RCU-sched
1873 * grace period. We are then done, so we use atomic_cmpxchg() to
1874 * update sync_sched_expedited_done to match our snapshot -- but
1875 * only if someone else has not already advanced past our snapshot.
1876 *
1877 * On the other hand, if try_stop_cpus() fails, we check the value
1878 * of sync_sched_expedited_done. If it has advanced past our
1879 * initial snapshot, then someone else must have forced a grace period
1880 * some time after we took our snapshot. In this case, our work is
1881 * done for us, and we can simply return. Otherwise, we try again,
1882 * but keep our initial snapshot for purposes of checking for someone
1883 * doing our work for us.
1884 *
1885 * If we fail too many times in a row, we fall back to synchronize_sched().
1886 */
1887void synchronize_sched_expedited(void)
1888{
1889 int firstsnap, s, snap, trycount = 0;
1890
1891 /* Note that atomic_inc_return() implies full memory barrier. */
1892 firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
1893 get_online_cpus();
1894
1895 /*
1896 * Each pass through the following loop attempts to force a
1897 * context switch on each CPU.
1898 */
1899 while (try_stop_cpus(cpu_online_mask,
1900 synchronize_sched_expedited_cpu_stop,
1901 NULL) == -EAGAIN) {
1902 put_online_cpus();
1903
1904 /* No joy, try again later. Or just synchronize_sched(). */
1905 if (trycount++ < 10)
1906 udelay(trycount * num_online_cpus());
1907 else {
1908 synchronize_sched();
1909 return;
1910 }
1911
1912 /* Check to see if someone else did our work for us. */
1913 s = atomic_read(&sync_sched_expedited_done);
1914 if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
1915 smp_mb(); /* ensure test happens before caller kfree */
1916 return;
1917 }
1918
1919 /*
1920 * Refetching sync_sched_expedited_started allows later
1921 * callers to piggyback on our grace period. We subtract
1922 * 1 to get the same token that the last incrementer got.
1923 * We retry after they started, so our grace period works
1924 * for them, and they started after our first try, so their
1925 * grace period works for us.
1926 */
1927 get_online_cpus();
1928 snap = atomic_read(&sync_sched_expedited_started);
1929 smp_mb(); /* ensure read is before try_stop_cpus(). */
1930 }
1931
1932 /*
1933 * Everyone up to our most recent fetch is covered by our grace
1934 * period. Update the counter, but only if our work is still
1935 * relevant -- which it won't be if someone who started later
1936 * than we did beat us to the punch.
1937 */
1938 do {
1939 s = atomic_read(&sync_sched_expedited_done);
1940 if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
1941 smp_mb(); /* ensure test happens before caller kfree */
1942 break;
1943 }
1944 } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
1945
1946 put_online_cpus();
1947}
1948EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1949
1950#endif /* #else #ifndef CONFIG_SMP */
1951
1952#if !defined(CONFIG_RCU_FAST_NO_HZ) 1902#if !defined(CONFIG_RCU_FAST_NO_HZ)
1953 1903
1954/* 1904/*
@@ -1981,7 +1931,7 @@ static void rcu_cleanup_after_idle(int cpu)
1981} 1931}
1982 1932
1983/* 1933/*
1984 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=y, 1934 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
1985 * is nothing. 1935 * is nothing.
1986 */ 1936 */
1987static void rcu_prepare_for_idle(int cpu) 1937static void rcu_prepare_for_idle(int cpu)
@@ -2015,6 +1965,9 @@ static void rcu_prepare_for_idle(int cpu)
2015 * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your 1965 * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
2016 * system. And if you are -that- concerned about energy efficiency, 1966 * system. And if you are -that- concerned about energy efficiency,
2017 * just power the system down and be done with it! 1967 * just power the system down and be done with it!
1968 * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is
1969 * permitted to sleep in dyntick-idle mode with only lazy RCU
1970 * callbacks pending. Setting this too high can OOM your system.
2018 * 1971 *
2019 * The values below work well in practice. If future workloads require 1972 * The values below work well in practice. If future workloads require
2020 * adjustment, they can be converted into kernel config parameters, though 1973 * adjustment, they can be converted into kernel config parameters, though
@@ -2023,11 +1976,13 @@ static void rcu_prepare_for_idle(int cpu)
2023#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ 1976#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */
2024#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ 1977#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */
2025#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ 1978#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */
1979#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
2026 1980
2027static DEFINE_PER_CPU(int, rcu_dyntick_drain); 1981static DEFINE_PER_CPU(int, rcu_dyntick_drain);
2028static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); 1982static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
2029static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); 1983static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer);
2030static ktime_t rcu_idle_gp_wait; 1984static ktime_t rcu_idle_gp_wait; /* If some non-lazy callbacks. */
1985static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */
2031 1986
2032/* 1987/*
2033 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no 1988 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
@@ -2048,6 +2003,48 @@ int rcu_needs_cpu(int cpu)
2048} 2003}
2049 2004
2050/* 2005/*
2006 * Does the specified flavor of RCU have non-lazy callbacks pending on
2007 * the specified CPU? Both RCU flavor and CPU are specified by the
2008 * rcu_data structure.
2009 */
2010static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp)
2011{
2012 return rdp->qlen != rdp->qlen_lazy;
2013}
2014
2015#ifdef CONFIG_TREE_PREEMPT_RCU
2016
2017/*
2018 * Are there non-lazy RCU-preempt callbacks? (There cannot be if there
2019 * is no RCU-preempt in the kernel.)
2020 */
2021static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu)
2022{
2023 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
2024
2025 return __rcu_cpu_has_nonlazy_callbacks(rdp);
2026}
2027
2028#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
2029
2030static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu)
2031{
2032 return 0;
2033}
2034
2035#endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */
2036
2037/*
2038 * Does any flavor of RCU have non-lazy callbacks on the specified CPU?
2039 */
2040static bool rcu_cpu_has_nonlazy_callbacks(int cpu)
2041{
2042 return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) ||
2043 __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) ||
2044 rcu_preempt_cpu_has_nonlazy_callbacks(cpu);
2045}
2046
2047/*
2051 * Timer handler used to force CPU to start pushing its remaining RCU 2048 * Timer handler used to force CPU to start pushing its remaining RCU
2052 * callbacks in the case where it entered dyntick-idle mode with callbacks 2049 * callbacks in the case where it entered dyntick-idle mode with callbacks
2053 * pending. The handler doesn't really need to do anything because the 2050 * pending. The handler doesn't really need to do anything because the
@@ -2074,6 +2071,8 @@ static void rcu_prepare_for_idle_init(int cpu)
2074 unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY); 2071 unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY);
2075 2072
2076 rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000); 2073 rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000);
2074 upj = jiffies_to_usecs(RCU_IDLE_LAZY_GP_DELAY);
2075 rcu_idle_lazy_gp_wait = ns_to_ktime(upj * (u64)1000);
2077 firsttime = 0; 2076 firsttime = 0;
2078 } 2077 }
2079} 2078}
@@ -2109,10 +2108,6 @@ static void rcu_cleanup_after_idle(int cpu)
2109 */ 2108 */
2110static void rcu_prepare_for_idle(int cpu) 2109static void rcu_prepare_for_idle(int cpu)
2111{ 2110{
2112 unsigned long flags;
2113
2114 local_irq_save(flags);
2115
2116 /* 2111 /*
2117 * If there are no callbacks on this CPU, enter dyntick-idle mode. 2112 * If there are no callbacks on this CPU, enter dyntick-idle mode.
2118 * Also reset state to avoid prejudicing later attempts. 2113 * Also reset state to avoid prejudicing later attempts.
@@ -2120,7 +2115,6 @@ static void rcu_prepare_for_idle(int cpu)
2120 if (!rcu_cpu_has_callbacks(cpu)) { 2115 if (!rcu_cpu_has_callbacks(cpu)) {
2121 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 2116 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
2122 per_cpu(rcu_dyntick_drain, cpu) = 0; 2117 per_cpu(rcu_dyntick_drain, cpu) = 0;
2123 local_irq_restore(flags);
2124 trace_rcu_prep_idle("No callbacks"); 2118 trace_rcu_prep_idle("No callbacks");
2125 return; 2119 return;
2126 } 2120 }
@@ -2130,7 +2124,6 @@ static void rcu_prepare_for_idle(int cpu)
2130 * refrained from disabling the scheduling-clock tick. 2124 * refrained from disabling the scheduling-clock tick.
2131 */ 2125 */
2132 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { 2126 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) {
2133 local_irq_restore(flags);
2134 trace_rcu_prep_idle("In holdoff"); 2127 trace_rcu_prep_idle("In holdoff");
2135 return; 2128 return;
2136 } 2129 }
@@ -2140,18 +2133,22 @@ static void rcu_prepare_for_idle(int cpu)
2140 /* First time through, initialize the counter. */ 2133 /* First time through, initialize the counter. */
2141 per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; 2134 per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES;
2142 } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && 2135 } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES &&
2143 !rcu_pending(cpu)) { 2136 !rcu_pending(cpu) &&
2137 !local_softirq_pending()) {
2144 /* Can we go dyntick-idle despite still having callbacks? */ 2138 /* Can we go dyntick-idle despite still having callbacks? */
2145 trace_rcu_prep_idle("Dyntick with callbacks"); 2139 trace_rcu_prep_idle("Dyntick with callbacks");
2146 per_cpu(rcu_dyntick_drain, cpu) = 0; 2140 per_cpu(rcu_dyntick_drain, cpu) = 0;
2147 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 2141 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
2148 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), 2142 if (rcu_cpu_has_nonlazy_callbacks(cpu))
2149 rcu_idle_gp_wait, HRTIMER_MODE_REL); 2143 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu),
2144 rcu_idle_gp_wait, HRTIMER_MODE_REL);
2145 else
2146 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu),
2147 rcu_idle_lazy_gp_wait, HRTIMER_MODE_REL);
2150 return; /* Nothing more to do immediately. */ 2148 return; /* Nothing more to do immediately. */
2151 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2149 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
2152 /* We have hit the limit, so time to give up. */ 2150 /* We have hit the limit, so time to give up. */
2153 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; 2151 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
2154 local_irq_restore(flags);
2155 trace_rcu_prep_idle("Begin holdoff"); 2152 trace_rcu_prep_idle("Begin holdoff");
2156 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ 2153 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
2157 return; 2154 return;
@@ -2163,23 +2160,17 @@ static void rcu_prepare_for_idle(int cpu)
2163 */ 2160 */
2164#ifdef CONFIG_TREE_PREEMPT_RCU 2161#ifdef CONFIG_TREE_PREEMPT_RCU
2165 if (per_cpu(rcu_preempt_data, cpu).nxtlist) { 2162 if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
2166 local_irq_restore(flags);
2167 rcu_preempt_qs(cpu); 2163 rcu_preempt_qs(cpu);
2168 force_quiescent_state(&rcu_preempt_state, 0); 2164 force_quiescent_state(&rcu_preempt_state, 0);
2169 local_irq_save(flags);
2170 } 2165 }
2171#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 2166#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
2172 if (per_cpu(rcu_sched_data, cpu).nxtlist) { 2167 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
2173 local_irq_restore(flags);
2174 rcu_sched_qs(cpu); 2168 rcu_sched_qs(cpu);
2175 force_quiescent_state(&rcu_sched_state, 0); 2169 force_quiescent_state(&rcu_sched_state, 0);
2176 local_irq_save(flags);
2177 } 2170 }
2178 if (per_cpu(rcu_bh_data, cpu).nxtlist) { 2171 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
2179 local_irq_restore(flags);
2180 rcu_bh_qs(cpu); 2172 rcu_bh_qs(cpu);
2181 force_quiescent_state(&rcu_bh_state, 0); 2173 force_quiescent_state(&rcu_bh_state, 0);
2182 local_irq_save(flags);
2183 } 2174 }
2184 2175
2185 /* 2176 /*
@@ -2187,13 +2178,124 @@ static void rcu_prepare_for_idle(int cpu)
2187 * So try forcing the callbacks through the grace period. 2178 * So try forcing the callbacks through the grace period.
2188 */ 2179 */
2189 if (rcu_cpu_has_callbacks(cpu)) { 2180 if (rcu_cpu_has_callbacks(cpu)) {
2190 local_irq_restore(flags);
2191 trace_rcu_prep_idle("More callbacks"); 2181 trace_rcu_prep_idle("More callbacks");
2192 invoke_rcu_core(); 2182 invoke_rcu_core();
2193 } else { 2183 } else
2194 local_irq_restore(flags);
2195 trace_rcu_prep_idle("Callbacks drained"); 2184 trace_rcu_prep_idle("Callbacks drained");
2196 }
2197} 2185}
2198 2186
2199#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 2187#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
2188
2189#ifdef CONFIG_RCU_CPU_STALL_INFO
2190
2191#ifdef CONFIG_RCU_FAST_NO_HZ
2192
2193static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2194{
2195 struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu);
2196
2197 sprintf(cp, "drain=%d %c timer=%lld",
2198 per_cpu(rcu_dyntick_drain, cpu),
2199 per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.',
2200 hrtimer_active(hrtp)
2201 ? ktime_to_us(hrtimer_get_remaining(hrtp))
2202 : -1);
2203}
2204
2205#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
2206
2207static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
2208{
2209}
2210
2211#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
2212
2213/* Initiate the stall-info list. */
2214static void print_cpu_stall_info_begin(void)
2215{
2216 printk(KERN_CONT "\n");
2217}
2218
2219/*
2220 * Print out diagnostic information for the specified stalled CPU.
2221 *
2222 * If the specified CPU is aware of the current RCU grace period
2223 * (flavor specified by rsp), then print the number of scheduling
2224 * clock interrupts the CPU has taken during the time that it has
2225 * been aware. Otherwise, print the number of RCU grace periods
2226 * that this CPU is ignorant of, for example, "1" if the CPU was
2227 * aware of the previous grace period.
2228 *
2229 * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info.
2230 */
2231static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
2232{
2233 char fast_no_hz[72];
2234 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2235 struct rcu_dynticks *rdtp = rdp->dynticks;
2236 char *ticks_title;
2237 unsigned long ticks_value;
2238
2239 if (rsp->gpnum == rdp->gpnum) {
2240 ticks_title = "ticks this GP";
2241 ticks_value = rdp->ticks_this_gp;
2242 } else {
2243 ticks_title = "GPs behind";
2244 ticks_value = rsp->gpnum - rdp->gpnum;
2245 }
2246 print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
2247 printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n",
2248 cpu, ticks_value, ticks_title,
2249 atomic_read(&rdtp->dynticks) & 0xfff,
2250 rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
2251 fast_no_hz);
2252}
2253
2254/* Terminate the stall-info list. */
2255static void print_cpu_stall_info_end(void)
2256{
2257 printk(KERN_ERR "\t");
2258}
2259
2260/* Zero ->ticks_this_gp for all flavors of RCU. */
2261static void zero_cpu_stall_ticks(struct rcu_data *rdp)
2262{
2263 rdp->ticks_this_gp = 0;
2264}
2265
2266/* Increment ->ticks_this_gp for all flavors of RCU. */
2267static void increment_cpu_stall_ticks(void)
2268{
2269 __get_cpu_var(rcu_sched_data).ticks_this_gp++;
2270 __get_cpu_var(rcu_bh_data).ticks_this_gp++;
2271#ifdef CONFIG_TREE_PREEMPT_RCU
2272 __get_cpu_var(rcu_preempt_data).ticks_this_gp++;
2273#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
2274}
2275
2276#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
2277
2278static void print_cpu_stall_info_begin(void)
2279{
2280 printk(KERN_CONT " {");
2281}
2282
2283static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
2284{
2285 printk(KERN_CONT " %d", cpu);
2286}
2287
2288static void print_cpu_stall_info_end(void)
2289{
2290 printk(KERN_CONT "} ");
2291}
2292
2293static void zero_cpu_stall_ticks(struct rcu_data *rdp)
2294{
2295}
2296
2297static void increment_cpu_stall_ticks(void)
2298{
2299}
2300
2301#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 654cfe67f0d1..ed459edeff43 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -72,9 +72,9 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
72 rdp->dynticks->dynticks_nesting, 72 rdp->dynticks->dynticks_nesting,
73 rdp->dynticks->dynticks_nmi_nesting, 73 rdp->dynticks->dynticks_nmi_nesting,
74 rdp->dynticks_fqs); 74 rdp->dynticks_fqs);
75 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); 75 seq_printf(m, " of=%lu", rdp->offline_fqs);
76 seq_printf(m, " ql=%ld qs=%c%c%c%c", 76 seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c",
77 rdp->qlen, 77 rdp->qlen_lazy, rdp->qlen,
78 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != 78 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
79 rdp->nxttail[RCU_NEXT_TAIL]], 79 rdp->nxttail[RCU_NEXT_TAIL]],
80 ".R"[rdp->nxttail[RCU_WAIT_TAIL] != 80 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
@@ -144,8 +144,8 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
144 rdp->dynticks->dynticks_nesting, 144 rdp->dynticks->dynticks_nesting,
145 rdp->dynticks->dynticks_nmi_nesting, 145 rdp->dynticks->dynticks_nmi_nesting,
146 rdp->dynticks_fqs); 146 rdp->dynticks_fqs);
147 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); 147 seq_printf(m, ",%lu", rdp->offline_fqs);
148 seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, 148 seq_printf(m, ",%ld,%ld,\"%c%c%c%c\"", rdp->qlen_lazy, rdp->qlen,
149 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != 149 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
150 rdp->nxttail[RCU_NEXT_TAIL]], 150 rdp->nxttail[RCU_NEXT_TAIL]],
151 ".R"[rdp->nxttail[RCU_WAIT_TAIL] != 151 ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
@@ -168,7 +168,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
168{ 168{
169 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); 169 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\",");
170 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); 170 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
171 seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); 171 seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\"");
172#ifdef CONFIG_RCU_BOOST 172#ifdef CONFIG_RCU_BOOST
173 seq_puts(m, "\"kt\",\"ktl\""); 173 seq_puts(m, "\"kt\",\"ktl\"");
174#endif /* #ifdef CONFIG_RCU_BOOST */ 174#endif /* #ifdef CONFIG_RCU_BOOST */
diff --git a/kernel/relay.c b/kernel/relay.c
index 4335e1d7ee2d..ab56a1764d4d 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -164,10 +164,14 @@ depopulate:
164 */ 164 */
165static struct rchan_buf *relay_create_buf(struct rchan *chan) 165static struct rchan_buf *relay_create_buf(struct rchan *chan)
166{ 166{
167 struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); 167 struct rchan_buf *buf;
168 if (!buf) 168
169 if (chan->n_subbufs > UINT_MAX / sizeof(size_t *))
169 return NULL; 170 return NULL;
170 171
172 buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL);
173 if (!buf)
174 return NULL;
171 buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL); 175 buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL);
172 if (!buf->padding) 176 if (!buf->padding)
173 goto free_buf; 177 goto free_buf;
@@ -574,6 +578,8 @@ struct rchan *relay_open(const char *base_filename,
574 578
575 if (!(subbuf_size && n_subbufs)) 579 if (!(subbuf_size && n_subbufs))
576 return NULL; 580 return NULL;
581 if (subbuf_size > UINT_MAX / n_subbufs)
582 return NULL;
577 583
578 chan = kzalloc(sizeof(struct rchan), GFP_KERNEL); 584 chan = kzalloc(sizeof(struct rchan), GFP_KERNEL);
579 if (!chan) 585 if (!chan)
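A userspace-style illustration of the unsigned wraparound these new checks guard against; the values are chosen purely for demonstration:

    #include <stdio.h>
    #include <limits.h>

    int main(void)
    {
            unsigned int subbuf_size = 0x40000000u;              /* 1 GiB */
            unsigned int n_subbufs   = 8;
            unsigned int total       = subbuf_size * n_subbufs;  /* wraps to 0 */

            printf("naive total = %u bytes\n", total);
            if (subbuf_size > UINT_MAX / n_subbufs)
                    printf("guard rejects the request before any allocation\n");
            return 0;
    }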
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 6d269cce7aa1..d508363858b3 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -66,6 +66,31 @@ done:
66 return ret; 66 return ret;
67} 67}
68 68
69int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
70 struct res_counter **limit_fail_at)
71{
72 int ret, r;
73 unsigned long flags;
74 struct res_counter *c;
75
76 r = ret = 0;
77 *limit_fail_at = NULL;
78 local_irq_save(flags);
79 for (c = counter; c != NULL; c = c->parent) {
80 spin_lock(&c->lock);
81 r = res_counter_charge_locked(c, val);
82 if (r)
83 c->usage += val;
84 spin_unlock(&c->lock);
85 if (r < 0 && ret == 0) {
86 *limit_fail_at = c;
87 ret = r;
88 }
89 }
90 local_irq_restore(flags);
91
92 return ret;
93}
69void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val) 94void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
70{ 95{
71 if (WARN_ON(counter->usage < val)) 96 if (WARN_ON(counter->usage < val))
diff --git a/kernel/resource.c b/kernel/resource.c
index 7640b3a947d0..7e8ea66a8c01 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -749,6 +749,7 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
749 write_unlock(&resource_lock); 749 write_unlock(&resource_lock);
750 return result; 750 return result;
751} 751}
752EXPORT_SYMBOL(adjust_resource);
752 753
753static void __init __reserve_region_with_split(struct resource *root, 754static void __init __reserve_region_with_split(struct resource *root,
754 resource_size_t start, resource_size_t end, 755 resource_size_t start, resource_size_t end,
@@ -792,8 +793,6 @@ void __init reserve_region_with_split(struct resource *root,
792 write_unlock(&resource_lock); 793 write_unlock(&resource_lock);
793} 794}
794 795
795EXPORT_SYMBOL(adjust_resource);
796
797/** 796/**
798 * resource_alignment - calculate resource's alignment 797 * resource_alignment - calculate resource's alignment
799 * @res: resource pointer 798 * @res: resource pointer
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index b152f74f02de..6850f53e02d8 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -10,7 +10,6 @@
10#include <linux/export.h> 10#include <linux/export.h>
11#include <linux/rwsem.h> 11#include <linux/rwsem.h>
12 12
13#include <asm/system.h>
14#include <linux/atomic.h> 13#include <linux/atomic.h>
15 14
16/* 15/*
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index e8a1f83ee0e7..0984a21076a3 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -195,20 +195,20 @@ __setup("noautogroup", setup_autogroup);
195 195
196#ifdef CONFIG_PROC_FS 196#ifdef CONFIG_PROC_FS
197 197
198int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) 198int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
199{ 199{
200 static unsigned long next = INITIAL_JIFFIES; 200 static unsigned long next = INITIAL_JIFFIES;
201 struct autogroup *ag; 201 struct autogroup *ag;
202 int err; 202 int err;
203 203
204 if (*nice < -20 || *nice > 19) 204 if (nice < -20 || nice > 19)
205 return -EINVAL; 205 return -EINVAL;
206 206
207 err = security_task_setnice(current, *nice); 207 err = security_task_setnice(current, nice);
208 if (err) 208 if (err)
209 return err; 209 return err;
210 210
211 if (*nice < 0 && !can_nice(current, *nice)) 211 if (nice < 0 && !can_nice(current, nice))
212 return -EPERM; 212 return -EPERM;
213 213
214 /* this is a heavy operation taking global locks.. */ 214 /* this is a heavy operation taking global locks.. */
@@ -219,9 +219,9 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)
219 ag = autogroup_task_get(p); 219 ag = autogroup_task_get(p);
220 220
221 down_write(&ag->lock); 221 down_write(&ag->lock);
222 err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]); 222 err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]);
223 if (!err) 223 if (!err)
224 ag->nice = *nice; 224 ag->nice = nice;
225 up_write(&ag->lock); 225 up_write(&ag->lock);
226 226
227 autogroup_kref_put(ag); 227 autogroup_kref_put(ag);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index df00cb09263e..e3ed0ecee7c7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -71,9 +71,12 @@
71#include <linux/ftrace.h> 71#include <linux/ftrace.h>
72#include <linux/slab.h> 72#include <linux/slab.h>
73#include <linux/init_task.h> 73#include <linux/init_task.h>
74#include <linux/binfmts.h>
74 75
76#include <asm/switch_to.h>
75#include <asm/tlb.h> 77#include <asm/tlb.h>
76#include <asm/irq_regs.h> 78#include <asm/irq_regs.h>
79#include <asm/mutex.h>
77#ifdef CONFIG_PARAVIRT 80#ifdef CONFIG_PARAVIRT
78#include <asm/paravirt.h> 81#include <asm/paravirt.h>
79#endif 82#endif
@@ -161,13 +164,13 @@ static int sched_feat_show(struct seq_file *m, void *v)
161 164
162#ifdef HAVE_JUMP_LABEL 165#ifdef HAVE_JUMP_LABEL
163 166
164#define jump_label_key__true jump_label_key_enabled 167#define jump_label_key__true STATIC_KEY_INIT_TRUE
165#define jump_label_key__false jump_label_key_disabled 168#define jump_label_key__false STATIC_KEY_INIT_FALSE
166 169
167#define SCHED_FEAT(name, enabled) \ 170#define SCHED_FEAT(name, enabled) \
168 jump_label_key__##enabled , 171 jump_label_key__##enabled ,
169 172
170struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { 173struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
171#include "features.h" 174#include "features.h"
172}; 175};
173 176
@@ -175,14 +178,14 @@ struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = {
175 178
176static void sched_feat_disable(int i) 179static void sched_feat_disable(int i)
177{ 180{
178 if (jump_label_enabled(&sched_feat_keys[i])) 181 if (static_key_enabled(&sched_feat_keys[i]))
179 jump_label_dec(&sched_feat_keys[i]); 182 static_key_slow_dec(&sched_feat_keys[i]);
180} 183}
181 184
182static void sched_feat_enable(int i) 185static void sched_feat_enable(int i)
183{ 186{
184 if (!jump_label_enabled(&sched_feat_keys[i])) 187 if (!static_key_enabled(&sched_feat_keys[i]))
185 jump_label_inc(&sched_feat_keys[i]); 188 static_key_slow_inc(&sched_feat_keys[i]);
186} 189}
187#else 190#else
188static void sched_feat_disable(int i) { }; 191static void sched_feat_disable(int i) { };
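A minimal sketch of the static_key interface this hunk converts the scheduler feature bits to; the key and surrounding functions are illustrative, and do_rare_work() is a hypothetical slow path:

    #include <linux/jump_label.h>

    static struct static_key example_key = STATIC_KEY_INIT_FALSE;

    static void do_rare_work(void);                 /* hypothetical */

    static void hot_path(void)
    {
            if (static_key_false(&example_key))     /* patched no-op until enabled */
                    do_rare_work();
    }

    static void enable_feature(void)
    {
            static_key_slow_inc(&example_key);      /* rewrites the branch sites */
    }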
@@ -723,9 +726,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
723 p->sched_class->dequeue_task(rq, p, flags); 726 p->sched_class->dequeue_task(rq, p, flags);
724} 727}
725 728
726/*
727 * activate_task - move a task to the runqueue.
728 */
729void activate_task(struct rq *rq, struct task_struct *p, int flags) 729void activate_task(struct rq *rq, struct task_struct *p, int flags)
730{ 730{
731 if (task_contributes_to_load(p)) 731 if (task_contributes_to_load(p))
@@ -734,9 +734,6 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
734 enqueue_task(rq, p, flags); 734 enqueue_task(rq, p, flags);
735} 735}
736 736
737/*
738 * deactivate_task - remove a task from the runqueue.
739 */
740void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 737void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
741{ 738{
742 if (task_contributes_to_load(p)) 739 if (task_contributes_to_load(p))
@@ -899,7 +896,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
899 delta -= irq_delta; 896 delta -= irq_delta;
900#endif 897#endif
901#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING 898#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
902 if (static_branch((&paravirt_steal_rq_enabled))) { 899 if (static_key_false((&paravirt_steal_rq_enabled))) {
903 u64 st; 900 u64 st;
904 901
905 steal = paravirt_steal_clock(cpu_of(rq)); 902 steal = paravirt_steal_clock(cpu_of(rq));
@@ -1268,29 +1265,59 @@ EXPORT_SYMBOL_GPL(kick_process);
1268 */ 1265 */
1269static int select_fallback_rq(int cpu, struct task_struct *p) 1266static int select_fallback_rq(int cpu, struct task_struct *p)
1270{ 1267{
1271 int dest_cpu;
1272 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); 1268 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
1269 enum { cpuset, possible, fail } state = cpuset;
1270 int dest_cpu;
1273 1271
1274 /* Look for allowed, online CPU in same node. */ 1272 /* Look for allowed, online CPU in same node. */
1275 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) 1273 for_each_cpu_mask(dest_cpu, *nodemask) {
1274 if (!cpu_online(dest_cpu))
1275 continue;
1276 if (!cpu_active(dest_cpu))
1277 continue;
1276 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 1278 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
1277 return dest_cpu; 1279 return dest_cpu;
1280 }
1278 1281
1279 /* Any allowed, online CPU? */ 1282 for (;;) {
1280 dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask); 1283 /* Any allowed, online CPU? */
1281 if (dest_cpu < nr_cpu_ids) 1284 for_each_cpu_mask(dest_cpu, *tsk_cpus_allowed(p)) {
1282 return dest_cpu; 1285 if (!cpu_online(dest_cpu))
1286 continue;
1287 if (!cpu_active(dest_cpu))
1288 continue;
1289 goto out;
1290 }
1283 1291
1284 /* No more Mr. Nice Guy. */ 1292 switch (state) {
1285 dest_cpu = cpuset_cpus_allowed_fallback(p); 1293 case cpuset:
1286 /* 1294 /* No more Mr. Nice Guy. */
1287 * Don't tell them about moving exiting tasks or 1295 cpuset_cpus_allowed_fallback(p);
1288 * kernel threads (both mm NULL), since they never 1296 state = possible;
1289 * leave kernel. 1297 break;
1290 */ 1298
1291 if (p->mm && printk_ratelimit()) { 1299 case possible:
1292 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", 1300 do_set_cpus_allowed(p, cpu_possible_mask);
1293 task_pid_nr(p), p->comm, cpu); 1301 state = fail;
1302 break;
1303
1304 case fail:
1305 BUG();
1306 break;
1307 }
1308 }
1309
1310out:
1311 if (state != cpuset) {
1312 /*
1313 * Don't tell them about moving exiting tasks or
1314 * kernel threads (both mm NULL), since they never
1315 * leave kernel.
1316 */
1317 if (p->mm && printk_ratelimit()) {
1318 printk_sched("process %d (%s) no longer affine to cpu%d\n",
1319 task_pid_nr(p), p->comm, cpu);
1320 }
1294 } 1321 }
1295 1322
1296 return dest_cpu; 1323 return dest_cpu;
@@ -1512,7 +1539,7 @@ static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
1512} 1539}
1513#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 1540#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1514 1541
1515static inline int ttwu_share_cache(int this_cpu, int that_cpu) 1542bool cpus_share_cache(int this_cpu, int that_cpu)
1516{ 1543{
1517 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 1544 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1518} 1545}
@@ -1523,7 +1550,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
1523 struct rq *rq = cpu_rq(cpu); 1550 struct rq *rq = cpu_rq(cpu);
1524 1551
1525#if defined(CONFIG_SMP) 1552#if defined(CONFIG_SMP)
1526 if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) { 1553 if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
1527 sched_clock_cpu(cpu); /* sync clocks x-cpu */ 1554 sched_clock_cpu(cpu); /* sync clocks x-cpu */
1528 ttwu_queue_remote(p, cpu); 1555 ttwu_queue_remote(p, cpu);
1529 return; 1556 return;
@@ -1937,7 +1964,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1937 local_irq_enable(); 1964 local_irq_enable();
1938#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 1965#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1939 finish_lock_switch(rq, prev); 1966 finish_lock_switch(rq, prev);
1940 trace_sched_stat_sleeptime(current, rq->clock);
1941 1967
1942 fire_sched_in_preempt_notifiers(current); 1968 fire_sched_in_preempt_notifiers(current);
1943 if (mm) 1969 if (mm)
@@ -2272,13 +2298,10 @@ calc_load_n(unsigned long load, unsigned long exp,
2272 * Once we've updated the global active value, we need to apply the exponential 2298 * Once we've updated the global active value, we need to apply the exponential
2273 * weights adjusted to the number of cycles missed. 2299 * weights adjusted to the number of cycles missed.
2274 */ 2300 */
2275static void calc_global_nohz(unsigned long ticks) 2301static void calc_global_nohz(void)
2276{ 2302{
2277 long delta, active, n; 2303 long delta, active, n;
2278 2304
2279 if (time_before(jiffies, calc_load_update))
2280 return;
2281
2282 /* 2305 /*
2283 * If we crossed a calc_load_update boundary, make sure to fold 2306 * If we crossed a calc_load_update boundary, make sure to fold
2284 * any pending idle changes, the respective CPUs might have 2307 * any pending idle changes, the respective CPUs might have
@@ -2290,31 +2313,25 @@ static void calc_global_nohz(unsigned long ticks)
2290 atomic_long_add(delta, &calc_load_tasks); 2313 atomic_long_add(delta, &calc_load_tasks);
2291 2314
2292 /* 2315 /*
2293 * If we were idle for multiple load cycles, apply them. 2316 * It could be that the one fold was all it took; we're done!
2294 */ 2317 */
2295 if (ticks >= LOAD_FREQ) { 2318 if (time_before(jiffies, calc_load_update + 10))
2296 n = ticks / LOAD_FREQ; 2319 return;
2297 2320
2298 active = atomic_long_read(&calc_load_tasks); 2321 /*
2299 active = atomic_long_read(&calc_load_tasks); 2322 * Catch up, folding in however many periods we are still behind
2323 */
2324 delta = jiffies - calc_load_update - 10;
2325 n = 1 + (delta / LOAD_FREQ);
2300 2326
2301 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); 2327 active = atomic_long_read(&calc_load_tasks);
2302 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); 2328 active = active > 0 ? active * FIXED_1 : 0;
2303 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2304 2329
2305 calc_load_update += n * LOAD_FREQ; 2330 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
2306 } 2331 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
2332 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2307 2333
2308 /* 2334 calc_load_update += n * LOAD_FREQ;
2309 * It's possible the remainder of the above division also crosses
2310 * a LOAD_FREQ period, the regular check in calc_global_load()
2311 * which comes after this will take care of that.
2312 *
2313 * Consider us being 11 ticks before a cycle completion, and us
2314 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
2315 * age us 4 cycles, and the test in calc_global_load() will
2316 * pick up the final one.
2317 */
2318} 2335}
2319#else 2336#else
2320void calc_load_account_idle(struct rq *this_rq) 2337void calc_load_account_idle(struct rq *this_rq)
@@ -2326,7 +2343,7 @@ static inline long calc_load_fold_idle(void)
2326 return 0; 2343 return 0;
2327} 2344}
2328 2345
2329static void calc_global_nohz(unsigned long ticks) 2346static void calc_global_nohz(void)
2330{ 2347{
2331} 2348}
2332#endif 2349#endif
@@ -2354,8 +2371,6 @@ void calc_global_load(unsigned long ticks)
2354{ 2371{
2355 long active; 2372 long active;
2356 2373
2357 calc_global_nohz(ticks);
2358
2359 if (time_before(jiffies, calc_load_update + 10)) 2374 if (time_before(jiffies, calc_load_update + 10))
2360 return; 2375 return;
2361 2376
@@ -2367,6 +2382,16 @@ void calc_global_load(unsigned long ticks)
2367 avenrun[2] = calc_load(avenrun[2], EXP_15, active); 2382 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
2368 2383
2369 calc_load_update += LOAD_FREQ; 2384 calc_load_update += LOAD_FREQ;
2385
2386 /*
2387 * Account one period with whatever state we found before
2388 * folding in the nohz state and ageing the entire idle period.
2389 *
2390 * This avoids losing a sample when we go idle between
2391 * calc_load_account_active() (10 ticks ago) and now and thus
2392 * under-accounting.
2393 */
2394 calc_global_nohz();
2370} 2395}
2371 2396
2372/* 2397/*
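A small worked example of the catch-up arithmetic added above; HZ and the jiffies values are assumptions for illustration, and LOAD_FREQ is taken as 5*HZ+1 as in the kernel headers of this era:

    #include <stdio.h>

    #define HZ        100
    #define LOAD_FREQ (5 * HZ + 1)          /* one load sample roughly every 5 s */

    int main(void)
    {
            unsigned long jiffies          = 12000; /* pretend current time */
            unsigned long calc_load_update = 10000; /* last scheduled sample */
            long delta = jiffies - calc_load_update - 10;
            long n = 1 + (delta / LOAD_FREQ);

            /* delta = 1990 ticks, so n = 4: the idle stretch is folded into
             * the load averages as four missed LOAD_FREQ windows at once. */
            printf("n = %ld\n", n);
            return 0;
    }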
@@ -2761,7 +2786,7 @@ void account_idle_time(cputime_t cputime)
2761static __always_inline bool steal_account_process_tick(void) 2786static __always_inline bool steal_account_process_tick(void)
2762{ 2787{
2763#ifdef CONFIG_PARAVIRT 2788#ifdef CONFIG_PARAVIRT
2764 if (static_branch(&paravirt_steal_enabled)) { 2789 if (static_key_false(&paravirt_steal_enabled)) {
2765 u64 steal, st = 0; 2790 u64 steal, st = 0;
2766 2791
2767 steal = paravirt_steal_clock(smp_processor_id()); 2792 steal = paravirt_steal_clock(smp_processor_id());
@@ -3226,14 +3251,14 @@ need_resched:
3226 3251
3227 post_schedule(rq); 3252 post_schedule(rq);
3228 3253
3229 preempt_enable_no_resched(); 3254 sched_preempt_enable_no_resched();
3230 if (need_resched()) 3255 if (need_resched())
3231 goto need_resched; 3256 goto need_resched;
3232} 3257}
3233 3258
3234static inline void sched_submit_work(struct task_struct *tsk) 3259static inline void sched_submit_work(struct task_struct *tsk)
3235{ 3260{
3236 if (!tsk->state) 3261 if (!tsk->state || tsk_is_pi_blocked(tsk))
3237 return; 3262 return;
3238 /* 3263 /*
3239 * If we are going to sleep and we have plugged IO queued, 3264 * If we are going to sleep and we have plugged IO queued,
@@ -3252,6 +3277,18 @@ asmlinkage void __sched schedule(void)
3252} 3277}
3253EXPORT_SYMBOL(schedule); 3278EXPORT_SYMBOL(schedule);
3254 3279
3280/**
3281 * schedule_preempt_disabled - called with preemption disabled
3282 *
3283 * Returns with preemption disabled. Note: preempt_count must be 1
3284 */
3285void __sched schedule_preempt_disabled(void)
3286{
3287 sched_preempt_enable_no_resched();
3288 schedule();
3289 preempt_disable();
3290}
3291
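A hedged sketch of the kind of caller this helper targets; the idle-wait shape below is an assumption for illustration, not one of the call sites converted by this series:

    #include <linux/sched.h>

    static void example_idle_wait(void)     /* hypothetical caller */
    {
            /* Entered with preempt_count == 1, as the helper requires. */
            while (!need_resched())
                    cpu_relax();
            schedule_preempt_disabled();    /* schedules, then returns with
                                             * preemption disabled again */
    }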
3255#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 3292#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
3256 3293
3257static inline bool owner_running(struct mutex *lock, struct task_struct *owner) 3294static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
@@ -3412,9 +3449,9 @@ EXPORT_SYMBOL(__wake_up);
3412/* 3449/*
3413 * Same as __wake_up but called with the spinlock in wait_queue_head_t held. 3450 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3414 */ 3451 */
3415void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) 3452void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
3416{ 3453{
3417 __wake_up_common(q, mode, 1, 0, NULL); 3454 __wake_up_common(q, mode, nr, 0, NULL);
3418} 3455}
3419EXPORT_SYMBOL_GPL(__wake_up_locked); 3456EXPORT_SYMBOL_GPL(__wake_up_locked);
3420 3457
@@ -3773,6 +3810,24 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3773 3810
3774 rq = __task_rq_lock(p); 3811 rq = __task_rq_lock(p);
3775 3812
3813 /*
3814 * Idle task boosting is a nono in general. There is one
3815 * exception, when PREEMPT_RT and NOHZ is active:
3816 *
3817 * The idle task calls get_next_timer_interrupt() and holds
3818 * the timer wheel base->lock on the CPU and another CPU wants
3819 * to access the timer (probably to cancel it). We can safely
3820 * ignore the boosting request, as the idle CPU runs this code
3821 * with interrupts disabled and will complete the lock
3822 * protected section without being interrupted. So there is no
3823 * real need to boost.
3824 */
3825 if (unlikely(p == rq->idle)) {
3826 WARN_ON(p != rq->curr);
3827 WARN_ON(p->pi_blocked_on);
3828 goto out_unlock;
3829 }
3830
3776 trace_sched_pi_setprio(p, prio); 3831 trace_sched_pi_setprio(p, prio);
3777 oldprio = p->prio; 3832 oldprio = p->prio;
3778 prev_class = p->sched_class; 3833 prev_class = p->sched_class;
@@ -3796,11 +3851,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3796 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 3851 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
3797 3852
3798 check_class_changed(rq, p, prev_class, oldprio); 3853 check_class_changed(rq, p, prev_class, oldprio);
3854out_unlock:
3799 __task_rq_unlock(rq); 3855 __task_rq_unlock(rq);
3800} 3856}
3801
3802#endif 3857#endif
3803
3804void set_user_nice(struct task_struct *p, long nice) 3858void set_user_nice(struct task_struct *p, long nice)
3805{ 3859{
3806 int old_prio, delta, on_rq; 3860 int old_prio, delta, on_rq;
@@ -4134,7 +4188,7 @@ recheck:
4134 on_rq = p->on_rq; 4188 on_rq = p->on_rq;
4135 running = task_current(rq, p); 4189 running = task_current(rq, p);
4136 if (on_rq) 4190 if (on_rq)
4137 deactivate_task(rq, p, 0); 4191 dequeue_task(rq, p, 0);
4138 if (running) 4192 if (running)
4139 p->sched_class->put_prev_task(rq, p); 4193 p->sched_class->put_prev_task(rq, p);
4140 4194
@@ -4147,7 +4201,7 @@ recheck:
4147 if (running) 4201 if (running)
4148 p->sched_class->set_curr_task(rq); 4202 p->sched_class->set_curr_task(rq);
4149 if (on_rq) 4203 if (on_rq)
4150 activate_task(rq, p, 0); 4204 enqueue_task(rq, p, 0);
4151 4205
4152 check_class_changed(rq, p, prev_class, oldprio); 4206 check_class_changed(rq, p, prev_class, oldprio);
4153 task_rq_unlock(rq, p, &flags); 4207 task_rq_unlock(rq, p, &flags);
@@ -4480,7 +4534,7 @@ SYSCALL_DEFINE0(sched_yield)
4480 __release(rq->lock); 4534 __release(rq->lock);
4481 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 4535 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4482 do_raw_spin_unlock(&rq->lock); 4536 do_raw_spin_unlock(&rq->lock);
4483 preempt_enable_no_resched(); 4537 sched_preempt_enable_no_resched();
4484 4538
4485 schedule(); 4539 schedule();
4486 4540
@@ -4554,8 +4608,24 @@ EXPORT_SYMBOL(__cond_resched_softirq);
4554/** 4608/**
4555 * yield - yield the current processor to other threads. 4609 * yield - yield the current processor to other threads.
4556 * 4610 *
4557 * This is a shortcut for kernel-space yielding - it marks the 4611 * Do not ever use this function, there's a 99% chance you're doing it wrong.
4558 * thread runnable and calls sys_sched_yield(). 4612 *
4613 * The scheduler is at all times free to pick the calling task as the most
4614 * eligible task to run; if removing the yield() call from your code breaks
4615 * it, it's already broken.
4616 *
4617 * Typical broken usage is:
4618 *
4619 * while (!event)
4620 * yield();
4621 *
4622 * where one assumes that yield() will let 'the other' process run that will
4623 * make event true. If the current task is a SCHED_FIFO task that will never
4624 * happen. Never use yield() as a progress guarantee!!
4625 *
4626 * If you want to use yield() to wait for something, use wait_event().
4627 * If you want to use yield() to be 'nice' for others, use cond_resched().
4628 * If you still want to use yield(), do not!
4559 */ 4629 */
4560void __sched yield(void) 4630void __sched yield(void)
4561{ 4631{
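A sketch of the wait_event()-based replacement the comment recommends for the broken while (!event) yield() loop; the queue and flag names are illustrative:

    #include <linux/wait.h>
    #include <linux/types.h>

    static DECLARE_WAIT_QUEUE_HEAD(example_wq);
    static bool example_event;

    static void consumer(void)
    {
            wait_event(example_wq, example_event);  /* sleeps until the flag is set */
    }

    static void producer(void)
    {
            example_event = true;
            wake_up(&example_wq);                   /* wakes the consumer */
    }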
@@ -4998,9 +5068,9 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4998 * placed properly. 5068 * placed properly.
4999 */ 5069 */
5000 if (p->on_rq) { 5070 if (p->on_rq) {
5001 deactivate_task(rq_src, p, 0); 5071 dequeue_task(rq_src, p, 0);
5002 set_task_cpu(p, dest_cpu); 5072 set_task_cpu(p, dest_cpu);
5003 activate_task(rq_dest, p, 0); 5073 enqueue_task(rq_dest, p, 0);
5004 check_preempt_curr(rq_dest, p, 0); 5074 check_preempt_curr(rq_dest, p, 0);
5005 } 5075 }
5006done: 5076done:
@@ -5387,7 +5457,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
5387 unsigned long action, void *hcpu) 5457 unsigned long action, void *hcpu)
5388{ 5458{
5389 switch (action & ~CPU_TASKS_FROZEN) { 5459 switch (action & ~CPU_TASKS_FROZEN) {
5390 case CPU_ONLINE: 5460 case CPU_STARTING:
5391 case CPU_DOWN_FAILED: 5461 case CPU_DOWN_FAILED:
5392 set_cpu_active((long)hcpu, true); 5462 set_cpu_active((long)hcpu, true);
5393 return NOTIFY_OK; 5463 return NOTIFY_OK;
@@ -5759,7 +5829,7 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5759 * 5829 *
5760 * Also keep a unique ID per domain (we use the first cpu number in 5830 * Also keep a unique ID per domain (we use the first cpu number in
5761 * the cpumask of the domain), this allows us to quickly tell if 5831 * the cpumask of the domain), this allows us to quickly tell if
5762 * two cpus are in the same cache domain, see ttwu_share_cache(). 5832 * two cpus are in the same cache domain, see cpus_share_cache().
5763 */ 5833 */
5764DEFINE_PER_CPU(struct sched_domain *, sd_llc); 5834DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5765DEFINE_PER_CPU(int, sd_llc_id); 5835DEFINE_PER_CPU(int, sd_llc_id);
@@ -6936,6 +7006,9 @@ void __init sched_init(void)
6936 rq->online = 0; 7006 rq->online = 0;
6937 rq->idle_stamp = 0; 7007 rq->idle_stamp = 0;
6938 rq->avg_idle = 2*sysctl_sched_migration_cost; 7008 rq->avg_idle = 2*sysctl_sched_migration_cost;
7009
7010 INIT_LIST_HEAD(&rq->cfs_tasks);
7011
6939 rq_attach_root(rq, &def_root_domain); 7012 rq_attach_root(rq, &def_root_domain);
6940#ifdef CONFIG_NO_HZ 7013#ifdef CONFIG_NO_HZ
6941 rq->nohz_flags = 0; 7014 rq->nohz_flags = 0;
@@ -7032,10 +7105,10 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
7032 7105
7033 on_rq = p->on_rq; 7106 on_rq = p->on_rq;
7034 if (on_rq) 7107 if (on_rq)
7035 deactivate_task(rq, p, 0); 7108 dequeue_task(rq, p, 0);
7036 __setscheduler(rq, p, SCHED_NORMAL, 0); 7109 __setscheduler(rq, p, SCHED_NORMAL, 0);
7037 if (on_rq) { 7110 if (on_rq) {
7038 activate_task(rq, p, 0); 7111 enqueue_task(rq, p, 0);
7039 resched_task(rq->curr); 7112 resched_task(rq->curr);
7040 } 7113 }
7041 7114
@@ -7530,8 +7603,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
7530 struct task_group, css); 7603 struct task_group, css);
7531} 7604}
7532 7605
7533static struct cgroup_subsys_state * 7606static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp)
7534cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
7535{ 7607{
7536 struct task_group *tg, *parent; 7608 struct task_group *tg, *parent;
7537 7609
@@ -7548,15 +7620,14 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
7548 return &tg->css; 7620 return &tg->css;
7549} 7621}
7550 7622
7551static void 7623static void cpu_cgroup_destroy(struct cgroup *cgrp)
7552cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
7553{ 7624{
7554 struct task_group *tg = cgroup_tg(cgrp); 7625 struct task_group *tg = cgroup_tg(cgrp);
7555 7626
7556 sched_destroy_group(tg); 7627 sched_destroy_group(tg);
7557} 7628}
7558 7629
7559static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 7630static int cpu_cgroup_can_attach(struct cgroup *cgrp,
7560 struct cgroup_taskset *tset) 7631 struct cgroup_taskset *tset)
7561{ 7632{
7562 struct task_struct *task; 7633 struct task_struct *task;
@@ -7574,7 +7645,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7574 return 0; 7645 return 0;
7575} 7646}
7576 7647
7577static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 7648static void cpu_cgroup_attach(struct cgroup *cgrp,
7578 struct cgroup_taskset *tset) 7649 struct cgroup_taskset *tset)
7579{ 7650{
7580 struct task_struct *task; 7651 struct task_struct *task;
@@ -7584,8 +7655,8 @@ static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7584} 7655}
7585 7656
7586static void 7657static void
7587cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, 7658cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7588 struct cgroup *old_cgrp, struct task_struct *task) 7659 struct task_struct *task)
7589{ 7660{
7590 /* 7661 /*
7591 * cgroup_exit() is called in the copy_process() failure path. 7662 * cgroup_exit() is called in the copy_process() failure path.
@@ -7935,8 +8006,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7935 */ 8006 */
7936 8007
7937/* create a new cpu accounting group */ 8008/* create a new cpu accounting group */
7938static struct cgroup_subsys_state *cpuacct_create( 8009static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp)
7939 struct cgroup_subsys *ss, struct cgroup *cgrp)
7940{ 8010{
7941 struct cpuacct *ca; 8011 struct cpuacct *ca;
7942 8012
@@ -7966,8 +8036,7 @@ out:
7966} 8036}
7967 8037
7968/* destroy an existing cpu accounting group */ 8038/* destroy an existing cpu accounting group */
7969static void 8039static void cpuacct_destroy(struct cgroup *cgrp)
7970cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
7971{ 8040{
7972 struct cpuacct *ca = cgroup_ca(cgrp); 8041 struct cpuacct *ca = cgroup_ca(cgrp);
7973 8042
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index b0d798eaf130..d72586fdf660 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -129,7 +129,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
129 * cpupri_set - update the cpu priority setting 129 * cpupri_set - update the cpu priority setting
130 * @cp: The cpupri context 130 * @cp: The cpupri context
131 * @cpu: The target cpu 131 * @cpu: The target cpu
132 * @pri: The priority (INVALID-RT99) to assign to this CPU 132 * @newpri: The priority (INVALID-RT99) to assign to this CPU
133 * 133 *
134 * Note: Assumes cpu_rq(cpu)->lock is locked 134 * Note: Assumes cpu_rq(cpu)->lock is locked
135 * 135 *
@@ -200,7 +200,6 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
200/** 200/**
201 * cpupri_init - initialize the cpupri structure 201 * cpupri_init - initialize the cpupri structure
202 * @cp: The cpupri context 202 * @cp: The cpupri context
203 * @bootmem: true if allocations need to use bootmem
204 * 203 *
205 * Returns: -ENOMEM if memory allocation fails. 204
206 */ 205 */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2a075e10004b..09acaa15161d 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -288,7 +288,6 @@ static void print_cpu(struct seq_file *m, int cpu)
288 288
289 P(yld_count); 289 P(yld_count);
290 290
291 P(sched_switch);
292 P(sched_count); 291 P(sched_count);
293 P(sched_goidle); 292 P(sched_goidle);
294#ifdef CONFIG_SMP 293#ifdef CONFIG_SMP
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 84adb2d66cbd..0d97ebdc58f0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -416,8 +416,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
416 416
417#endif /* CONFIG_FAIR_GROUP_SCHED */ 417#endif /* CONFIG_FAIR_GROUP_SCHED */
418 418
419static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 419static __always_inline
420 unsigned long delta_exec); 420void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
421 421
422/************************************************************** 422/**************************************************************
423 * Scheduling class tree data structure manipulation methods: 423 * Scheduling class tree data structure manipulation methods:
@@ -776,29 +776,16 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
776 * Scheduling class queueing methods: 776 * Scheduling class queueing methods:
777 */ 777 */
778 778
779#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
780static void
781add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
782{
783 cfs_rq->task_weight += weight;
784}
785#else
786static inline void
787add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
788{
789}
790#endif
791
792static void 779static void
793account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 780account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
794{ 781{
795 update_load_add(&cfs_rq->load, se->load.weight); 782 update_load_add(&cfs_rq->load, se->load.weight);
796 if (!parent_entity(se)) 783 if (!parent_entity(se))
797 update_load_add(&rq_of(cfs_rq)->load, se->load.weight); 784 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
798 if (entity_is_task(se)) { 785#ifdef CONFIG_SMP
799 add_cfs_task_weight(cfs_rq, se->load.weight); 786 if (entity_is_task(se))
800 list_add(&se->group_node, &cfs_rq->tasks); 787 list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
801 } 788#endif
802 cfs_rq->nr_running++; 789 cfs_rq->nr_running++;
803} 790}
804 791
@@ -808,10 +795,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
808 update_load_sub(&cfs_rq->load, se->load.weight); 795 update_load_sub(&cfs_rq->load, se->load.weight);
809 if (!parent_entity(se)) 796 if (!parent_entity(se))
810 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); 797 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
811 if (entity_is_task(se)) { 798 if (entity_is_task(se))
812 add_cfs_task_weight(cfs_rq, -se->load.weight);
813 list_del_init(&se->group_node); 799 list_del_init(&se->group_node);
814 }
815 cfs_rq->nr_running--; 800 cfs_rq->nr_running--;
816} 801}
817 802
@@ -1003,6 +988,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
1003 if (unlikely(delta > se->statistics.sleep_max)) 988 if (unlikely(delta > se->statistics.sleep_max))
1004 se->statistics.sleep_max = delta; 989 se->statistics.sleep_max = delta;
1005 990
991 se->statistics.sleep_start = 0;
1006 se->statistics.sum_sleep_runtime += delta; 992 se->statistics.sum_sleep_runtime += delta;
1007 993
1008 if (tsk) { 994 if (tsk) {
@@ -1019,6 +1005,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
1019 if (unlikely(delta > se->statistics.block_max)) 1005 if (unlikely(delta > se->statistics.block_max))
1020 se->statistics.block_max = delta; 1006 se->statistics.block_max = delta;
1021 1007
1008 se->statistics.block_start = 0;
1022 se->statistics.sum_sleep_runtime += delta; 1009 se->statistics.sum_sleep_runtime += delta;
1023 1010
1024 if (tsk) { 1011 if (tsk) {
@@ -1175,7 +1162,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1175 __clear_buddies_skip(se); 1162 __clear_buddies_skip(se);
1176} 1163}
1177 1164
1178static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); 1165static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1179 1166
1180static void 1167static void
1181dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 1168dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
@@ -1399,20 +1386,20 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1399#ifdef CONFIG_CFS_BANDWIDTH 1386#ifdef CONFIG_CFS_BANDWIDTH
1400 1387
1401#ifdef HAVE_JUMP_LABEL 1388#ifdef HAVE_JUMP_LABEL
1402static struct jump_label_key __cfs_bandwidth_used; 1389static struct static_key __cfs_bandwidth_used;
1403 1390
1404static inline bool cfs_bandwidth_used(void) 1391static inline bool cfs_bandwidth_used(void)
1405{ 1392{
1406 return static_branch(&__cfs_bandwidth_used); 1393 return static_key_false(&__cfs_bandwidth_used);
1407} 1394}
1408 1395
1409void account_cfs_bandwidth_used(int enabled, int was_enabled) 1396void account_cfs_bandwidth_used(int enabled, int was_enabled)
1410{ 1397{
1411 /* only need to count groups transitioning between enabled/!enabled */ 1398 /* only need to count groups transitioning between enabled/!enabled */
1412 if (enabled && !was_enabled) 1399 if (enabled && !was_enabled)
1413 jump_label_inc(&__cfs_bandwidth_used); 1400 static_key_slow_inc(&__cfs_bandwidth_used);
1414 else if (!enabled && was_enabled) 1401 else if (!enabled && was_enabled)
1415 jump_label_dec(&__cfs_bandwidth_used); 1402 static_key_slow_dec(&__cfs_bandwidth_used);
1416} 1403}
1417#else /* HAVE_JUMP_LABEL */ 1404#else /* HAVE_JUMP_LABEL */
1418static bool cfs_bandwidth_used(void) 1405static bool cfs_bandwidth_used(void)
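The hunk above is part of the jump_label_key -> static_key rename: static_branch() becomes static_key_false(), and jump_label_inc()/jump_label_dec() become static_key_slow_inc()/static_key_slow_dec(). A minimal sketch of the pattern, with a hypothetical "example" key, might look like this:

#include <linux/static_key.h>

static struct static_key example_key;	/* defaults to false, like __cfs_bandwidth_used */

static inline bool example_enabled(void)
{
	return static_key_false(&example_key);	/* out-of-line branch while false */
}

static void example_set(bool on)
{
	if (on)
		static_key_slow_inc(&example_key);
	else
		static_key_slow_dec(&example_key);
}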
@@ -1559,8 +1546,8 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1559 resched_task(rq_of(cfs_rq)->curr); 1546 resched_task(rq_of(cfs_rq)->curr);
1560} 1547}
1561 1548
1562static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 1549static __always_inline
1563 unsigned long delta_exec) 1550void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
1564{ 1551{
1565 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) 1552 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
1566 return; 1553 return;
@@ -2086,11 +2073,11 @@ void unthrottle_offline_cfs_rqs(struct rq *rq)
2086} 2073}
2087 2074
2088#else /* CONFIG_CFS_BANDWIDTH */ 2075#else /* CONFIG_CFS_BANDWIDTH */
2089static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 2076static __always_inline
2090 unsigned long delta_exec) {} 2077void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {}
2091static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 2078static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2092static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} 2079static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
2093static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 2080static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2094 2081
2095static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) 2082static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2096{ 2083{
@@ -2670,8 +2657,6 @@ static int select_idle_sibling(struct task_struct *p, int target)
2670 /* 2657 /*
2671 * Otherwise, iterate the domains and find an eligible idle cpu. 2658
2672 */ 2659 */
2673 rcu_read_lock();
2674
2675 sd = rcu_dereference(per_cpu(sd_llc, target)); 2660 sd = rcu_dereference(per_cpu(sd_llc, target));
2676 for_each_lower_domain(sd) { 2661 for_each_lower_domain(sd) {
2677 sg = sd->groups; 2662 sg = sd->groups;
@@ -2693,8 +2678,6 @@ next:
2693 } while (sg != sd->groups); 2678 } while (sg != sd->groups);
2694 } 2679 }
2695done: 2680done:
2696 rcu_read_unlock();
2697
2698 return target; 2681 return target;
2699} 2682}
2700 2683
@@ -2920,7 +2903,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
2920 return; 2903 return;
2921 2904
2922 /* 2905 /*
2923 * This is possible from callers such as pull_task(), in which we 2906 * This is possible from callers such as move_task(), in which we
2924 * unconditionally check_preempt_curr() after an enqueue (which may have 2907
2925 * lead to a throttle). This both saves work and prevents false 2908 * lead to a throttle). This both saves work and prevents false
2926 * next-buddy nomination below. 2909 * next-buddy nomination below.
@@ -3084,17 +3067,39 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
3084 * Fair scheduling class load-balancing methods: 3067 * Fair scheduling class load-balancing methods:
3085 */ 3068 */
3086 3069
3070static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3071
3072#define LBF_ALL_PINNED 0x01
3073#define LBF_NEED_BREAK 0x02
3074
3075struct lb_env {
3076 struct sched_domain *sd;
3077
3078 int src_cpu;
3079 struct rq *src_rq;
3080
3081 int dst_cpu;
3082 struct rq *dst_rq;
3083
3084 enum cpu_idle_type idle;
3085 long load_move;
3086 unsigned int flags;
3087
3088 unsigned int loop;
3089 unsigned int loop_break;
3090 unsigned int loop_max;
3091};
3092
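The new struct lb_env gathers the parameters that were previously threaded through pull_task()/balance_tasks() by hand, so the balancing helpers take a single pointer. The fragment below mirrors the load_balance() hunk later in this file and assumes that function's locals (sd, this_cpu, this_rq, idle, busiest, imbalance); it is illustrative, not additional code.

	struct lb_env env = {
		.sd		= sd,
		.dst_cpu	= this_cpu,
		.dst_rq		= this_rq,
		.idle		= idle,
		.loop_break	= sysctl_sched_nr_migrate,
	};

	env.load_move	= imbalance;
	env.src_cpu	= busiest->cpu;
	env.src_rq	= busiest;
	env.loop_max	= busiest->nr_running;

	ld_moved = move_tasks(&env);	/* returns the number of tasks pulled */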
3087/* 3093/*
3088 * pull_task - move a task from a remote runqueue to the local runqueue. 3094 * move_task - move a task from one runqueue to another runqueue.
3089 * Both runqueues must be locked. 3095 * Both runqueues must be locked.
3090 */ 3096 */
3091static void pull_task(struct rq *src_rq, struct task_struct *p, 3097static void move_task(struct task_struct *p, struct lb_env *env)
3092 struct rq *this_rq, int this_cpu)
3093{ 3098{
3094 deactivate_task(src_rq, p, 0); 3099 deactivate_task(env->src_rq, p, 0);
3095 set_task_cpu(p, this_cpu); 3100 set_task_cpu(p, env->dst_cpu);
3096 activate_task(this_rq, p, 0); 3101 activate_task(env->dst_rq, p, 0);
3097 check_preempt_curr(this_rq, p, 0); 3102 check_preempt_curr(env->dst_rq, p, 0);
3098} 3103}
3099 3104
3100/* 3105/*
@@ -3129,19 +3134,11 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3129 return delta < (s64)sysctl_sched_migration_cost; 3134 return delta < (s64)sysctl_sched_migration_cost;
3130} 3135}
3131 3136
3132#define LBF_ALL_PINNED 0x01
3133#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */
3134#define LBF_HAD_BREAK 0x04
3135#define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */
3136#define LBF_ABORT 0x10
3137
3138/* 3137/*
3139 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 3138 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3140 */ 3139 */
3141static 3140static
3142int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, 3141int can_migrate_task(struct task_struct *p, struct lb_env *env)
3143 struct sched_domain *sd, enum cpu_idle_type idle,
3144 int *lb_flags)
3145{ 3142{
3146 int tsk_cache_hot = 0; 3143 int tsk_cache_hot = 0;
3147 /* 3144 /*
@@ -3150,13 +3147,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3150 * 2) cannot be migrated to this CPU due to cpus_allowed, or 3147 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3151 * 3) are cache-hot on their current CPU. 3148 * 3) are cache-hot on their current CPU.
3152 */ 3149 */
3153 if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) { 3150 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
3154 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 3151 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3155 return 0; 3152 return 0;
3156 } 3153 }
3157 *lb_flags &= ~LBF_ALL_PINNED; 3154 env->flags &= ~LBF_ALL_PINNED;
3158 3155
3159 if (task_running(rq, p)) { 3156 if (task_running(env->src_rq, p)) {
3160 schedstat_inc(p, se.statistics.nr_failed_migrations_running); 3157 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
3161 return 0; 3158 return 0;
3162 } 3159 }
@@ -3167,12 +3164,12 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3167 * 2) too many balance attempts have failed. 3164 * 2) too many balance attempts have failed.
3168 */ 3165 */
3169 3166
3170 tsk_cache_hot = task_hot(p, rq->clock_task, sd); 3167 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
3171 if (!tsk_cache_hot || 3168 if (!tsk_cache_hot ||
3172 sd->nr_balance_failed > sd->cache_nice_tries) { 3169 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
3173#ifdef CONFIG_SCHEDSTATS 3170#ifdef CONFIG_SCHEDSTATS
3174 if (tsk_cache_hot) { 3171 if (tsk_cache_hot) {
3175 schedstat_inc(sd, lb_hot_gained[idle]); 3172 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
3176 schedstat_inc(p, se.statistics.nr_forced_migrations); 3173 schedstat_inc(p, se.statistics.nr_forced_migrations);
3177 } 3174 }
3178#endif 3175#endif
@@ -3193,65 +3190,80 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3193 * 3190 *
3194 * Called with both runqueues locked. 3191 * Called with both runqueues locked.
3195 */ 3192 */
3196static int 3193static int move_one_task(struct lb_env *env)
3197move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3198 struct sched_domain *sd, enum cpu_idle_type idle)
3199{ 3194{
3200 struct task_struct *p, *n; 3195 struct task_struct *p, *n;
3201 struct cfs_rq *cfs_rq;
3202 int pinned = 0;
3203 3196
3204 for_each_leaf_cfs_rq(busiest, cfs_rq) { 3197 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
3205 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { 3198 if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
3206 if (throttled_lb_pair(task_group(p), 3199 continue;
3207 busiest->cpu, this_cpu))
3208 break;
3209 3200
3210 if (!can_migrate_task(p, busiest, this_cpu, 3201 if (!can_migrate_task(p, env))
3211 sd, idle, &pinned)) 3202 continue;
3212 continue;
3213 3203
3214 pull_task(busiest, p, this_rq, this_cpu); 3204 move_task(p, env);
3215 /* 3205 /*
3216 * Right now, this is only the second place pull_task() 3206 * Right now, this is only the second place move_task()
3217 * is called, so we can safely collect pull_task() 3207 * is called, so we can safely collect move_task()
3218 * stats here rather than inside pull_task(). 3208 * stats here rather than inside move_task().
3219 */ 3209 */
3220 schedstat_inc(sd, lb_gained[idle]); 3210 schedstat_inc(env->sd, lb_gained[env->idle]);
3221 return 1; 3211 return 1;
3222 }
3223 } 3212 }
3224
3225 return 0; 3213 return 0;
3226} 3214}
3227 3215
3228static unsigned long 3216static unsigned long task_h_load(struct task_struct *p);
3229balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 3217
3230 unsigned long max_load_move, struct sched_domain *sd, 3218/*
3231 enum cpu_idle_type idle, int *lb_flags, 3219 * move_tasks tries to move up to load_move weighted load from busiest to
3232 struct cfs_rq *busiest_cfs_rq) 3220 * this_rq, as part of a balancing operation within domain "sd".
 3221 * this_rq. Returns the number of tasks moved.
3222 *
3223 * Called with both runqueues locked.
3224 */
3225static int move_tasks(struct lb_env *env)
3233{ 3226{
3234 int loops = 0, pulled = 0; 3227 struct list_head *tasks = &env->src_rq->cfs_tasks;
3235 long rem_load_move = max_load_move; 3228 struct task_struct *p;
3236 struct task_struct *p, *n; 3229 unsigned long load;
3230 int pulled = 0;
3231
3232 if (env->load_move <= 0)
3233 return 0;
3237 3234
3238 if (max_load_move == 0) 3235 while (!list_empty(tasks)) {
3239 goto out; 3236 p = list_first_entry(tasks, struct task_struct, se.group_node);
3240 3237
3241 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { 3238 env->loop++;
3242 if (loops++ > sysctl_sched_nr_migrate) { 3239 /* We've more or less seen every task there is, call it quits */
3243 *lb_flags |= LBF_NEED_BREAK; 3240 if (env->loop > env->loop_max)
3241 break;
3242
3243 /* take a breather every nr_migrate tasks */
3244 if (env->loop > env->loop_break) {
3245 env->loop_break += sysctl_sched_nr_migrate;
3246 env->flags |= LBF_NEED_BREAK;
3244 break; 3247 break;
3245 } 3248 }
3246 3249
3247 if ((p->se.load.weight >> 1) > rem_load_move || 3250 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
3248 !can_migrate_task(p, busiest, this_cpu, sd, idle, 3251 goto next;
3249 lb_flags)) 3252
3250 continue; 3253 load = task_h_load(p);
3254
3255 if (load < 16 && !env->sd->nr_balance_failed)
3256 goto next;
3257
3258 if ((load / 2) > env->load_move)
3259 goto next;
3260
3261 if (!can_migrate_task(p, env))
3262 goto next;
3251 3263
3252 pull_task(busiest, p, this_rq, this_cpu); 3264 move_task(p, env);
3253 pulled++; 3265 pulled++;
3254 rem_load_move -= p->se.load.weight; 3266 env->load_move -= load;
3255 3267
3256#ifdef CONFIG_PREEMPT 3268#ifdef CONFIG_PREEMPT
3257 /* 3269 /*
@@ -3259,28 +3271,30 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3259 * kernels will stop after the first task is pulled to minimize 3271 * kernels will stop after the first task is pulled to minimize
3260 * the critical section. 3272 * the critical section.
3261 */ 3273 */
3262 if (idle == CPU_NEWLY_IDLE) { 3274 if (env->idle == CPU_NEWLY_IDLE)
3263 *lb_flags |= LBF_ABORT;
3264 break; 3275 break;
3265 }
3266#endif 3276#endif
3267 3277
3268 /* 3278 /*
3269 * We only want to steal up to the prescribed amount of 3279 * We only want to steal up to the prescribed amount of
3270 * weighted load. 3280 * weighted load.
3271 */ 3281 */
3272 if (rem_load_move <= 0) 3282 if (env->load_move <= 0)
3273 break; 3283 break;
3284
3285 continue;
3286next:
3287 list_move_tail(&p->se.group_node, tasks);
3274 } 3288 }
3275out: 3289
3276 /* 3290 /*
3277 * Right now, this is one of only two places pull_task() is called, 3291 * Right now, this is one of only two places move_task() is called,
3278 * so we can safely collect pull_task() stats here rather than 3292 * so we can safely collect move_task() stats here rather than
3279 * inside pull_task(). 3293 * inside move_task().
3280 */ 3294 */
3281 schedstat_add(sd, lb_gained[idle], pulled); 3295 schedstat_add(env->sd, lb_gained[env->idle], pulled);
3282 3296
3283 return max_load_move - rem_load_move; 3297 return pulled;
3284} 3298}
3285 3299
3286#ifdef CONFIG_FAIR_GROUP_SCHED 3300#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -3360,113 +3374,35 @@ static int tg_load_down(struct task_group *tg, void *data)
3360 3374
3361static void update_h_load(long cpu) 3375static void update_h_load(long cpu)
3362{ 3376{
3377 rcu_read_lock();
3363 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 3378 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
3379 rcu_read_unlock();
3364} 3380}
3365 3381
3366static unsigned long 3382static unsigned long task_h_load(struct task_struct *p)
3367load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3368 unsigned long max_load_move,
3369 struct sched_domain *sd, enum cpu_idle_type idle,
3370 int *lb_flags)
3371{ 3383{
3372 long rem_load_move = max_load_move; 3384 struct cfs_rq *cfs_rq = task_cfs_rq(p);
3373 struct cfs_rq *busiest_cfs_rq; 3385 unsigned long load;
3374
3375 rcu_read_lock();
3376 update_h_load(cpu_of(busiest));
3377
3378 for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) {
3379 unsigned long busiest_h_load = busiest_cfs_rq->h_load;
3380 unsigned long busiest_weight = busiest_cfs_rq->load.weight;
3381 u64 rem_load, moved_load;
3382
3383 if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
3384 break;
3385
3386 /*
3387 * empty group or part of a throttled hierarchy
3388 */
3389 if (!busiest_cfs_rq->task_weight ||
3390 throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu))
3391 continue;
3392
3393 rem_load = (u64)rem_load_move * busiest_weight;
3394 rem_load = div_u64(rem_load, busiest_h_load + 1);
3395
3396 moved_load = balance_tasks(this_rq, this_cpu, busiest,
3397 rem_load, sd, idle, lb_flags,
3398 busiest_cfs_rq);
3399
3400 if (!moved_load)
3401 continue;
3402
3403 moved_load *= busiest_h_load;
3404 moved_load = div_u64(moved_load, busiest_weight + 1);
3405 3386
3406 rem_load_move -= moved_load; 3387 load = p->se.load.weight;
3407 if (rem_load_move < 0) 3388 load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
3408 break;
3409 }
3410 rcu_read_unlock();
3411 3389
3412 return max_load_move - rem_load_move; 3390 return load;
3413} 3391}
3414#else 3392#else
3415static inline void update_shares(int cpu) 3393static inline void update_shares(int cpu)
3416{ 3394{
3417} 3395}
3418 3396
3419static unsigned long 3397static inline void update_h_load(long cpu)
3420load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3421 unsigned long max_load_move,
3422 struct sched_domain *sd, enum cpu_idle_type idle,
3423 int *lb_flags)
3424{ 3398{
3425 return balance_tasks(this_rq, this_cpu, busiest,
3426 max_load_move, sd, idle, lb_flags,
3427 &busiest->cfs);
3428} 3399}
3429#endif
3430 3400
3431/* 3401static unsigned long task_h_load(struct task_struct *p)
3432 * move_tasks tries to move up to max_load_move weighted load from busiest to
3433 * this_rq, as part of a balancing operation within domain "sd".
3434 * Returns 1 if successful and 0 otherwise.
3435 *
3436 * Called with both runqueues locked.
3437 */
3438static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3439 unsigned long max_load_move,
3440 struct sched_domain *sd, enum cpu_idle_type idle,
3441 int *lb_flags)
3442{ 3402{
3443 unsigned long total_load_moved = 0, load_moved; 3403 return p->se.load.weight;
3444
3445 do {
3446 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
3447 max_load_move - total_load_moved,
3448 sd, idle, lb_flags);
3449
3450 total_load_moved += load_moved;
3451
3452 if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
3453 break;
3454
3455#ifdef CONFIG_PREEMPT
3456 /*
3457 * NEWIDLE balancing is a source of latency, so preemptible
3458 * kernels will stop after the first task is pulled to minimize
3459 * the critical section.
3460 */
3461 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) {
3462 *lb_flags |= LBF_ABORT;
3463 break;
3464 }
3465#endif
3466 } while (load_moved && max_load_move > total_load_moved);
3467
3468 return total_load_moved > 0;
3469} 3404}
3405#endif
3470 3406
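With the per-cfs_rq task list and task_weight gone, move_tasks() above weighs each candidate with task_h_load() instead of walking leaf cfs_rqs. The computation reduces to the following, shown as a worked example with made-up numbers:

	/*
	 * task_h_load(p) = p->se.load.weight * cfs_rq->h_load
	 *                  -----------------------------------
	 *                        cfs_rq->load.weight + 1
	 *
	 * e.g. a weight-1024 task on a group cfs_rq with load.weight == 2048
	 * and h_load == 512 contributes 1024 * 512 / 2049 = 255 (integer
	 * division, as in div_u64()) to env->load_move when it is moved.
	 */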
3471/********** Helpers for find_busiest_group ************************/ 3407/********** Helpers for find_busiest_group ************************/
3472/* 3408/*
@@ -3776,6 +3712,11 @@ void update_group_power(struct sched_domain *sd, int cpu)
3776 struct sched_domain *child = sd->child; 3712 struct sched_domain *child = sd->child;
3777 struct sched_group *group, *sdg = sd->groups; 3713 struct sched_group *group, *sdg = sd->groups;
3778 unsigned long power; 3714 unsigned long power;
3715 unsigned long interval;
3716
3717 interval = msecs_to_jiffies(sd->balance_interval);
3718 interval = clamp(interval, 1UL, max_load_balance_interval);
3719 sdg->sgp->next_update = jiffies + interval;
3779 3720
3780 if (!child) { 3721 if (!child) {
3781 update_cpu_power(sd, cpu); 3722 update_cpu_power(sd, cpu);
@@ -3883,12 +3824,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3883 * domains. In the newly idle case, we will allow all the cpu's 3824 * domains. In the newly idle case, we will allow all the cpu's
3884 * to do the newly idle load balance. 3825 * to do the newly idle load balance.
3885 */ 3826 */
3886 if (idle != CPU_NEWLY_IDLE && local_group) { 3827 if (local_group) {
3887 if (balance_cpu != this_cpu) { 3828 if (idle != CPU_NEWLY_IDLE) {
3888 *balance = 0; 3829 if (balance_cpu != this_cpu) {
3889 return; 3830 *balance = 0;
3890 } 3831 return;
3891 update_group_power(sd, this_cpu); 3832 }
3833 update_group_power(sd, this_cpu);
3834 } else if (time_after_eq(jiffies, group->sgp->next_update))
3835 update_group_power(sd, this_cpu);
3892 } 3836 }
3893 3837
3894 /* Adjust by relative CPU power of the group */ 3838 /* Adjust by relative CPU power of the group */
@@ -4451,13 +4395,21 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4451 struct sched_domain *sd, enum cpu_idle_type idle, 4395 struct sched_domain *sd, enum cpu_idle_type idle,
4452 int *balance) 4396 int *balance)
4453{ 4397{
4454 int ld_moved, lb_flags = 0, active_balance = 0; 4398 int ld_moved, active_balance = 0;
4455 struct sched_group *group; 4399 struct sched_group *group;
4456 unsigned long imbalance; 4400 unsigned long imbalance;
4457 struct rq *busiest; 4401 struct rq *busiest;
4458 unsigned long flags; 4402 unsigned long flags;
4459 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4403 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4460 4404
4405 struct lb_env env = {
4406 .sd = sd,
4407 .dst_cpu = this_cpu,
4408 .dst_rq = this_rq,
4409 .idle = idle,
4410 .loop_break = sysctl_sched_nr_migrate,
4411 };
4412
4461 cpumask_copy(cpus, cpu_active_mask); 4413 cpumask_copy(cpus, cpu_active_mask);
4462 4414
4463 schedstat_inc(sd, lb_count[idle]); 4415 schedstat_inc(sd, lb_count[idle]);
@@ -4492,32 +4444,34 @@ redo:
4492 * still unbalanced. ld_moved simply stays zero, so it is 4444 * still unbalanced. ld_moved simply stays zero, so it is
4493 * correctly treated as an imbalance. 4445 * correctly treated as an imbalance.
4494 */ 4446 */
4495 lb_flags |= LBF_ALL_PINNED; 4447 env.flags |= LBF_ALL_PINNED;
4448 env.load_move = imbalance;
4449 env.src_cpu = busiest->cpu;
4450 env.src_rq = busiest;
4451 env.loop_max = busiest->nr_running;
4452
4453more_balance:
4496 local_irq_save(flags); 4454 local_irq_save(flags);
4497 double_rq_lock(this_rq, busiest); 4455 double_rq_lock(this_rq, busiest);
4498 ld_moved = move_tasks(this_rq, this_cpu, busiest, 4456 if (!env.loop)
4499 imbalance, sd, idle, &lb_flags); 4457 update_h_load(env.src_cpu);
4458 ld_moved += move_tasks(&env);
4500 double_rq_unlock(this_rq, busiest); 4459 double_rq_unlock(this_rq, busiest);
4501 local_irq_restore(flags); 4460 local_irq_restore(flags);
4502 4461
4462 if (env.flags & LBF_NEED_BREAK) {
4463 env.flags &= ~LBF_NEED_BREAK;
4464 goto more_balance;
4465 }
4466
4503 /* 4467 /*
4504 * some other cpu did the load balance for us. 4468 * some other cpu did the load balance for us.
4505 */ 4469 */
4506 if (ld_moved && this_cpu != smp_processor_id()) 4470 if (ld_moved && this_cpu != smp_processor_id())
4507 resched_cpu(this_cpu); 4471 resched_cpu(this_cpu);
4508 4472
4509 if (lb_flags & LBF_ABORT)
4510 goto out_balanced;
4511
4512 if (lb_flags & LBF_NEED_BREAK) {
4513 lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK;
4514 if (lb_flags & LBF_ABORT)
4515 goto out_balanced;
4516 goto redo;
4517 }
4518
4519 /* All tasks on this runqueue were pinned by CPU affinity */ 4473 /* All tasks on this runqueue were pinned by CPU affinity */
4520 if (unlikely(lb_flags & LBF_ALL_PINNED)) { 4474 if (unlikely(env.flags & LBF_ALL_PINNED)) {
4521 cpumask_clear_cpu(cpu_of(busiest), cpus); 4475 cpumask_clear_cpu(cpu_of(busiest), cpus);
4522 if (!cpumask_empty(cpus)) 4476 if (!cpumask_empty(cpus))
4523 goto redo; 4477 goto redo;
@@ -4547,7 +4501,7 @@ redo:
4547 tsk_cpus_allowed(busiest->curr))) { 4501 tsk_cpus_allowed(busiest->curr))) {
4548 raw_spin_unlock_irqrestore(&busiest->lock, 4502 raw_spin_unlock_irqrestore(&busiest->lock,
4549 flags); 4503 flags);
4550 lb_flags |= LBF_ALL_PINNED; 4504 env.flags |= LBF_ALL_PINNED;
4551 goto out_one_pinned; 4505 goto out_one_pinned;
4552 } 4506 }
4553 4507
@@ -4600,7 +4554,7 @@ out_balanced:
4600 4554
4601out_one_pinned: 4555out_one_pinned:
4602 /* tune up the balancing interval */ 4556 /* tune up the balancing interval */
4603 if (((lb_flags & LBF_ALL_PINNED) && 4557 if (((env.flags & LBF_ALL_PINNED) &&
4604 sd->balance_interval < MAX_PINNED_INTERVAL) || 4558 sd->balance_interval < MAX_PINNED_INTERVAL) ||
4605 (sd->balance_interval < sd->max_interval)) 4559 (sd->balance_interval < sd->max_interval))
4606 sd->balance_interval *= 2; 4560 sd->balance_interval *= 2;
@@ -4710,10 +4664,18 @@ static int active_load_balance_cpu_stop(void *data)
4710 } 4664 }
4711 4665
4712 if (likely(sd)) { 4666 if (likely(sd)) {
4667 struct lb_env env = {
4668 .sd = sd,
4669 .dst_cpu = target_cpu,
4670 .dst_rq = target_rq,
4671 .src_cpu = busiest_rq->cpu,
4672 .src_rq = busiest_rq,
4673 .idle = CPU_IDLE,
4674 };
4675
4713 schedstat_inc(sd, alb_count); 4676 schedstat_inc(sd, alb_count);
4714 4677
4715 if (move_one_task(target_rq, target_cpu, busiest_rq, 4678 if (move_one_task(&env))
4716 sd, CPU_IDLE))
4717 schedstat_inc(sd, alb_pushed); 4679 schedstat_inc(sd, alb_pushed);
4718 else 4680 else
4719 schedstat_inc(sd, alb_failed); 4681 schedstat_inc(sd, alb_failed);
@@ -4866,6 +4828,15 @@ static void nohz_balancer_kick(int cpu)
4866 return; 4828 return;
4867} 4829}
4868 4830
4831static inline void clear_nohz_tick_stopped(int cpu)
4832{
4833 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
4834 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
4835 atomic_dec(&nohz.nr_cpus);
4836 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
4837 }
4838}
4839
4869static inline void set_cpu_sd_state_busy(void) 4840static inline void set_cpu_sd_state_busy(void)
4870{ 4841{
4871 struct sched_domain *sd; 4842 struct sched_domain *sd;
@@ -4904,6 +4875,12 @@ void select_nohz_load_balancer(int stop_tick)
4904{ 4875{
4905 int cpu = smp_processor_id(); 4876 int cpu = smp_processor_id();
4906 4877
4878 /*
4879 * If this cpu is going down, then nothing needs to be done.
4880 */
4881 if (!cpu_active(cpu))
4882 return;
4883
4907 if (stop_tick) { 4884 if (stop_tick) {
4908 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) 4885 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
4909 return; 4886 return;
@@ -4914,12 +4891,22 @@ void select_nohz_load_balancer(int stop_tick)
4914 } 4891 }
4915 return; 4892 return;
4916} 4893}
4894
4895static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
4896 unsigned long action, void *hcpu)
4897{
4898 switch (action & ~CPU_TASKS_FROZEN) {
4899 case CPU_DYING:
4900 clear_nohz_tick_stopped(smp_processor_id());
4901 return NOTIFY_OK;
4902 default:
4903 return NOTIFY_DONE;
4904 }
4905}
4917#endif 4906#endif
4918 4907
4919static DEFINE_SPINLOCK(balancing); 4908static DEFINE_SPINLOCK(balancing);
4920 4909
4921static unsigned long __read_mostly max_load_balance_interval = HZ/10;
4922
4923/* 4910/*
4924 * Scale the max load_balance interval with the number of CPUs in the system. 4911 * Scale the max load_balance interval with the number of CPUs in the system.
4925 * This trades load-balance latency on larger machines for less cross talk. 4912 * This trades load-balance latency on larger machines for less cross talk.
@@ -5070,11 +5057,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
5070 * busy tick after returning from idle, we will update the busy stats. 5057 * busy tick after returning from idle, we will update the busy stats.
5071 */ 5058 */
5072 set_cpu_sd_state_busy(); 5059 set_cpu_sd_state_busy();
5073 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { 5060 clear_nohz_tick_stopped(cpu);
5074 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
5075 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
5076 atomic_dec(&nohz.nr_cpus);
5077 }
5078 5061
5079 /* 5062 /*
5080 * None are in tickless mode and hence no need for NOHZ idle load 5063 * None are in tickless mode and hence no need for NOHZ idle load
@@ -5317,7 +5300,6 @@ static void set_curr_task_fair(struct rq *rq)
5317void init_cfs_rq(struct cfs_rq *cfs_rq) 5300void init_cfs_rq(struct cfs_rq *cfs_rq)
5318{ 5301{
5319 cfs_rq->tasks_timeline = RB_ROOT; 5302 cfs_rq->tasks_timeline = RB_ROOT;
5320 INIT_LIST_HEAD(&cfs_rq->tasks);
5321 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 5303 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
5322#ifndef CONFIG_64BIT 5304#ifndef CONFIG_64BIT
5323 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; 5305 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
@@ -5589,7 +5571,9 @@ __init void init_sched_fair_class(void)
5589 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 5571 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
5590 5572
5591#ifdef CONFIG_NO_HZ 5573#ifdef CONFIG_NO_HZ
5574 nohz.next_balance = jiffies;
5592 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 5575 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
5576 cpu_notifier(sched_ilb_notifier, 0);
5593#endif 5577#endif
5594#endif /* SMP */ 5578#endif /* SMP */
5595 5579
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 3640ebbb466b..44af55e6d5d0 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -778,12 +778,9 @@ static inline int balance_runtime(struct rt_rq *rt_rq)
778 778
779static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) 779static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
780{ 780{
781 int i, idle = 1; 781 int i, idle = 1, throttled = 0;
782 const struct cpumask *span; 782 const struct cpumask *span;
783 783
784 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
785 return 1;
786
787 span = sched_rt_period_mask(); 784 span = sched_rt_period_mask();
788 for_each_cpu(i, span) { 785 for_each_cpu(i, span) {
789 int enqueue = 0; 786 int enqueue = 0;
@@ -818,12 +815,17 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
818 if (!rt_rq_throttled(rt_rq)) 815 if (!rt_rq_throttled(rt_rq))
819 enqueue = 1; 816 enqueue = 1;
820 } 817 }
818 if (rt_rq->rt_throttled)
819 throttled = 1;
821 820
822 if (enqueue) 821 if (enqueue)
823 sched_rt_rq_enqueue(rt_rq); 822 sched_rt_rq_enqueue(rt_rq);
824 raw_spin_unlock(&rq->lock); 823 raw_spin_unlock(&rq->lock);
825 } 824 }
826 825
826 if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
827 return 1;
828
827 return idle; 829 return idle;
828} 830}
829 831
@@ -855,8 +857,30 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
855 return 0; 857 return 0;
856 858
857 if (rt_rq->rt_time > runtime) { 859 if (rt_rq->rt_time > runtime) {
858 rt_rq->rt_throttled = 1; 860 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
859 printk_once(KERN_WARNING "sched: RT throttling activated\n"); 861
862 /*
863 * Don't actually throttle groups that have no runtime assigned
864 * but accrue some time due to boosting.
865 */
866 if (likely(rt_b->rt_runtime)) {
867 static bool once = false;
868
869 rt_rq->rt_throttled = 1;
870
871 if (!once) {
872 once = true;
873 printk_sched("sched: RT throttling activated\n");
874 }
875 } else {
876 /*
877 * In case we did anyway, make it go away,
878 * replenishment is a joke, since it will replenish us
879 * with exactly 0 ns.
880 */
881 rt_rq->rt_time = 0;
882 }
883
860 if (rt_rq_throttled(rt_rq)) { 884 if (rt_rq_throttled(rt_rq)) {
861 sched_rt_rq_dequeue(rt_rq); 885 sched_rt_rq_dequeue(rt_rq);
862 return 1; 886 return 1;
@@ -884,7 +908,8 @@ static void update_curr_rt(struct rq *rq)
884 if (unlikely((s64)delta_exec < 0)) 908 if (unlikely((s64)delta_exec < 0))
885 delta_exec = 0; 909 delta_exec = 0;
886 910
887 schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); 911 schedstat_set(curr->se.statistics.exec_max,
912 max(curr->se.statistics.exec_max, delta_exec));
888 913
889 curr->se.sum_exec_runtime += delta_exec; 914 curr->se.sum_exec_runtime += delta_exec;
890 account_group_exec_runtime(curr, delta_exec); 915 account_group_exec_runtime(curr, delta_exec);
@@ -1403,7 +1428,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1403next_idx: 1428next_idx:
1404 if (idx >= MAX_RT_PRIO) 1429 if (idx >= MAX_RT_PRIO)
1405 continue; 1430 continue;
1406 if (next && next->prio < idx) 1431 if (next && next->prio <= idx)
1407 continue; 1432 continue;
1408 list_for_each_entry(rt_se, array->queue + idx, run_list) { 1433 list_for_each_entry(rt_se, array->queue + idx, run_list) {
1409 struct task_struct *p; 1434 struct task_struct *p;
@@ -1587,6 +1612,11 @@ static int push_rt_task(struct rq *rq)
1587 if (!next_task) 1612 if (!next_task)
1588 return 0; 1613 return 0;
1589 1614
1615#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1616 if (unlikely(task_running(rq, next_task)))
1617 return 0;
1618#endif
1619
1590retry: 1620retry:
1591 if (unlikely(next_task == rq->curr)) { 1621 if (unlikely(next_task == rq->curr)) {
1592 WARN_ON(1); 1622 WARN_ON(1);
@@ -1967,7 +1997,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1967 if (--p->rt.time_slice) 1997 if (--p->rt.time_slice)
1968 return; 1998 return;
1969 1999
1970 p->rt.time_slice = DEF_TIMESLICE; 2000 p->rt.time_slice = RR_TIMESLICE;
1971 2001
1972 /* 2002 /*
1973 * Requeue to the end of queue if we are not the only element 2003 * Requeue to the end of queue if we are not the only element
@@ -1995,7 +2025,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1995 * Time slice is 0 for SCHED_FIFO tasks 2025 * Time slice is 0 for SCHED_FIFO tasks
1996 */ 2026 */
1997 if (task->policy == SCHED_RR) 2027 if (task->policy == SCHED_RR)
1998 return DEF_TIMESLICE; 2028 return RR_TIMESLICE;
1999 else 2029 else
2000 return 0; 2030 return 0;
2001} 2031}
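DEF_TIMESLICE is replaced by RR_TIMESLICE throughout rt.c; the new macro is assumed to live in a shared header outside the kernel/ directory covered by this diffstat, presumably keeping the old 100 ms round-robin slice:

	/* assumption: defined in a generic sched header, not shown in this diff */
	#define RR_TIMESLICE	(100 * HZ / 1000)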
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 98c0c2623db8..42b1f304b044 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -36,11 +36,7 @@ extern __read_mostly int scheduler_running;
36 36
37/* 37/*
38 * These are the 'tuning knobs' of the scheduler: 38 * These are the 'tuning knobs' of the scheduler:
39 *
40 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
41 * Timeslices get refilled after they expire.
42 */ 39 */
43#define DEF_TIMESLICE (100 * HZ / 1000)
44 40
45/* 41/*
46 * single value that denotes runtime == period, ie unlimited time. 42 * single value that denotes runtime == period, ie unlimited time.
@@ -216,9 +212,6 @@ struct cfs_rq {
216 struct rb_root tasks_timeline; 212 struct rb_root tasks_timeline;
217 struct rb_node *rb_leftmost; 213 struct rb_node *rb_leftmost;
218 214
219 struct list_head tasks;
220 struct list_head *balance_iterator;
221
222 /* 215 /*
223 * 'curr' points to currently running entity on this cfs_rq. 216 * 'curr' points to currently running entity on this cfs_rq.
224 * It is set to NULL otherwise (i.e when none are currently running). 217 * It is set to NULL otherwise (i.e when none are currently running).
@@ -246,11 +239,6 @@ struct cfs_rq {
246 239
247#ifdef CONFIG_SMP 240#ifdef CONFIG_SMP
248 /* 241 /*
249 * the part of load.weight contributed by tasks
250 */
251 unsigned long task_weight;
252
253 /*
254 * h_load = weight * f(tg) 242 * h_load = weight * f(tg)
255 * 243 *
256 * Where f(tg) is the recursive weight fraction assigned to 244 * Where f(tg) is the recursive weight fraction assigned to
@@ -424,6 +412,8 @@ struct rq {
424 int cpu; 412 int cpu;
425 int online; 413 int online;
426 414
415 struct list_head cfs_tasks;
416
427 u64 rt_avg; 417 u64 rt_avg;
428 u64 age_stamp; 418 u64 age_stamp;
429 u64 idle_stamp; 419 u64 idle_stamp;
@@ -462,7 +452,6 @@ struct rq {
462 unsigned int yld_count; 452 unsigned int yld_count;
463 453
464 /* schedule() stats */ 454 /* schedule() stats */
465 unsigned int sched_switch;
466 unsigned int sched_count; 455 unsigned int sched_count;
467 unsigned int sched_goidle; 456 unsigned int sched_goidle;
468 457
@@ -611,7 +600,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
611 * Tunables that become constants when CONFIG_SCHED_DEBUG is off: 600 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
612 */ 601 */
613#ifdef CONFIG_SCHED_DEBUG 602#ifdef CONFIG_SCHED_DEBUG
614# include <linux/jump_label.h> 603# include <linux/static_key.h>
615# define const_debug __read_mostly 604# define const_debug __read_mostly
616#else 605#else
617# define const_debug const 606# define const_debug const
@@ -630,18 +619,18 @@ enum {
630#undef SCHED_FEAT 619#undef SCHED_FEAT
631 620
632#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) 621#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
633static __always_inline bool static_branch__true(struct jump_label_key *key) 622static __always_inline bool static_branch__true(struct static_key *key)
634{ 623{
635 return likely(static_branch(key)); /* Not out of line branch. */ 624 return static_key_true(key); /* Not out of line branch. */
636} 625}
637 626
638static __always_inline bool static_branch__false(struct jump_label_key *key) 627static __always_inline bool static_branch__false(struct static_key *key)
639{ 628{
640 return unlikely(static_branch(key)); /* Out of line branch. */ 629 return static_key_false(key); /* Out of line branch. */
641} 630}
642 631
643#define SCHED_FEAT(name, enabled) \ 632#define SCHED_FEAT(name, enabled) \
644static __always_inline bool static_branch_##name(struct jump_label_key *key) \ 633static __always_inline bool static_branch_##name(struct static_key *key) \
645{ \ 634{ \
646 return static_branch__##enabled(key); \ 635 return static_branch__##enabled(key); \
647} 636}
@@ -650,7 +639,7 @@ static __always_inline bool static_branch_##name(struct jump_label_key *key) \
650 639
651#undef SCHED_FEAT 640#undef SCHED_FEAT
652 641
653extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR]; 642extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
654#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) 643#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
655#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ 644#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
656#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) 645#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
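With SCHED_DEBUG and HAVE_JUMP_LABEL, the SCHED_FEAT() macro above turns each feature test into a static-key branch. An illustrative (not literal) expansion:

	/*
	 *	if (sched_feat(GENTLE_FAIR_SLEEPERS))
	 *		...
	 *
	 * resolves to static_branch_GENTLE_FAIR_SLEEPERS(), which is
	 * static_key_true() or static_key_false() -- depending on the
	 * feature's compile-time default -- on the matching entry of
	 * sched_feat_keys[], i.e. a patched jump rather than a test of
	 * sysctl_sched_features bits.
	 */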
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 2a581ba8e190..903ffa9e8872 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -32,9 +32,9 @@ static int show_schedstat(struct seq_file *seq, void *v)
32 32
33 /* runqueue-specific stats */ 33 /* runqueue-specific stats */
34 seq_printf(seq, 34 seq_printf(seq,
35 "cpu%d %u %u %u %u %u %u %llu %llu %lu", 35 "cpu%d %u 0 %u %u %u %u %llu %llu %lu",
36 cpu, rq->yld_count, 36 cpu, rq->yld_count,
37 rq->sched_switch, rq->sched_count, rq->sched_goidle, 37 rq->sched_count, rq->sched_goidle,
38 rq->ttwu_count, rq->ttwu_local, 38 rq->ttwu_count, rq->ttwu_local,
39 rq->rq_cpu_time, 39 rq->rq_cpu_time,
40 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); 40 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
diff --git a/kernel/signal.c b/kernel/signal.c
index c73c4284160e..17afcaf582d0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -36,6 +36,7 @@
36#include <asm/uaccess.h> 36#include <asm/uaccess.h>
37#include <asm/unistd.h> 37#include <asm/unistd.h>
38#include <asm/siginfo.h> 38#include <asm/siginfo.h>
39#include <asm/cacheflush.h>
39#include "audit.h" /* audit_signal_info() */ 40#include "audit.h" /* audit_signal_info() */
40 41
41/* 42/*
@@ -58,21 +59,20 @@ static int sig_handler_ignored(void __user *handler, int sig)
58 (handler == SIG_DFL && sig_kernel_ignore(sig)); 59 (handler == SIG_DFL && sig_kernel_ignore(sig));
59} 60}
60 61
61static int sig_task_ignored(struct task_struct *t, int sig, 62static int sig_task_ignored(struct task_struct *t, int sig, bool force)
62 int from_ancestor_ns)
63{ 63{
64 void __user *handler; 64 void __user *handler;
65 65
66 handler = sig_handler(t, sig); 66 handler = sig_handler(t, sig);
67 67
68 if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && 68 if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
69 handler == SIG_DFL && !from_ancestor_ns) 69 handler == SIG_DFL && !force)
70 return 1; 70 return 1;
71 71
72 return sig_handler_ignored(handler, sig); 72 return sig_handler_ignored(handler, sig);
73} 73}
74 74
75static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns) 75static int sig_ignored(struct task_struct *t, int sig, bool force)
76{ 76{
77 /* 77 /*
78 * Blocked signals are never ignored, since the 78 * Blocked signals are never ignored, since the
@@ -82,7 +82,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
82 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) 82 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
83 return 0; 83 return 0;
84 84
85 if (!sig_task_ignored(t, sig, from_ancestor_ns)) 85 if (!sig_task_ignored(t, sig, force))
86 return 0; 86 return 0;
87 87
88 /* 88 /*
@@ -855,7 +855,7 @@ static void ptrace_trap_notify(struct task_struct *t)
855 * Returns true if the signal should be actually delivered, otherwise 855 * Returns true if the signal should be actually delivered, otherwise
856 * it should be dropped. 856 * it should be dropped.
857 */ 857 */
858static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) 858static int prepare_signal(int sig, struct task_struct *p, bool force)
859{ 859{
860 struct signal_struct *signal = p->signal; 860 struct signal_struct *signal = p->signal;
861 struct task_struct *t; 861 struct task_struct *t;
@@ -915,7 +915,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
915 } 915 }
916 } 916 }
917 917
918 return !sig_ignored(p, sig, from_ancestor_ns); 918 return !sig_ignored(p, sig, force);
919} 919}
920 920
921/* 921/*
@@ -1054,13 +1054,14 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1054 struct sigpending *pending; 1054 struct sigpending *pending;
1055 struct sigqueue *q; 1055 struct sigqueue *q;
1056 int override_rlimit; 1056 int override_rlimit;
1057 1057 int ret = 0, result;
1058 trace_signal_generate(sig, info, t);
1059 1058
1060 assert_spin_locked(&t->sighand->siglock); 1059 assert_spin_locked(&t->sighand->siglock);
1061 1060
1062 if (!prepare_signal(sig, t, from_ancestor_ns)) 1061 result = TRACE_SIGNAL_IGNORED;
1063 return 0; 1062 if (!prepare_signal(sig, t,
1063 from_ancestor_ns || (info == SEND_SIG_FORCED)))
1064 goto ret;
1064 1065
1065 pending = group ? &t->signal->shared_pending : &t->pending; 1066 pending = group ? &t->signal->shared_pending : &t->pending;
1066 /* 1067 /*
@@ -1068,8 +1069,11 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1068 * exactly one non-rt signal, so that we can get more 1069 * exactly one non-rt signal, so that we can get more
1069 * detailed information about the cause of the signal. 1070 * detailed information about the cause of the signal.
1070 */ 1071 */
1072 result = TRACE_SIGNAL_ALREADY_PENDING;
1071 if (legacy_queue(pending, sig)) 1073 if (legacy_queue(pending, sig))
1072 return 0; 1074 goto ret;
1075
1076 result = TRACE_SIGNAL_DELIVERED;
1073 /* 1077 /*
1074 * fast-pathed signals for kernel-internal things like SIGSTOP 1078 * fast-pathed signals for kernel-internal things like SIGSTOP
1075 * or SIGKILL. 1079 * or SIGKILL.
@@ -1127,14 +1131,15 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
1127 * signal was rt and sent by user using something 1131 * signal was rt and sent by user using something
1128 * other than kill(). 1132 * other than kill().
1129 */ 1133 */
1130 trace_signal_overflow_fail(sig, group, info); 1134 result = TRACE_SIGNAL_OVERFLOW_FAIL;
1131 return -EAGAIN; 1135 ret = -EAGAIN;
1136 goto ret;
1132 } else { 1137 } else {
1133 /* 1138 /*
1134 * This is a silent loss of information. We still 1139 * This is a silent loss of information. We still
1135 * send the signal, but the *info bits are lost. 1140 * send the signal, but the *info bits are lost.
1136 */ 1141 */
1137 trace_signal_lose_info(sig, group, info); 1142 result = TRACE_SIGNAL_LOSE_INFO;
1138 } 1143 }
1139 } 1144 }
1140 1145
@@ -1142,7 +1147,9 @@ out_set:
1142 signalfd_notify(t, sig); 1147 signalfd_notify(t, sig);
1143 sigaddset(&pending->signal, sig); 1148 sigaddset(&pending->signal, sig);
1144 complete_signal(sig, t, group); 1149 complete_signal(sig, t, group);
1145 return 0; 1150ret:
1151 trace_signal_generate(sig, info, t, group, result);
1152 return ret;
1146} 1153}
1147 1154
1148static int send_signal(int sig, struct siginfo *info, struct task_struct *t, 1155static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
@@ -1585,7 +1592,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1585 int sig = q->info.si_signo; 1592 int sig = q->info.si_signo;
1586 struct sigpending *pending; 1593 struct sigpending *pending;
1587 unsigned long flags; 1594 unsigned long flags;
1588 int ret; 1595 int ret, result;
1589 1596
1590 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); 1597 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1591 1598
@@ -1594,7 +1601,8 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1594 goto ret; 1601 goto ret;
1595 1602
1596 ret = 1; /* the signal is ignored */ 1603 ret = 1; /* the signal is ignored */
1597 if (!prepare_signal(sig, t, 0)) 1604 result = TRACE_SIGNAL_IGNORED;
1605 if (!prepare_signal(sig, t, false))
1598 goto out; 1606 goto out;
1599 1607
1600 ret = 0; 1608 ret = 0;
@@ -1605,6 +1613,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1605 */ 1613 */
1606 BUG_ON(q->info.si_code != SI_TIMER); 1614 BUG_ON(q->info.si_code != SI_TIMER);
1607 q->info.si_overrun++; 1615 q->info.si_overrun++;
1616 result = TRACE_SIGNAL_ALREADY_PENDING;
1608 goto out; 1617 goto out;
1609 } 1618 }
1610 q->info.si_overrun = 0; 1619 q->info.si_overrun = 0;
@@ -1614,7 +1623,9 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1614 list_add_tail(&q->list, &pending->list); 1623 list_add_tail(&q->list, &pending->list);
1615 sigaddset(&pending->signal, sig); 1624 sigaddset(&pending->signal, sig);
1616 complete_signal(sig, t, group); 1625 complete_signal(sig, t, group);
1626 result = TRACE_SIGNAL_DELIVERED;
1617out: 1627out:
1628 trace_signal_generate(sig, &q->info, t, group, result);
1618 unlock_task_sighand(t, &flags); 1629 unlock_task_sighand(t, &flags);
1619ret: 1630ret:
1620 return ret; 1631 return ret;
@@ -1642,6 +1653,15 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1642 BUG_ON(!tsk->ptrace && 1653 BUG_ON(!tsk->ptrace &&
1643 (tsk->group_leader != tsk || !thread_group_empty(tsk))); 1654 (tsk->group_leader != tsk || !thread_group_empty(tsk)));
1644 1655
1656 if (sig != SIGCHLD) {
1657 /*
1658 * This is only possible if parent == real_parent.
1659 * Check if it has changed security domain.
1660 */
1661 if (tsk->parent_exec_id != tsk->parent->self_exec_id)
1662 sig = SIGCHLD;
1663 }
1664
1645 info.si_signo = sig; 1665 info.si_signo = sig;
1646 info.si_errno = 0; 1666 info.si_errno = 0;
1647 /* 1667 /*
diff --git a/kernel/smp.c b/kernel/smp.c
index db197d60489b..2f8b10ecf759 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -701,3 +701,93 @@ int on_each_cpu(void (*func) (void *info), void *info, int wait)
701 return ret; 701 return ret;
702} 702}
703EXPORT_SYMBOL(on_each_cpu); 703EXPORT_SYMBOL(on_each_cpu);
704
705/**
706 * on_each_cpu_mask(): Run a function on processors specified by
707 * cpumask, which may include the local processor.
708 * @mask: The set of cpus to run on (only runs on online subset).
709 * @func: The function to run. This must be fast and non-blocking.
710 * @info: An arbitrary pointer to pass to the function.
711 * @wait: If true, wait (atomically) until function has completed
712 * on other CPUs.
713 *
714 * If @wait is true, then returns once @func has returned.
715 *
716 * You must not call this function with disabled interrupts or
717 * from a hardware interrupt handler or from a bottom half handler.
718 */
719void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
720 void *info, bool wait)
721{
722 int cpu = get_cpu();
723
724 smp_call_function_many(mask, func, info, wait);
725 if (cpumask_test_cpu(cpu, mask)) {
726 local_irq_disable();
727 func(info);
728 local_irq_enable();
729 }
730 put_cpu();
731}
732EXPORT_SYMBOL(on_each_cpu_mask);
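A hypothetical caller of the new helper (the callback and mask handling below are illustrative only, not taken from this patch):

#include <linux/cpumask.h>
#include <linux/smp.h>

static void flush_local_state(void *info)
{
	/* runs on each selected CPU, with interrupts disabled there */
}

static void flush_state_on(const struct cpumask *cpus)
{
	/* must not be called with interrupts disabled or from IRQ context */
	on_each_cpu_mask(cpus, flush_local_state, NULL, true);
}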
733
734/*
735 * on_each_cpu_cond(): Call a function on each processor for which
736 * the supplied function cond_func returns true, optionally waiting
737 * for all the required CPUs to finish. This may include the local
738 * processor.
739 * @cond_func: A callback function that is passed a cpu id and
740 * the info parameter. The function is called
741 * with preemption disabled. The function should
742 * return a boolean value indicating whether to IPI
743 * the specified CPU.
744 * @func: The function to run on all applicable CPUs.
745 * This must be fast and non-blocking.
746 * @info: An arbitrary pointer to pass to both functions.
747 * @wait: If true, wait (atomically) until function has
748 * completed on other CPUs.
749 * @gfp_flags: GFP flags to use when allocating the cpumask
750 * used internally by the function.
751 *
752 * The function might sleep if the GFP flags indicate a non-atomic
753 * allocation is allowed.
754 *
755 * Preemption is disabled to protect against CPUs going offline but not online.
756 * CPUs going online during the call will not be seen or sent an IPI.
757 *
758 * You must not call this function with disabled interrupts or
759 * from a hardware interrupt handler or from a bottom half handler.
760 */
761void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
762 smp_call_func_t func, void *info, bool wait,
763 gfp_t gfp_flags)
764{
765 cpumask_var_t cpus;
766 int cpu, ret;
767
768 might_sleep_if(gfp_flags & __GFP_WAIT);
769
770 if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) {
771 preempt_disable();
772 for_each_online_cpu(cpu)
773 if (cond_func(cpu, info))
774 cpumask_set_cpu(cpu, cpus);
775 on_each_cpu_mask(cpus, func, info, wait);
776 preempt_enable();
777 free_cpumask_var(cpus);
778 } else {
779 /*
780 * No free cpumask, bother. No matter, we'll
781 * just have to IPI them one by one.
782 */
783 preempt_disable();
784 for_each_online_cpu(cpu)
785 if (cond_func(cpu, info)) {
786 ret = smp_call_function_single(cpu, func,
787 info, wait);
788 				WARN_ON_ONCE(ret);
789 }
790 preempt_enable();
791 }
792}
793EXPORT_SYMBOL(on_each_cpu_cond);
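And a hypothetical caller of on_each_cpu_cond(), IPIing only the CPUs whose (invented) per-cpu counter is non-zero:

#include <linux/gfp.h>
#include <linux/percpu.h>
#include <linux/smp.h>

static DEFINE_PER_CPU(int, example_pending);	/* assumed per-cpu state */

static bool cpu_has_pending(int cpu, void *info)
{
	return per_cpu(example_pending, cpu) != 0;
}

static void drain_pending(void *info)
{
	/* runs on each CPU for which cpu_has_pending() returned true */
}

static void drain_all_pending(void)
{
	on_each_cpu_cond(cpu_has_pending, drain_pending, NULL, true, GFP_KERNEL);
}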
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 4eb3a0fa351e..671f9594e368 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -297,7 +297,7 @@ void irq_enter(void)
297 int cpu = smp_processor_id(); 297 int cpu = smp_processor_id();
298 298
299 rcu_irq_enter(); 299 rcu_irq_enter();
300 if (idle_cpu(cpu) && !in_interrupt()) { 300 if (is_idle_task(current) && !in_interrupt()) {
301 /* 301 /*
302 * Prevent raise_softirq from needlessly waking up ksoftirqd 302 * Prevent raise_softirq from needlessly waking up ksoftirqd
303 * here, as softirq will be serviced on return from interrupt. 303 * here, as softirq will be serviced on return from interrupt.
@@ -310,31 +310,21 @@ void irq_enter(void)
310 __irq_enter(); 310 __irq_enter();
311} 311}
312 312
313#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
314static inline void invoke_softirq(void) 313static inline void invoke_softirq(void)
315{ 314{
316 if (!force_irqthreads) 315 if (!force_irqthreads) {
316#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
317 __do_softirq(); 317 __do_softirq();
318 else {
319 __local_bh_disable((unsigned long)__builtin_return_address(0),
320 SOFTIRQ_OFFSET);
321 wakeup_softirqd();
322 __local_bh_enable(SOFTIRQ_OFFSET);
323 }
324}
325#else 318#else
326static inline void invoke_softirq(void)
327{
328 if (!force_irqthreads)
329 do_softirq(); 319 do_softirq();
330 else { 320#endif
321 } else {
331 __local_bh_disable((unsigned long)__builtin_return_address(0), 322 __local_bh_disable((unsigned long)__builtin_return_address(0),
332 SOFTIRQ_OFFSET); 323 SOFTIRQ_OFFSET);
333 wakeup_softirqd(); 324 wakeup_softirqd();
334 __local_bh_enable(SOFTIRQ_OFFSET); 325 __local_bh_enable(SOFTIRQ_OFFSET);
335 } 326 }
336} 327}
337#endif
338 328
339/* 329/*
340 * Exit an interrupt context. Process softirqs if needed and possible: 330 * Exit an interrupt context. Process softirqs if needed and possible:
@@ -353,7 +343,7 @@ void irq_exit(void)
353 tick_nohz_irq_exit(); 343 tick_nohz_irq_exit();
354#endif 344#endif
355 rcu_irq_exit(); 345 rcu_irq_exit();
356 preempt_enable_no_resched(); 346 sched_preempt_enable_no_resched();
357} 347}
358 348
359/* 349/*
@@ -385,6 +375,12 @@ void raise_softirq(unsigned int nr)
385 local_irq_restore(flags); 375 local_irq_restore(flags);
386} 376}
387 377
378void __raise_softirq_irqoff(unsigned int nr)
379{
380 trace_softirq_raise(nr);
381 or_softirq_pending(1UL << nr);
382}
383
388void open_softirq(int nr, void (*action)(struct softirq_action *)) 384void open_softirq(int nr, void (*action)(struct softirq_action *))
389{ 385{
390 softirq_vec[nr].action = action; 386 softirq_vec[nr].action = action;
@@ -744,9 +740,7 @@ static int run_ksoftirqd(void * __bind_cpu)
744 while (!kthread_should_stop()) { 740 while (!kthread_should_stop()) {
745 preempt_disable(); 741 preempt_disable();
746 if (!local_softirq_pending()) { 742 if (!local_softirq_pending()) {
747 preempt_enable_no_resched(); 743 schedule_preempt_disabled();
748 schedule();
749 preempt_disable();
750 } 744 }
751 745
752 __set_current_state(TASK_RUNNING); 746 __set_current_state(TASK_RUNNING);
@@ -761,7 +755,7 @@ static int run_ksoftirqd(void * __bind_cpu)
761 if (local_softirq_pending()) 755 if (local_softirq_pending())
762 __do_softirq(); 756 __do_softirq();
763 local_irq_enable(); 757 local_irq_enable();
764 preempt_enable_no_resched(); 758 sched_preempt_enable_no_resched();
765 cond_resched(); 759 cond_resched();
766 preempt_disable(); 760 preempt_disable();
767 rcu_note_context_switch((long)__bind_cpu); 761 rcu_note_context_switch((long)__bind_cpu);
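
For context, the schedule_preempt_disabled() call that replaces the open-coded sequence in run_ksoftirqd() above folds the same three steps into one scheduler helper; a sketch for reference, not part of this diff:

/* Sketch: sleep with preemption disabled on entry and on return. */
void __sched schedule_preempt_disabled(void)
{
	sched_preempt_enable_no_resched();
	schedule();
	preempt_disable();
}
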
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 84c7d96918bf..5cdd8065a3ce 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -163,7 +163,7 @@ void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
163EXPORT_SYMBOL(_raw_spin_lock_bh); 163EXPORT_SYMBOL(_raw_spin_lock_bh);
164#endif 164#endif
165 165
166#ifndef CONFIG_INLINE_SPIN_UNLOCK 166#ifdef CONFIG_UNINLINE_SPIN_UNLOCK
167void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) 167void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
168{ 168{
169 __raw_spin_unlock(lock); 169 __raw_spin_unlock(lock);
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 0febf61e1aa3..ba35f3a4a1f4 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -172,6 +172,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
172{ 172{
173 int idx; 173 int idx;
174 174
175 rcu_lockdep_assert(!lock_is_held(&sp->dep_map) &&
176 !lock_is_held(&rcu_bh_lock_map) &&
177 !lock_is_held(&rcu_lock_map) &&
178 !lock_is_held(&rcu_sched_lock_map),
179 "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
180
175 idx = sp->completed; 181 idx = sp->completed;
176 mutex_lock(&sp->mutex); 182 mutex_lock(&sp->mutex);
177 183
@@ -280,19 +286,26 @@ void synchronize_srcu(struct srcu_struct *sp)
280EXPORT_SYMBOL_GPL(synchronize_srcu); 286EXPORT_SYMBOL_GPL(synchronize_srcu);
281 287
282/** 288/**
283 * synchronize_srcu_expedited - like synchronize_srcu, but less patient 289 * synchronize_srcu_expedited - Brute-force SRCU grace period
284 * @sp: srcu_struct with which to synchronize. 290 * @sp: srcu_struct with which to synchronize.
285 * 291 *
286 * Flip the completed counter, and wait for the old count to drain to zero. 292 * Wait for an SRCU grace period to elapse, but use a "big hammer"
287 * As with classic RCU, the updater must use some separate means of 293 * approach to force the grace period to end quickly. This consumes
288 * synchronizing concurrent updates. Can block; must be called from 294 * significant time on all CPUs and is unfriendly to real-time workloads,
289 * process context. 295 * so is thus not recommended for any sort of common-case code. In fact,
296 * if you are using synchronize_srcu_expedited() in a loop, please
297 * restructure your code to batch your updates, and then use a single
298 * synchronize_srcu() instead.
290 * 299 *
291 * Note that it is illegal to call synchronize_srcu_expedited() 300 * Note that it is illegal to call this function while holding any lock
292 * from the corresponding SRCU read-side critical section; doing so 301 * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
293 * will result in deadlock. However, it is perfectly legal to call 302 * to call this function from a CPU-hotplug notifier. Failing to observe
294 * synchronize_srcu_expedited() on one srcu_struct from some other 303 * these restrictions will result in deadlock. It is also illegal to call
295 * srcu_struct's read-side critical section. 304 * synchronize_srcu_expedited() from the corresponding SRCU read-side
305 * critical section; doing so will result in deadlock. However, it is
306 * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct
307 * from some other srcu_struct's read-side critical section, as long as
308 * the resulting graph of srcu_structs is acyclic.
296 */ 309 */
297void synchronize_srcu_expedited(struct srcu_struct *sp) 310void synchronize_srcu_expedited(struct srcu_struct *sp)
298{ 311{
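
As the updated comment advises, callers that would otherwise invoke synchronize_srcu_expedited() in a loop should batch their updates behind a single grace period. A hedged sketch of that pattern, with made-up slot array and srcu_struct names:

#define NSLOTS 16

static struct item __rcu *slots[NSLOTS];	/* hypothetical published pointers */
static struct srcu_struct items_srcu;		/* hypothetical, initialized elsewhere */

static void retire_all_slots(void)
{
	struct item *old[NSLOTS];
	int i;

	/* Unpublish every slot first, without waiting yet... */
	for (i = 0; i < NSLOTS; i++) {
		old[i] = rcu_dereference_protected(slots[i], 1);
		rcu_assign_pointer(slots[i], NULL);
	}

	/* ...then pay for a single grace period covering the whole batch... */
	synchronize_srcu(&items_srcu);

	/* ...and free everything once no SRCU reader can still see it. */
	for (i = 0; i < NSLOTS; i++)
		kfree(old[i]);
}
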
diff --git a/kernel/sys.c b/kernel/sys.c
index 40701538fbd1..e7006eb6c1e4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -444,6 +444,15 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
444 magic2 != LINUX_REBOOT_MAGIC2C)) 444 magic2 != LINUX_REBOOT_MAGIC2C))
445 return -EINVAL; 445 return -EINVAL;
446 446
447 /*
448 * If pid namespaces are enabled and the current task is in a child
449 * pid_namespace, the command is handled by reboot_pid_ns() which will
450 * call do_exit().
451 */
452 ret = reboot_pid_ns(task_active_pid_ns(current), cmd);
453 if (ret)
454 return ret;
455
447 /* Instead of trying to make the power_off code look like 456 /* Instead of trying to make the power_off code look like
448 * halt when pm_power_off is not set do it the easy way. 457 * halt when pm_power_off is not set do it the easy way.
449 */ 458 */
@@ -1706,7 +1715,7 @@ static int prctl_set_mm(int opt, unsigned long addr,
1706 if (arg4 | arg5) 1715 if (arg4 | arg5)
1707 return -EINVAL; 1716 return -EINVAL;
1708 1717
1709 if (!capable(CAP_SYS_ADMIN)) 1718 if (!capable(CAP_SYS_RESOURCE))
1710 return -EPERM; 1719 return -EPERM;
1711 1720
1712 if (addr >= TASK_SIZE) 1721 if (addr >= TASK_SIZE)
@@ -1962,6 +1971,14 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1962 case PR_SET_MM: 1971 case PR_SET_MM:
1963 error = prctl_set_mm(arg2, arg3, arg4, arg5); 1972 error = prctl_set_mm(arg2, arg3, arg4, arg5);
1964 break; 1973 break;
1974 case PR_SET_CHILD_SUBREAPER:
1975 me->signal->is_child_subreaper = !!arg2;
1976 error = 0;
1977 break;
1978 case PR_GET_CHILD_SUBREAPER:
1979 error = put_user(me->signal->is_child_subreaper,
1980 (int __user *) arg2);
1981 break;
1965 default: 1982 default:
1966 error = -EINVAL; 1983 error = -EINVAL;
1967 break; 1984 break;
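
From userspace, the new prctl pair can be exercised roughly as follows (illustrative sketch; the constants are 36/37 in linux/prctl.h, with a fallback definition in case the libc headers predate them):

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_CHILD_SUBREAPER
#define PR_SET_CHILD_SUBREAPER	36
#define PR_GET_CHILD_SUBREAPER	37
#endif

int main(void)
{
	int flag = -1;

	/* Mark this process as a child subreaper: orphaned descendants are
	 * reparented to it instead of to init. */
	if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) != 0)
		perror("PR_SET_CHILD_SUBREAPER");

	/* The GET variant writes the current setting through arg2. */
	if (prctl(PR_GET_CHILD_SUBREAPER, (unsigned long)&flag, 0, 0, 0) != 0)
		perror("PR_GET_CHILD_SUBREAPER");

	printf("child_subreaper = %d\n", flag);
	return 0;
}
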
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f487f257e05e..52b3a06a02f8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -23,6 +23,7 @@
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/sysctl.h> 25#include <linux/sysctl.h>
26#include <linux/bitmap.h>
26#include <linux/signal.h> 27#include <linux/signal.h>
27#include <linux/printk.h> 28#include <linux/printk.h>
28#include <linux/proc_fs.h> 29#include <linux/proc_fs.h>
@@ -58,6 +59,7 @@
58#include <linux/oom.h> 59#include <linux/oom.h>
59#include <linux/kmod.h> 60#include <linux/kmod.h>
60#include <linux/capability.h> 61#include <linux/capability.h>
62#include <linux/binfmts.h>
61 63
62#include <asm/uaccess.h> 64#include <asm/uaccess.h>
63#include <asm/processor.h> 65#include <asm/processor.h>
@@ -67,6 +69,9 @@
67#include <asm/stacktrace.h> 69#include <asm/stacktrace.h>
68#include <asm/io.h> 70#include <asm/io.h>
69#endif 71#endif
72#ifdef CONFIG_SPARC
73#include <asm/setup.h>
74#endif
70#ifdef CONFIG_BSD_PROCESS_ACCT 75#ifdef CONFIG_BSD_PROCESS_ACCT
71#include <linux/acct.h> 76#include <linux/acct.h>
72#endif 77#endif
@@ -141,7 +146,6 @@ static const int cap_last_cap = CAP_LAST_CAP;
141#include <linux/inotify.h> 146#include <linux/inotify.h>
142#endif 147#endif
143#ifdef CONFIG_SPARC 148#ifdef CONFIG_SPARC
144#include <asm/system.h>
145#endif 149#endif
146 150
147#ifdef CONFIG_SPARC64 151#ifdef CONFIG_SPARC64
@@ -192,20 +196,6 @@ static int sysrq_sysctl_handler(ctl_table *table, int write,
192 196
193#endif 197#endif
194 198
195static struct ctl_table root_table[];
196static struct ctl_table_root sysctl_table_root;
197static struct ctl_table_header root_table_header = {
198 {{.count = 1,
199 .ctl_table = root_table,
200 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}},
201 .root = &sysctl_table_root,
202 .set = &sysctl_table_root.default_set,
203};
204static struct ctl_table_root sysctl_table_root = {
205 .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list),
206 .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry),
207};
208
209static struct ctl_table kern_table[]; 199static struct ctl_table kern_table[];
210static struct ctl_table vm_table[]; 200static struct ctl_table vm_table[];
211static struct ctl_table fs_table[]; 201static struct ctl_table fs_table[];
@@ -222,7 +212,7 @@ int sysctl_legacy_va_layout;
222 212
223/* The default sysctl tables: */ 213/* The default sysctl tables: */
224 214
225static struct ctl_table root_table[] = { 215static struct ctl_table sysctl_base_table[] = {
226 { 216 {
227 .procname = "kernel", 217 .procname = "kernel",
228 .mode = 0555, 218 .mode = 0555,
@@ -1559,490 +1549,12 @@ static struct ctl_table dev_table[] = {
1559 { } 1549 { }
1560}; 1550};
1561 1551
1562static DEFINE_SPINLOCK(sysctl_lock); 1552int __init sysctl_init(void)
1563
1564/* called under sysctl_lock */
1565static int use_table(struct ctl_table_header *p)
1566{ 1553{
1567 if (unlikely(p->unregistering)) 1554 register_sysctl_table(sysctl_base_table);
1568 return 0;
1569 p->used++;
1570 return 1;
1571}
1572
1573/* called under sysctl_lock */
1574static void unuse_table(struct ctl_table_header *p)
1575{
1576 if (!--p->used)
1577 if (unlikely(p->unregistering))
1578 complete(p->unregistering);
1579}
1580
1581/* called under sysctl_lock, will reacquire if has to wait */
1582static void start_unregistering(struct ctl_table_header *p)
1583{
1584 /*
1585 * if p->used is 0, nobody will ever touch that entry again;
1586 * we'll eliminate all paths to it before dropping sysctl_lock
1587 */
1588 if (unlikely(p->used)) {
1589 struct completion wait;
1590 init_completion(&wait);
1591 p->unregistering = &wait;
1592 spin_unlock(&sysctl_lock);
1593 wait_for_completion(&wait);
1594 spin_lock(&sysctl_lock);
1595 } else {
1596 /* anything non-NULL; we'll never dereference it */
1597 p->unregistering = ERR_PTR(-EINVAL);
1598 }
1599 /*
1600 * do not remove from the list until nobody holds it; walking the
1601 * list in do_sysctl() relies on that.
1602 */
1603 list_del_init(&p->ctl_entry);
1604}
1605
1606void sysctl_head_get(struct ctl_table_header *head)
1607{
1608 spin_lock(&sysctl_lock);
1609 head->count++;
1610 spin_unlock(&sysctl_lock);
1611}
1612
1613void sysctl_head_put(struct ctl_table_header *head)
1614{
1615 spin_lock(&sysctl_lock);
1616 if (!--head->count)
1617 kfree_rcu(head, rcu);
1618 spin_unlock(&sysctl_lock);
1619}
1620
1621struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
1622{
1623 if (!head)
1624 BUG();
1625 spin_lock(&sysctl_lock);
1626 if (!use_table(head))
1627 head = ERR_PTR(-ENOENT);
1628 spin_unlock(&sysctl_lock);
1629 return head;
1630}
1631
1632void sysctl_head_finish(struct ctl_table_header *head)
1633{
1634 if (!head)
1635 return;
1636 spin_lock(&sysctl_lock);
1637 unuse_table(head);
1638 spin_unlock(&sysctl_lock);
1639}
1640
1641static struct ctl_table_set *
1642lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
1643{
1644 struct ctl_table_set *set = &root->default_set;
1645 if (root->lookup)
1646 set = root->lookup(root, namespaces);
1647 return set;
1648}
1649
1650static struct list_head *
1651lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces)
1652{
1653 struct ctl_table_set *set = lookup_header_set(root, namespaces);
1654 return &set->list;
1655}
1656
1657struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces,
1658 struct ctl_table_header *prev)
1659{
1660 struct ctl_table_root *root;
1661 struct list_head *header_list;
1662 struct ctl_table_header *head;
1663 struct list_head *tmp;
1664
1665 spin_lock(&sysctl_lock);
1666 if (prev) {
1667 head = prev;
1668 tmp = &prev->ctl_entry;
1669 unuse_table(prev);
1670 goto next;
1671 }
1672 tmp = &root_table_header.ctl_entry;
1673 for (;;) {
1674 head = list_entry(tmp, struct ctl_table_header, ctl_entry);
1675
1676 if (!use_table(head))
1677 goto next;
1678 spin_unlock(&sysctl_lock);
1679 return head;
1680 next:
1681 root = head->root;
1682 tmp = tmp->next;
1683 header_list = lookup_header_list(root, namespaces);
1684 if (tmp != header_list)
1685 continue;
1686
1687 do {
1688 root = list_entry(root->root_list.next,
1689 struct ctl_table_root, root_list);
1690 if (root == &sysctl_table_root)
1691 goto out;
1692 header_list = lookup_header_list(root, namespaces);
1693 } while (list_empty(header_list));
1694 tmp = header_list->next;
1695 }
1696out:
1697 spin_unlock(&sysctl_lock);
1698 return NULL;
1699}
1700
1701struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev)
1702{
1703 return __sysctl_head_next(current->nsproxy, prev);
1704}
1705
1706void register_sysctl_root(struct ctl_table_root *root)
1707{
1708 spin_lock(&sysctl_lock);
1709 list_add_tail(&root->root_list, &sysctl_table_root.root_list);
1710 spin_unlock(&sysctl_lock);
1711}
1712
1713/*
1714 * sysctl_perm does NOT grant the superuser all rights automatically, because
1715 * some sysctl variables are readonly even to root.
1716 */
1717
1718static int test_perm(int mode, int op)
1719{
1720 if (!current_euid())
1721 mode >>= 6;
1722 else if (in_egroup_p(0))
1723 mode >>= 3;
1724 if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0)
1725 return 0;
1726 return -EACCES;
1727}
1728
1729int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
1730{
1731 int mode;
1732
1733 if (root->permissions)
1734 mode = root->permissions(root, current->nsproxy, table);
1735 else
1736 mode = table->mode;
1737
1738 return test_perm(mode, op);
1739}
1740
1741static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
1742{
1743 for (; table->procname; table++) {
1744 table->parent = parent;
1745 if (table->child)
1746 sysctl_set_parent(table, table->child);
1747 }
1748}
1749
1750static __init int sysctl_init(void)
1751{
1752 sysctl_set_parent(NULL, root_table);
1753#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1754 sysctl_check_table(current->nsproxy, root_table);
1755#endif
1756 return 0; 1555 return 0;
1757} 1556}
1758 1557
1759core_initcall(sysctl_init);
1760
1761static struct ctl_table *is_branch_in(struct ctl_table *branch,
1762 struct ctl_table *table)
1763{
1764 struct ctl_table *p;
1765 const char *s = branch->procname;
1766
1767 /* branch should have named subdirectory as its first element */
1768 if (!s || !branch->child)
1769 return NULL;
1770
1771 /* ... and nothing else */
1772 if (branch[1].procname)
1773 return NULL;
1774
1775 /* table should contain subdirectory with the same name */
1776 for (p = table; p->procname; p++) {
1777 if (!p->child)
1778 continue;
1779 if (p->procname && strcmp(p->procname, s) == 0)
1780 return p;
1781 }
1782 return NULL;
1783}
1784
1785/* see if attaching q to p would be an improvement */
1786static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
1787{
1788 struct ctl_table *to = p->ctl_table, *by = q->ctl_table;
1789 struct ctl_table *next;
1790 int is_better = 0;
1791 int not_in_parent = !p->attached_by;
1792
1793 while ((next = is_branch_in(by, to)) != NULL) {
1794 if (by == q->attached_by)
1795 is_better = 1;
1796 if (to == p->attached_by)
1797 not_in_parent = 1;
1798 by = by->child;
1799 to = next->child;
1800 }
1801
1802 if (is_better && not_in_parent) {
1803 q->attached_by = by;
1804 q->attached_to = to;
1805 q->parent = p;
1806 }
1807}
1808
1809/**
1810 * __register_sysctl_paths - register a sysctl hierarchy
1811 * @root: List of sysctl headers to register on
1812 * @namespaces: Data to compute which lists of sysctl entries are visible
1813 * @path: The path to the directory the sysctl table is in.
1814 * @table: the top-level table structure
1815 *
1816 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1817 * array. A completely 0 filled entry terminates the table.
1818 *
1819 * The members of the &struct ctl_table structure are used as follows:
1820 *
1821 * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
1822 * enter a sysctl file
1823 *
1824 * data - a pointer to data for use by proc_handler
1825 *
1826 * maxlen - the maximum size in bytes of the data
1827 *
1828 * mode - the file permissions for the /proc/sys file, and for sysctl(2)
1829 *
1830 * child - a pointer to the child sysctl table if this entry is a directory, or
1831 * %NULL.
1832 *
1833 * proc_handler - the text handler routine (described below)
1834 *
1835 * de - for internal use by the sysctl routines
1836 *
1837 * extra1, extra2 - extra pointers usable by the proc handler routines
1838 *
1839 * Leaf nodes in the sysctl tree will be represented by a single file
1840 * under /proc; non-leaf nodes will be represented by directories.
1841 *
1842 * sysctl(2) can automatically manage read and write requests through
1843 * the sysctl table. The data and maxlen fields of the ctl_table
1844 * struct enable minimal validation of the values being written to be
1845 * performed, and the mode field allows minimal authentication.
1846 *
1847 * There must be a proc_handler routine for any terminal nodes
1848 * mirrored under /proc/sys (non-terminals are handled by a built-in
1849 * directory handler). Several default handlers are available to
1850 * cover common cases -
1851 *
1852 * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(),
1853 * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(),
1854 * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax()
1855 *
1856 * It is the handler's job to read the input buffer from user memory
1857 * and process it. The handler should return 0 on success.
1858 *
1859 * This routine returns %NULL on a failure to register, and a pointer
1860 * to the table header on success.
1861 */
1862struct ctl_table_header *__register_sysctl_paths(
1863 struct ctl_table_root *root,
1864 struct nsproxy *namespaces,
1865 const struct ctl_path *path, struct ctl_table *table)
1866{
1867 struct ctl_table_header *header;
1868 struct ctl_table *new, **prevp;
1869 unsigned int n, npath;
1870 struct ctl_table_set *set;
1871
1872 /* Count the path components */
1873 for (npath = 0; path[npath].procname; ++npath)
1874 ;
1875
1876 /*
1877 * For each path component, allocate a 2-element ctl_table array.
1878 * The first array element will be filled with the sysctl entry
1879 * for this, the second will be the sentinel (procname == 0).
1880 *
1881 * We allocate everything in one go so that we don't have to
1882 * worry about freeing additional memory in unregister_sysctl_table.
1883 */
1884 header = kzalloc(sizeof(struct ctl_table_header) +
1885 (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL);
1886 if (!header)
1887 return NULL;
1888
1889 new = (struct ctl_table *) (header + 1);
1890
1891 /* Now connect the dots */
1892 prevp = &header->ctl_table;
1893 for (n = 0; n < npath; ++n, ++path) {
1894 /* Copy the procname */
1895 new->procname = path->procname;
1896 new->mode = 0555;
1897
1898 *prevp = new;
1899 prevp = &new->child;
1900
1901 new += 2;
1902 }
1903 *prevp = table;
1904 header->ctl_table_arg = table;
1905
1906 INIT_LIST_HEAD(&header->ctl_entry);
1907 header->used = 0;
1908 header->unregistering = NULL;
1909 header->root = root;
1910 sysctl_set_parent(NULL, header->ctl_table);
1911 header->count = 1;
1912#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1913 if (sysctl_check_table(namespaces, header->ctl_table)) {
1914 kfree(header);
1915 return NULL;
1916 }
1917#endif
1918 spin_lock(&sysctl_lock);
1919 header->set = lookup_header_set(root, namespaces);
1920 header->attached_by = header->ctl_table;
1921 header->attached_to = root_table;
1922 header->parent = &root_table_header;
1923 for (set = header->set; set; set = set->parent) {
1924 struct ctl_table_header *p;
1925 list_for_each_entry(p, &set->list, ctl_entry) {
1926 if (p->unregistering)
1927 continue;
1928 try_attach(p, header);
1929 }
1930 }
1931 header->parent->count++;
1932 list_add_tail(&header->ctl_entry, &header->set->list);
1933 spin_unlock(&sysctl_lock);
1934
1935 return header;
1936}
1937
1938/**
1939 * register_sysctl_table_path - register a sysctl table hierarchy
1940 * @path: The path to the directory the sysctl table is in.
1941 * @table: the top-level table structure
1942 *
1943 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1944 * array. A completely 0 filled entry terminates the table.
1945 *
1946 * See __register_sysctl_paths for more details.
1947 */
1948struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
1949 struct ctl_table *table)
1950{
1951 return __register_sysctl_paths(&sysctl_table_root, current->nsproxy,
1952 path, table);
1953}
1954
1955/**
1956 * register_sysctl_table - register a sysctl table hierarchy
1957 * @table: the top-level table structure
1958 *
1959 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1960 * array. A completely 0 filled entry terminates the table.
1961 *
1962 * See register_sysctl_paths for more details.
1963 */
1964struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
1965{
1966 static const struct ctl_path null_path[] = { {} };
1967
1968 return register_sysctl_paths(null_path, table);
1969}
1970
1971/**
1972 * unregister_sysctl_table - unregister a sysctl table hierarchy
1973 * @header: the header returned from register_sysctl_table
1974 *
1975 * Unregisters the sysctl table and all children. proc entries may not
1976 * actually be removed until they are no longer used by anyone.
1977 */
1978void unregister_sysctl_table(struct ctl_table_header * header)
1979{
1980 might_sleep();
1981
1982 if (header == NULL)
1983 return;
1984
1985 spin_lock(&sysctl_lock);
1986 start_unregistering(header);
1987 if (!--header->parent->count) {
1988 WARN_ON(1);
1989 kfree_rcu(header->parent, rcu);
1990 }
1991 if (!--header->count)
1992 kfree_rcu(header, rcu);
1993 spin_unlock(&sysctl_lock);
1994}
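
For orientation, a typical (hypothetical) user of these registration entry points looks roughly like this; table, path and variable names are made up for illustration, and the resulting file would appear as /proc/sys/kernel/my_subsys/my_tunable:

static int my_tunable = 42;			/* hypothetical knob */

static struct ctl_table my_table[] = {
	{
		.procname	= "my_tunable",
		.data		= &my_tunable,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }	/* zero-filled sentinel terminates the table */
};

static const struct ctl_path my_path[] = {
	{ .procname = "kernel" },
	{ .procname = "my_subsys" },
	{ }
};

static struct ctl_table_header *my_header;

static int __init my_sysctl_init(void)
{
	my_header = register_sysctl_paths(my_path, my_table);
	return my_header ? 0 : -ENOMEM;
}

static void __exit my_sysctl_exit(void)
{
	unregister_sysctl_table(my_header);
}
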
1995
1996int sysctl_is_seen(struct ctl_table_header *p)
1997{
1998 struct ctl_table_set *set = p->set;
1999 int res;
2000 spin_lock(&sysctl_lock);
2001 if (p->unregistering)
2002 res = 0;
2003 else if (!set->is_seen)
2004 res = 1;
2005 else
2006 res = set->is_seen(set);
2007 spin_unlock(&sysctl_lock);
2008 return res;
2009}
2010
2011void setup_sysctl_set(struct ctl_table_set *p,
2012 struct ctl_table_set *parent,
2013 int (*is_seen)(struct ctl_table_set *))
2014{
2015 INIT_LIST_HEAD(&p->list);
2016 p->parent = parent ? parent : &sysctl_table_root.default_set;
2017 p->is_seen = is_seen;
2018}
2019
2020#else /* !CONFIG_SYSCTL */
2021struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
2022{
2023 return NULL;
2024}
2025
2026struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
2027 struct ctl_table *table)
2028{
2029 return NULL;
2030}
2031
2032void unregister_sysctl_table(struct ctl_table_header * table)
2033{
2034}
2035
2036void setup_sysctl_set(struct ctl_table_set *p,
2037 struct ctl_table_set *parent,
2038 int (*is_seen)(struct ctl_table_set *))
2039{
2040}
2041
2042void sysctl_head_put(struct ctl_table_header *head)
2043{
2044}
2045
2046#endif /* CONFIG_SYSCTL */ 1558#endif /* CONFIG_SYSCTL */
2047 1559
2048/* 1560/*
@@ -2884,9 +2396,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
2884 } 2396 }
2885 } 2397 }
2886 2398
2887 while (val_a <= val_b) 2399 bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1);
2888 set_bit(val_a++, tmp_bitmap);
2889
2890 first = 0; 2400 first = 0;
2891 proc_skip_char(&kbuf, &left, '\n'); 2401 proc_skip_char(&kbuf, &left, '\n');
2892 } 2402 }
@@ -2929,8 +2439,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
2929 if (*ppos) 2439 if (*ppos)
2930 bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len); 2440 bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len);
2931 else 2441 else
2932 memcpy(bitmap, tmp_bitmap, 2442 bitmap_copy(bitmap, tmp_bitmap, bitmap_len);
2933 BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long));
2934 } 2443 }
2935 kfree(tmp_bitmap); 2444 kfree(tmp_bitmap);
2936 *lenp -= left; 2445 *lenp -= left;
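
The bitmap helpers used in the two hunks above take a start bit plus a bit count, so the inclusive range [val_a, val_b] becomes val_b - val_a + 1 bits; a small sketch of the equivalences (for example, val_a = 3 and val_b = 6 sets bits 3, 4, 5 and 6):

/* One call instead of a set_bit() loop over the inclusive range. */
bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1);

/* Copies bitmap_len bits, replacing the explicit
 * BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long) memcpy(). */
bitmap_copy(bitmap, tmp_bitmap, bitmap_len);
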
@@ -3008,6 +2517,3 @@ EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
3008EXPORT_SYMBOL(proc_dostring); 2517EXPORT_SYMBOL(proc_dostring);
3009EXPORT_SYMBOL(proc_doulongvec_minmax); 2518EXPORT_SYMBOL(proc_doulongvec_minmax);
3010EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); 2519EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
3011EXPORT_SYMBOL(register_sysctl_table);
3012EXPORT_SYMBOL(register_sysctl_paths);
3013EXPORT_SYMBOL(unregister_sysctl_table);
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
deleted file mode 100644
index 362da653813d..000000000000
--- a/kernel/sysctl_check.c
+++ /dev/null
@@ -1,160 +0,0 @@
1#include <linux/stat.h>
2#include <linux/sysctl.h>
3#include "../fs/xfs/xfs_sysctl.h"
4#include <linux/sunrpc/debug.h>
5#include <linux/string.h>
6#include <net/ip_vs.h>
7
8
9static int sysctl_depth(struct ctl_table *table)
10{
11 struct ctl_table *tmp;
12 int depth;
13
14 depth = 0;
15 for (tmp = table; tmp->parent; tmp = tmp->parent)
16 depth++;
17
18 return depth;
19}
20
21static struct ctl_table *sysctl_parent(struct ctl_table *table, int n)
22{
23 int i;
24
25 for (i = 0; table && i < n; i++)
26 table = table->parent;
27
28 return table;
29}
30
31
32static void sysctl_print_path(struct ctl_table *table)
33{
34 struct ctl_table *tmp;
35 int depth, i;
36 depth = sysctl_depth(table);
37 if (table->procname) {
38 for (i = depth; i >= 0; i--) {
39 tmp = sysctl_parent(table, i);
40 printk("/%s", tmp->procname?tmp->procname:"");
41 }
42 }
43 printk(" ");
44}
45
46static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
47 struct ctl_table *table)
48{
49 struct ctl_table_header *head;
50 struct ctl_table *ref, *test;
51 int depth, cur_depth;
52
53 depth = sysctl_depth(table);
54
55 for (head = __sysctl_head_next(namespaces, NULL); head;
56 head = __sysctl_head_next(namespaces, head)) {
57 cur_depth = depth;
58 ref = head->ctl_table;
59repeat:
60 test = sysctl_parent(table, cur_depth);
61 for (; ref->procname; ref++) {
62 int match = 0;
63 if (cur_depth && !ref->child)
64 continue;
65
66 if (test->procname && ref->procname &&
67 (strcmp(test->procname, ref->procname) == 0))
68 match++;
69
70 if (match) {
71 if (cur_depth != 0) {
72 cur_depth--;
73 ref = ref->child;
74 goto repeat;
75 }
76 goto out;
77 }
78 }
79 }
80 ref = NULL;
81out:
82 sysctl_head_finish(head);
83 return ref;
84}
85
86static void set_fail(const char **fail, struct ctl_table *table, const char *str)
87{
88 if (*fail) {
89 printk(KERN_ERR "sysctl table check failed: ");
90 sysctl_print_path(table);
91 printk(" %s\n", *fail);
92 dump_stack();
93 }
94 *fail = str;
95}
96
97static void sysctl_check_leaf(struct nsproxy *namespaces,
98 struct ctl_table *table, const char **fail)
99{
100 struct ctl_table *ref;
101
102 ref = sysctl_check_lookup(namespaces, table);
103 if (ref && (ref != table))
104 set_fail(fail, table, "Sysctl already exists");
105}
106
107int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
108{
109 int error = 0;
110 for (; table->procname; table++) {
111 const char *fail = NULL;
112
113 if (table->parent) {
114 if (!table->parent->procname)
115 set_fail(&fail, table, "Parent without procname");
116 }
117 if (table->child) {
118 if (table->data)
119 set_fail(&fail, table, "Directory with data?");
120 if (table->maxlen)
121 set_fail(&fail, table, "Directory with maxlen?");
122 if ((table->mode & (S_IRUGO|S_IXUGO)) != table->mode)
123 set_fail(&fail, table, "Writable sysctl directory");
124 if (table->proc_handler)
125 set_fail(&fail, table, "Directory with proc_handler");
126 if (table->extra1)
127 set_fail(&fail, table, "Directory with extra1");
128 if (table->extra2)
129 set_fail(&fail, table, "Directory with extra2");
130 } else {
131 if ((table->proc_handler == proc_dostring) ||
132 (table->proc_handler == proc_dointvec) ||
133 (table->proc_handler == proc_dointvec_minmax) ||
134 (table->proc_handler == proc_dointvec_jiffies) ||
135 (table->proc_handler == proc_dointvec_userhz_jiffies) ||
136 (table->proc_handler == proc_dointvec_ms_jiffies) ||
137 (table->proc_handler == proc_doulongvec_minmax) ||
138 (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
139 if (!table->data)
140 set_fail(&fail, table, "No data");
141 if (!table->maxlen)
142 set_fail(&fail, table, "No maxlen");
143 }
144#ifdef CONFIG_PROC_SYSCTL
145 if (!table->proc_handler)
146 set_fail(&fail, table, "No proc_handler");
147#endif
148 sysctl_check_leaf(namespaces, table, &fail);
149 }
150 if (table->mode > 0777)
151 set_fail(&fail, table, "bogus .mode");
152 if (fail) {
153 set_fail(&fail, table, NULL);
154 error = -EINVAL;
155 }
156 if (table->child)
157 error |= sysctl_check_table(namespaces, table->child);
158 }
159 return error;
160}
diff --git a/kernel/time.c b/kernel/time.c
index 73e416db0a1e..ba744cf80696 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -163,7 +163,6 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
163 return error; 163 return error;
164 164
165 if (tz) { 165 if (tz) {
166 /* SMP safe, global irq locking makes it work. */
167 sys_tz = *tz; 166 sys_tz = *tz;
168 update_vsyscall_tz(); 167 update_vsyscall_tz();
169 if (firsttime) { 168 if (firsttime) {
@@ -173,12 +172,7 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
173 } 172 }
174 } 173 }
175 if (tv) 174 if (tv)
176 {
177 /* SMP safe, again the code in arch/foo/time.c should
178 * globally block out interrupts when it runs.
179 */
180 return do_settimeofday(tv); 175 return do_settimeofday(tv);
181 }
182 return 0; 176 return 0;
183} 177}
184 178
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 8a46f5d64504..8a538c55fc7b 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -96,6 +96,11 @@ static int alarmtimer_rtc_add_device(struct device *dev,
96 return 0; 96 return 0;
97} 97}
98 98
99static inline void alarmtimer_rtc_timer_init(void)
100{
101 rtc_timer_init(&rtctimer, NULL, NULL);
102}
103
99static struct class_interface alarmtimer_rtc_interface = { 104static struct class_interface alarmtimer_rtc_interface = {
100 .add_dev = &alarmtimer_rtc_add_device, 105 .add_dev = &alarmtimer_rtc_add_device,
101}; 106};
@@ -117,6 +122,7 @@ static inline struct rtc_device *alarmtimer_get_rtcdev(void)
117#define rtcdev (NULL) 122#define rtcdev (NULL)
118static inline int alarmtimer_rtc_interface_setup(void) { return 0; } 123static inline int alarmtimer_rtc_interface_setup(void) { return 0; }
119static inline void alarmtimer_rtc_interface_remove(void) { } 124static inline void alarmtimer_rtc_interface_remove(void) { }
125static inline void alarmtimer_rtc_timer_init(void) { }
120#endif 126#endif
121 127
122/** 128/**
@@ -783,6 +789,8 @@ static int __init alarmtimer_init(void)
783 .nsleep = alarm_timer_nsleep, 789 .nsleep = alarm_timer_nsleep,
784 }; 790 };
785 791
792 alarmtimer_rtc_timer_init();
793
786 posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); 794 posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock);
787 posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); 795 posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock);
788 796
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index a45ca167ab24..c9583382141a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -500,7 +500,7 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
500{ 500{
501 u64 ret; 501 u64 ret;
502 /* 502 /*
503 * We won't try to correct for more then 11% adjustments (110,000 ppm), 503 * We won't try to correct for more than 11% adjustments (110,000 ppm),
504 */ 504 */
505 ret = (u64)cs->mult * 11; 505 ret = (u64)cs->mult * 11;
506 do_div(ret,100); 506 do_div(ret,100);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index f6117a4c7cb8..f03fd83b170b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -22,17 +22,18 @@
22 * NTP timekeeping variables: 22 * NTP timekeeping variables:
23 */ 23 */
24 24
25DEFINE_SPINLOCK(ntp_lock);
26
27
25/* USER_HZ period (usecs): */ 28/* USER_HZ period (usecs): */
26unsigned long tick_usec = TICK_USEC; 29unsigned long tick_usec = TICK_USEC;
27 30
28/* ACTHZ period (nsecs): */ 31/* ACTHZ period (nsecs): */
29unsigned long tick_nsec; 32unsigned long tick_nsec;
30 33
31u64 tick_length; 34static u64 tick_length;
32static u64 tick_length_base; 35static u64 tick_length_base;
33 36
34static struct hrtimer leap_timer;
35
36#define MAX_TICKADJ 500LL /* usecs */ 37#define MAX_TICKADJ 500LL /* usecs */
37#define MAX_TICKADJ_SCALED \ 38#define MAX_TICKADJ_SCALED \
38 (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) 39 (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
@@ -49,7 +50,7 @@ static struct hrtimer leap_timer;
49static int time_state = TIME_OK; 50static int time_state = TIME_OK;
50 51
51/* clock status bits: */ 52/* clock status bits: */
52int time_status = STA_UNSYNC; 53static int time_status = STA_UNSYNC;
53 54
54/* TAI offset (secs): */ 55/* TAI offset (secs): */
55static long time_tai; 56static long time_tai;
@@ -133,7 +134,7 @@ static inline void pps_reset_freq_interval(void)
133/** 134/**
134 * pps_clear - Clears the PPS state variables 135 * pps_clear - Clears the PPS state variables
135 * 136 *
136 * Must be called while holding a write on the xtime_lock 137 * Must be called while holding a write on the ntp_lock
137 */ 138 */
138static inline void pps_clear(void) 139static inline void pps_clear(void)
139{ 140{
@@ -149,7 +150,7 @@ static inline void pps_clear(void)
149 * the last PPS signal. When it reaches 0, indicate that PPS signal is 150 * the last PPS signal. When it reaches 0, indicate that PPS signal is
150 * missing. 151 * missing.
151 * 152 *
152 * Must be called while holding a write on the xtime_lock 153 * Must be called while holding a write on the ntp_lock
153 */ 154 */
154static inline void pps_dec_valid(void) 155static inline void pps_dec_valid(void)
155{ 156{
@@ -233,6 +234,17 @@ static inline void pps_fill_timex(struct timex *txc)
233 234
234#endif /* CONFIG_NTP_PPS */ 235#endif /* CONFIG_NTP_PPS */
235 236
237
238/**
239 * ntp_synced - Returns 1 if the NTP status is not UNSYNC
240 *
241 */
242static inline int ntp_synced(void)
243{
244 return !(time_status & STA_UNSYNC);
245}
246
247
236/* 248/*
237 * NTP methods: 249 * NTP methods:
238 */ 250 */
@@ -275,7 +287,7 @@ static inline s64 ntp_update_offset_fll(s64 offset64, long secs)
275 287
276 time_status |= STA_MODE; 288 time_status |= STA_MODE;
277 289
278 return div_s64(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); 290 return div64_long(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs);
279} 291}
280 292
281static void ntp_update_offset(long offset) 293static void ntp_update_offset(long offset)
@@ -330,11 +342,13 @@ static void ntp_update_offset(long offset)
330 342
331/** 343/**
332 * ntp_clear - Clears the NTP state variables 344 * ntp_clear - Clears the NTP state variables
333 *
334 * Must be called while holding a write on the xtime_lock
335 */ 345 */
336void ntp_clear(void) 346void ntp_clear(void)
337{ 347{
348 unsigned long flags;
349
350 spin_lock_irqsave(&ntp_lock, flags);
351
338 time_adjust = 0; /* stop active adjtime() */ 352 time_adjust = 0; /* stop active adjtime() */
339 time_status |= STA_UNSYNC; 353 time_status |= STA_UNSYNC;
340 time_maxerror = NTP_PHASE_LIMIT; 354 time_maxerror = NTP_PHASE_LIMIT;
@@ -347,63 +361,81 @@ void ntp_clear(void)
347 361
348 /* Clear PPS state variables */ 362 /* Clear PPS state variables */
349 pps_clear(); 363 pps_clear();
364 spin_unlock_irqrestore(&ntp_lock, flags);
365
366}
367
368
369u64 ntp_tick_length(void)
370{
371 unsigned long flags;
372 s64 ret;
373
374 spin_lock_irqsave(&ntp_lock, flags);
375 ret = tick_length;
376 spin_unlock_irqrestore(&ntp_lock, flags);
377 return ret;
350} 378}
351 379
380
352/* 381/*
353 * Leap second processing. If in leap-insert state at the end of the 382 * this routine handles the overflow of the microsecond field
354 * day, the system clock is set back one second; if in leap-delete 383 *
355 * state, the system clock is set ahead one second. 384 * The tricky bits of code to handle the accurate clock support
385 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
386 * They were originally developed for SUN and DEC kernels.
387 * All the kudos should go to Dave for this stuff.
388 *
389 * Also handles leap second processing, and returns leap offset
356 */ 390 */
357static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) 391int second_overflow(unsigned long secs)
358{ 392{
359 enum hrtimer_restart res = HRTIMER_NORESTART; 393 s64 delta;
394 int leap = 0;
395 unsigned long flags;
360 396
361 write_seqlock(&xtime_lock); 397 spin_lock_irqsave(&ntp_lock, flags);
362 398
399 /*
400 * Leap second processing. If in leap-insert state at the end of the
401 * day, the system clock is set back one second; if in leap-delete
402 * state, the system clock is set ahead one second.
403 */
363 switch (time_state) { 404 switch (time_state) {
364 case TIME_OK: 405 case TIME_OK:
406 if (time_status & STA_INS)
407 time_state = TIME_INS;
408 else if (time_status & STA_DEL)
409 time_state = TIME_DEL;
365 break; 410 break;
366 case TIME_INS: 411 case TIME_INS:
367 timekeeping_leap_insert(-1); 412 if (secs % 86400 == 0) {
368 time_state = TIME_OOP; 413 leap = -1;
369 printk(KERN_NOTICE 414 time_state = TIME_OOP;
370 "Clock: inserting leap second 23:59:60 UTC\n"); 415 printk(KERN_NOTICE
371 hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC); 416 "Clock: inserting leap second 23:59:60 UTC\n");
372 res = HRTIMER_RESTART; 417 }
373 break; 418 break;
374 case TIME_DEL: 419 case TIME_DEL:
375 timekeeping_leap_insert(1); 420 if ((secs + 1) % 86400 == 0) {
376 time_tai--; 421 leap = 1;
377 time_state = TIME_WAIT; 422 time_tai--;
378 printk(KERN_NOTICE 423 time_state = TIME_WAIT;
379 "Clock: deleting leap second 23:59:59 UTC\n"); 424 printk(KERN_NOTICE
425 "Clock: deleting leap second 23:59:59 UTC\n");
426 }
380 break; 427 break;
381 case TIME_OOP: 428 case TIME_OOP:
382 time_tai++; 429 time_tai++;
383 time_state = TIME_WAIT; 430 time_state = TIME_WAIT;
384 /* fall through */ 431 break;
432
385 case TIME_WAIT: 433 case TIME_WAIT:
386 if (!(time_status & (STA_INS | STA_DEL))) 434 if (!(time_status & (STA_INS | STA_DEL)))
387 time_state = TIME_OK; 435 time_state = TIME_OK;
388 break; 436 break;
389 } 437 }
390 438
391 write_sequnlock(&xtime_lock);
392
393 return res;
394}
395
396/*
397 * this routine handles the overflow of the microsecond field
398 *
399 * The tricky bits of code to handle the accurate clock support
400 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
401 * They were originally developed for SUN and DEC kernels.
402 * All the kudos should go to Dave for this stuff.
403 */
404void second_overflow(void)
405{
406 s64 delta;
407 439
408 /* Bump the maxerror field */ 440 /* Bump the maxerror field */
409 time_maxerror += MAXFREQ / NSEC_PER_USEC; 441 time_maxerror += MAXFREQ / NSEC_PER_USEC;
@@ -423,30 +455,34 @@ void second_overflow(void)
423 pps_dec_valid(); 455 pps_dec_valid();
424 456
425 if (!time_adjust) 457 if (!time_adjust)
426 return; 458 goto out;
427 459
428 if (time_adjust > MAX_TICKADJ) { 460 if (time_adjust > MAX_TICKADJ) {
429 time_adjust -= MAX_TICKADJ; 461 time_adjust -= MAX_TICKADJ;
430 tick_length += MAX_TICKADJ_SCALED; 462 tick_length += MAX_TICKADJ_SCALED;
431 return; 463 goto out;
432 } 464 }
433 465
434 if (time_adjust < -MAX_TICKADJ) { 466 if (time_adjust < -MAX_TICKADJ) {
435 time_adjust += MAX_TICKADJ; 467 time_adjust += MAX_TICKADJ;
436 tick_length -= MAX_TICKADJ_SCALED; 468 tick_length -= MAX_TICKADJ_SCALED;
437 return; 469 goto out;
438 } 470 }
439 471
440 tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) 472 tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ)
441 << NTP_SCALE_SHIFT; 473 << NTP_SCALE_SHIFT;
442 time_adjust = 0; 474 time_adjust = 0;
475
476
477
478out:
479 spin_unlock_irqrestore(&ntp_lock, flags);
480
481 return leap;
443} 482}
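
The leap offset returned here is intended to be applied by the timekeeping core when it accumulates a full second; a hedged sketch of that caller-side pattern (field names follow the timekeeper layout introduced later in this diff):

/* Sketch of the per-second accumulation path in the timekeeping core. */
while (timekeeper.xtime.tv_nsec >= NSEC_PER_SEC) {
	int leap;

	timekeeper.xtime.tv_nsec -= NSEC_PER_SEC;
	timekeeper.xtime.tv_sec++;

	/* Ask NTP whether this second boundary inserts or deletes a leap second. */
	leap = second_overflow(timekeeper.xtime.tv_sec);
	timekeeper.xtime.tv_sec += leap;
	timekeeper.wall_to_monotonic.tv_sec -= leap;
}
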
444 483
445#ifdef CONFIG_GENERIC_CMOS_UPDATE 484#ifdef CONFIG_GENERIC_CMOS_UPDATE
446 485
447/* Disable the cmos update - used by virtualization and embedded */
448int no_sync_cmos_clock __read_mostly;
449
450static void sync_cmos_clock(struct work_struct *work); 486static void sync_cmos_clock(struct work_struct *work);
451 487
452static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); 488static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
@@ -493,35 +529,13 @@ static void sync_cmos_clock(struct work_struct *work)
493 529
494static void notify_cmos_timer(void) 530static void notify_cmos_timer(void)
495{ 531{
496 if (!no_sync_cmos_clock) 532 schedule_delayed_work(&sync_cmos_work, 0);
497 schedule_delayed_work(&sync_cmos_work, 0);
498} 533}
499 534
500#else 535#else
501static inline void notify_cmos_timer(void) { } 536static inline void notify_cmos_timer(void) { }
502#endif 537#endif
503 538
504/*
505 * Start the leap seconds timer:
506 */
507static inline void ntp_start_leap_timer(struct timespec *ts)
508{
509 long now = ts->tv_sec;
510
511 if (time_status & STA_INS) {
512 time_state = TIME_INS;
513 now += 86400 - now % 86400;
514 hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
515
516 return;
517 }
518
519 if (time_status & STA_DEL) {
520 time_state = TIME_DEL;
521 now += 86400 - (now + 1) % 86400;
522 hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
523 }
524}
525 539
526/* 540/*
527 * Propagate a new txc->status value into the NTP state: 541 * Propagate a new txc->status value into the NTP state:
@@ -546,22 +560,6 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
546 time_status &= STA_RONLY; 560 time_status &= STA_RONLY;
547 time_status |= txc->status & ~STA_RONLY; 561 time_status |= txc->status & ~STA_RONLY;
548 562
549 switch (time_state) {
550 case TIME_OK:
551 ntp_start_leap_timer(ts);
552 break;
553 case TIME_INS:
554 case TIME_DEL:
555 time_state = TIME_OK;
556 ntp_start_leap_timer(ts);
557 case TIME_WAIT:
558 if (!(time_status & (STA_INS | STA_DEL)))
559 time_state = TIME_OK;
560 break;
561 case TIME_OOP:
562 hrtimer_restart(&leap_timer);
563 break;
564 }
565} 563}
566/* 564/*
567 * Called with the xtime lock held, so we can access and modify 565 * Called with the xtime lock held, so we can access and modify
@@ -643,9 +641,6 @@ int do_adjtimex(struct timex *txc)
643 (txc->tick < 900000/USER_HZ || 641 (txc->tick < 900000/USER_HZ ||
644 txc->tick > 1100000/USER_HZ)) 642 txc->tick > 1100000/USER_HZ))
645 return -EINVAL; 643 return -EINVAL;
646
647 if (txc->modes & ADJ_STATUS && time_state != TIME_OK)
648 hrtimer_cancel(&leap_timer);
649 } 644 }
650 645
651 if (txc->modes & ADJ_SETOFFSET) { 646 if (txc->modes & ADJ_SETOFFSET) {
@@ -663,7 +658,7 @@ int do_adjtimex(struct timex *txc)
663 658
664 getnstimeofday(&ts); 659 getnstimeofday(&ts);
665 660
666 write_seqlock_irq(&xtime_lock); 661 spin_lock_irq(&ntp_lock);
667 662
668 if (txc->modes & ADJ_ADJTIME) { 663 if (txc->modes & ADJ_ADJTIME) {
669 long save_adjust = time_adjust; 664 long save_adjust = time_adjust;
@@ -705,7 +700,7 @@ int do_adjtimex(struct timex *txc)
705 /* fill PPS status fields */ 700 /* fill PPS status fields */
706 pps_fill_timex(txc); 701 pps_fill_timex(txc);
707 702
708 write_sequnlock_irq(&xtime_lock); 703 spin_unlock_irq(&ntp_lock);
709 704
710 txc->time.tv_sec = ts.tv_sec; 705 txc->time.tv_sec = ts.tv_sec;
711 txc->time.tv_usec = ts.tv_nsec; 706 txc->time.tv_usec = ts.tv_nsec;
@@ -903,7 +898,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
903 898
904 pts_norm = pps_normalize_ts(*phase_ts); 899 pts_norm = pps_normalize_ts(*phase_ts);
905 900
906 write_seqlock_irqsave(&xtime_lock, flags); 901 spin_lock_irqsave(&ntp_lock, flags);
907 902
908 /* clear the error bits, they will be set again if needed */ 903 /* clear the error bits, they will be set again if needed */
909 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); 904 time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
@@ -916,7 +911,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
916 * just start the frequency interval */ 911 * just start the frequency interval */
917 if (unlikely(pps_fbase.tv_sec == 0)) { 912 if (unlikely(pps_fbase.tv_sec == 0)) {
918 pps_fbase = *raw_ts; 913 pps_fbase = *raw_ts;
919 write_sequnlock_irqrestore(&xtime_lock, flags); 914 spin_unlock_irqrestore(&ntp_lock, flags);
920 return; 915 return;
921 } 916 }
922 917
@@ -931,7 +926,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
931 time_status |= STA_PPSJITTER; 926 time_status |= STA_PPSJITTER;
932 /* restart the frequency calibration interval */ 927 /* restart the frequency calibration interval */
933 pps_fbase = *raw_ts; 928 pps_fbase = *raw_ts;
934 write_sequnlock_irqrestore(&xtime_lock, flags); 929 spin_unlock_irqrestore(&ntp_lock, flags);
935 pr_err("hardpps: PPSJITTER: bad pulse\n"); 930 pr_err("hardpps: PPSJITTER: bad pulse\n");
936 return; 931 return;
937 } 932 }
@@ -948,7 +943,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
948 943
949 hardpps_update_phase(pts_norm.nsec); 944 hardpps_update_phase(pts_norm.nsec);
950 945
951 write_sequnlock_irqrestore(&xtime_lock, flags); 946 spin_unlock_irqrestore(&ntp_lock, flags);
952} 947}
953EXPORT_SYMBOL(hardpps); 948EXPORT_SYMBOL(hardpps);
954 949
@@ -967,6 +962,4 @@ __setup("ntp_tick_adj=", ntp_tick_adj_setup);
967void __init ntp_init(void) 962void __init ntp_init(void)
968{ 963{
969 ntp_clear(); 964 ntp_clear();
970 hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
971 leap_timer.function = ntp_leap_second;
972} 965}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index fd4a7b1625a2..e883f57a3cd3 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -575,11 +575,15 @@ void tick_broadcast_switch_to_oneshot(void)
575 unsigned long flags; 575 unsigned long flags;
576 576
577 raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 577 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
578 if (cpumask_empty(tick_get_broadcast_mask()))
579 goto end;
578 580
579 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; 581 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;
580 bc = tick_broadcast_device.evtdev; 582 bc = tick_broadcast_device.evtdev;
581 if (bc) 583 if (bc)
582 tick_broadcast_setup_oneshot(bc); 584 tick_broadcast_setup_oneshot(bc);
585
586end:
583 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); 587 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
584} 588}
585 589
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 7656642e4b8e..3526038f2836 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -182,11 +182,7 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now)
182 182
183static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) 183static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
184{ 184{
185 ktime_t now; 185 ktime_t now = ktime_get();
186
187 now = ktime_get();
188
189 update_ts_time_stats(cpu, ts, now, NULL);
190 186
191 ts->idle_entrytime = now; 187 ts->idle_entrytime = now;
192 ts->idle_active = 1; 188 ts->idle_active = 1;
@@ -562,20 +558,21 @@ void tick_nohz_idle_exit(void)
562 558
563 local_irq_disable(); 559 local_irq_disable();
564 560
565 if (ts->idle_active || (ts->inidle && ts->tick_stopped)) 561 WARN_ON_ONCE(!ts->inidle);
562
563 ts->inidle = 0;
564
565 if (ts->idle_active || ts->tick_stopped)
566 now = ktime_get(); 566 now = ktime_get();
567 567
568 if (ts->idle_active) 568 if (ts->idle_active)
569 tick_nohz_stop_idle(cpu, now); 569 tick_nohz_stop_idle(cpu, now);
570 570
571 if (!ts->inidle || !ts->tick_stopped) { 571 if (!ts->tick_stopped) {
572 ts->inidle = 0;
573 local_irq_enable(); 572 local_irq_enable();
574 return; 573 return;
575 } 574 }
576 575
577 ts->inidle = 0;
578
579 /* Update jiffies first */ 576 /* Update jiffies first */
580 select_nohz_load_balancer(0); 577 select_nohz_load_balancer(0);
581 tick_do_update_jiffies64(now); 578 tick_do_update_jiffies64(now);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 0c6358186401..d66b21308f7c 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -25,6 +25,8 @@
25struct timekeeper { 25struct timekeeper {
26 /* Current clocksource used for timekeeping. */ 26 /* Current clocksource used for timekeeping. */
27 struct clocksource *clock; 27 struct clocksource *clock;
28 /* NTP adjusted clock multiplier */
29 u32 mult;
28 /* The shift value of the current clocksource. */ 30 /* The shift value of the current clocksource. */
29 int shift; 31 int shift;
30 32
@@ -45,12 +47,47 @@ struct timekeeper {
45 /* Shift conversion between clock shifted nano seconds and 47 /* Shift conversion between clock shifted nano seconds and
46 * ntp shifted nano seconds. */ 48 * ntp shifted nano seconds. */
47 int ntp_error_shift; 49 int ntp_error_shift;
48 /* NTP adjusted clock multiplier */ 50
49 u32 mult; 51 /* The current time */
52 struct timespec xtime;
53 /*
54 * wall_to_monotonic is what we need to add to xtime (or xtime corrected
55 * for sub jiffie times) to get to monotonic time. Monotonic is pegged
56 * at zero at system boot time, so wall_to_monotonic will be negative,
57 * however, we will ALWAYS keep the tv_nsec part positive so we can use
58 * the usual normalization.
59 *
60 * wall_to_monotonic is moved after resume from suspend for the
61 * monotonic time not to jump. We need to add total_sleep_time to
62 * wall_to_monotonic to get the real boot based time offset.
63 *
64 * - wall_to_monotonic is no longer the boot time, getboottime must be
65 * used instead.
66 */
67 struct timespec wall_to_monotonic;
68 /* time spent in suspend */
69 struct timespec total_sleep_time;
70 /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */
71 struct timespec raw_time;
72
73 /* Seqlock for all timekeeper values */
74 seqlock_t lock;
50}; 75};
51 76
52static struct timekeeper timekeeper; 77static struct timekeeper timekeeper;
53 78
79/*
80 * This read-write spinlock protects us from races in SMP while
81 * playing with xtime.
82 */
83__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
84
85
86/* flag for if timekeeping is suspended */
87int __read_mostly timekeeping_suspended;
88
89
90
54/** 91/**
55 * timekeeper_setup_internals - Set up internals to use clocksource clock. 92 * timekeeper_setup_internals - Set up internals to use clocksource clock.
56 * 93 *
@@ -135,49 +172,18 @@ static inline s64 timekeeping_get_ns_raw(void)
135 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 172 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
136} 173}
137 174
138/* 175/* must hold write on timekeeper.lock */
139 * This read-write spinlock protects us from races in SMP while 176static void timekeeping_update(bool clearntp)
140 * playing with xtime.
141 */
142__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
143
144
145/*
146 * The current time
147 * wall_to_monotonic is what we need to add to xtime (or xtime corrected
148 * for sub jiffie times) to get to monotonic time. Monotonic is pegged
149 * at zero at system boot time, so wall_to_monotonic will be negative,
150 * however, we will ALWAYS keep the tv_nsec part positive so we can use
151 * the usual normalization.
152 *
153 * wall_to_monotonic is moved after resume from suspend for the monotonic
154 * time not to jump. We need to add total_sleep_time to wall_to_monotonic
155 * to get the real boot based time offset.
156 *
157 * - wall_to_monotonic is no longer the boot time, getboottime must be
158 * used instead.
159 */
160static struct timespec xtime __attribute__ ((aligned (16)));
161static struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
162static struct timespec total_sleep_time;
163
164/*
165 * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock.
166 */
167static struct timespec raw_time;
168
169/* flag for if timekeeping is suspended */
170int __read_mostly timekeeping_suspended;
171
172/* must hold xtime_lock */
173void timekeeping_leap_insert(int leapsecond)
174{ 177{
175 xtime.tv_sec += leapsecond; 178 if (clearntp) {
176 wall_to_monotonic.tv_sec -= leapsecond; 179 timekeeper.ntp_error = 0;
177 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, 180 ntp_clear();
178 timekeeper.mult); 181 }
182 update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic,
183 timekeeper.clock, timekeeper.mult);
179} 184}
180 185
186
181/** 187/**
182 * timekeeping_forward_now - update clock to the current time 188 * timekeeping_forward_now - update clock to the current time
183 * 189 *
@@ -202,10 +208,10 @@ static void timekeeping_forward_now(void)
202 /* If arch requires, add in gettimeoffset() */ 208 /* If arch requires, add in gettimeoffset() */
203 nsec += arch_gettimeoffset(); 209 nsec += arch_gettimeoffset();
204 210
205 timespec_add_ns(&xtime, nsec); 211 timespec_add_ns(&timekeeper.xtime, nsec);
206 212
207 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 213 nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
208 timespec_add_ns(&raw_time, nsec); 214 timespec_add_ns(&timekeeper.raw_time, nsec);
209} 215}
210 216
211/** 217/**
@@ -222,15 +228,15 @@ void getnstimeofday(struct timespec *ts)
222 WARN_ON(timekeeping_suspended); 228 WARN_ON(timekeeping_suspended);
223 229
224 do { 230 do {
225 seq = read_seqbegin(&xtime_lock); 231 seq = read_seqbegin(&timekeeper.lock);
226 232
227 *ts = xtime; 233 *ts = timekeeper.xtime;
228 nsecs = timekeeping_get_ns(); 234 nsecs = timekeeping_get_ns();
229 235
230 /* If arch requires, add in gettimeoffset() */ 236 /* If arch requires, add in gettimeoffset() */
231 nsecs += arch_gettimeoffset(); 237 nsecs += arch_gettimeoffset();
232 238
233 } while (read_seqretry(&xtime_lock, seq)); 239 } while (read_seqretry(&timekeeper.lock, seq));
234 240
235 timespec_add_ns(ts, nsecs); 241 timespec_add_ns(ts, nsecs);
236} 242}
@@ -245,14 +251,16 @@ ktime_t ktime_get(void)
245 WARN_ON(timekeeping_suspended); 251 WARN_ON(timekeeping_suspended);
246 252
247 do { 253 do {
248 seq = read_seqbegin(&xtime_lock); 254 seq = read_seqbegin(&timekeeper.lock);
249 secs = xtime.tv_sec + wall_to_monotonic.tv_sec; 255 secs = timekeeper.xtime.tv_sec +
250 nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; 256 timekeeper.wall_to_monotonic.tv_sec;
257 nsecs = timekeeper.xtime.tv_nsec +
258 timekeeper.wall_to_monotonic.tv_nsec;
251 nsecs += timekeeping_get_ns(); 259 nsecs += timekeeping_get_ns();
252 /* If arch requires, add in gettimeoffset() */ 260 /* If arch requires, add in gettimeoffset() */
253 nsecs += arch_gettimeoffset(); 261 nsecs += arch_gettimeoffset();
254 262
255 } while (read_seqretry(&xtime_lock, seq)); 263 } while (read_seqretry(&timekeeper.lock, seq));
256 /* 264 /*
257 * Use ktime_set/ktime_add_ns to create a proper ktime on 265 * Use ktime_set/ktime_add_ns to create a proper ktime on
258 * 32-bit architectures without CONFIG_KTIME_SCALAR. 266 * 32-bit architectures without CONFIG_KTIME_SCALAR.
@@ -278,14 +286,14 @@ void ktime_get_ts(struct timespec *ts)
278 WARN_ON(timekeeping_suspended); 286 WARN_ON(timekeeping_suspended);
279 287
280 do { 288 do {
281 seq = read_seqbegin(&xtime_lock); 289 seq = read_seqbegin(&timekeeper.lock);
282 *ts = xtime; 290 *ts = timekeeper.xtime;
283 tomono = wall_to_monotonic; 291 tomono = timekeeper.wall_to_monotonic;
284 nsecs = timekeeping_get_ns(); 292 nsecs = timekeeping_get_ns();
285 /* If arch requires, add in gettimeoffset() */ 293 /* If arch requires, add in gettimeoffset() */
286 nsecs += arch_gettimeoffset(); 294 nsecs += arch_gettimeoffset();
287 295
288 } while (read_seqretry(&xtime_lock, seq)); 296 } while (read_seqretry(&timekeeper.lock, seq));
289 297
290 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, 298 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
291 ts->tv_nsec + tomono.tv_nsec + nsecs); 299 ts->tv_nsec + tomono.tv_nsec + nsecs);
@@ -313,10 +321,10 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
313 do { 321 do {
314 u32 arch_offset; 322 u32 arch_offset;
315 323
316 seq = read_seqbegin(&xtime_lock); 324 seq = read_seqbegin(&timekeeper.lock);
317 325
318 *ts_raw = raw_time; 326 *ts_raw = timekeeper.raw_time;
319 *ts_real = xtime; 327 *ts_real = timekeeper.xtime;
320 328
321 nsecs_raw = timekeeping_get_ns_raw(); 329 nsecs_raw = timekeeping_get_ns_raw();
322 nsecs_real = timekeeping_get_ns(); 330 nsecs_real = timekeeping_get_ns();
@@ -326,7 +334,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
326 nsecs_raw += arch_offset; 334 nsecs_raw += arch_offset;
327 nsecs_real += arch_offset; 335 nsecs_real += arch_offset;
328 336
329 } while (read_seqretry(&xtime_lock, seq)); 337 } while (read_seqretry(&timekeeper.lock, seq));
330 338
331 timespec_add_ns(ts_raw, nsecs_raw); 339 timespec_add_ns(ts_raw, nsecs_raw);
332 timespec_add_ns(ts_real, nsecs_real); 340 timespec_add_ns(ts_real, nsecs_real);
@@ -365,23 +373,19 @@ int do_settimeofday(const struct timespec *tv)
365 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) 373 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
366 return -EINVAL; 374 return -EINVAL;
367 375
368 write_seqlock_irqsave(&xtime_lock, flags); 376 write_seqlock_irqsave(&timekeeper.lock, flags);
369 377
370 timekeeping_forward_now(); 378 timekeeping_forward_now();
371 379
372 ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; 380 ts_delta.tv_sec = tv->tv_sec - timekeeper.xtime.tv_sec;
373 ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; 381 ts_delta.tv_nsec = tv->tv_nsec - timekeeper.xtime.tv_nsec;
374 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta); 382 timekeeper.wall_to_monotonic =
383 timespec_sub(timekeeper.wall_to_monotonic, ts_delta);
375 384
376 xtime = *tv; 385 timekeeper.xtime = *tv;
377 386 timekeeping_update(true);
378 timekeeper.ntp_error = 0;
379 ntp_clear();
380 387
381 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, 388 write_sequnlock_irqrestore(&timekeeper.lock, flags);
382 timekeeper.mult);
383
384 write_sequnlock_irqrestore(&xtime_lock, flags);
385 389
386 /* signal hrtimers about time change */ 390 /* signal hrtimers about time change */
387 clock_was_set(); 391 clock_was_set();
@@ -405,20 +409,17 @@ int timekeeping_inject_offset(struct timespec *ts)
405 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) 409 if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
406 return -EINVAL; 410 return -EINVAL;
407 411
408 write_seqlock_irqsave(&xtime_lock, flags); 412 write_seqlock_irqsave(&timekeeper.lock, flags);
409 413
410 timekeeping_forward_now(); 414 timekeeping_forward_now();
411 415
412 xtime = timespec_add(xtime, *ts); 416 timekeeper.xtime = timespec_add(timekeeper.xtime, *ts);
413 wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts); 417 timekeeper.wall_to_monotonic =
414 418 timespec_sub(timekeeper.wall_to_monotonic, *ts);
415 timekeeper.ntp_error = 0;
416 ntp_clear();
417 419
418 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, 420 timekeeping_update(true);
419 timekeeper.mult);
420 421
421 write_sequnlock_irqrestore(&xtime_lock, flags); 422 write_sequnlock_irqrestore(&timekeeper.lock, flags);
422 423
423 /* signal hrtimers about time change */ 424 /* signal hrtimers about time change */
424 clock_was_set(); 425 clock_was_set();
@@ -435,9 +436,12 @@ EXPORT_SYMBOL(timekeeping_inject_offset);
435static int change_clocksource(void *data) 436static int change_clocksource(void *data)
436{ 437{
437 struct clocksource *new, *old; 438 struct clocksource *new, *old;
439 unsigned long flags;
438 440
439 new = (struct clocksource *) data; 441 new = (struct clocksource *) data;
440 442
443 write_seqlock_irqsave(&timekeeper.lock, flags);
444
441 timekeeping_forward_now(); 445 timekeeping_forward_now();
442 if (!new->enable || new->enable(new) == 0) { 446 if (!new->enable || new->enable(new) == 0) {
443 old = timekeeper.clock; 447 old = timekeeper.clock;
@@ -445,6 +449,10 @@ static int change_clocksource(void *data)
445 if (old->disable) 449 if (old->disable)
446 old->disable(old); 450 old->disable(old);
447 } 451 }
452 timekeeping_update(true);
453
454 write_sequnlock_irqrestore(&timekeeper.lock, flags);
455
448 return 0; 456 return 0;
449} 457}
450 458
@@ -490,11 +498,11 @@ void getrawmonotonic(struct timespec *ts)
490 s64 nsecs; 498 s64 nsecs;
491 499
492 do { 500 do {
493 seq = read_seqbegin(&xtime_lock); 501 seq = read_seqbegin(&timekeeper.lock);
494 nsecs = timekeeping_get_ns_raw(); 502 nsecs = timekeeping_get_ns_raw();
495 *ts = raw_time; 503 *ts = timekeeper.raw_time;
496 504
497 } while (read_seqretry(&xtime_lock, seq)); 505 } while (read_seqretry(&timekeeper.lock, seq));
498 506
499 timespec_add_ns(ts, nsecs); 507 timespec_add_ns(ts, nsecs);
500} 508}
@@ -510,24 +518,30 @@ int timekeeping_valid_for_hres(void)
510 int ret; 518 int ret;
511 519
512 do { 520 do {
513 seq = read_seqbegin(&xtime_lock); 521 seq = read_seqbegin(&timekeeper.lock);
514 522
515 ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; 523 ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
516 524
517 } while (read_seqretry(&xtime_lock, seq)); 525 } while (read_seqretry(&timekeeper.lock, seq));
518 526
519 return ret; 527 return ret;
520} 528}
521 529
522/** 530/**
523 * timekeeping_max_deferment - Returns max time the clocksource can be deferred 531 * timekeeping_max_deferment - Returns max time the clocksource can be deferred
524 *
525 * Caller must observe xtime_lock via read_seqbegin/read_seqretry to
526 * ensure that the clocksource does not change!
527 */ 532 */
528u64 timekeeping_max_deferment(void) 533u64 timekeeping_max_deferment(void)
529{ 534{
530 return timekeeper.clock->max_idle_ns; 535 unsigned long seq;
536 u64 ret;
537 do {
538 seq = read_seqbegin(&timekeeper.lock);
539
540 ret = timekeeper.clock->max_idle_ns;
541
542 } while (read_seqretry(&timekeeper.lock, seq));
543
544 return ret;
531} 545}
532 546
533/** 547/**
@@ -572,28 +586,29 @@ void __init timekeeping_init(void)
572 read_persistent_clock(&now); 586 read_persistent_clock(&now);
573 read_boot_clock(&boot); 587 read_boot_clock(&boot);
574 588
575 write_seqlock_irqsave(&xtime_lock, flags); 589 seqlock_init(&timekeeper.lock);
576 590
577 ntp_init(); 591 ntp_init();
578 592
593 write_seqlock_irqsave(&timekeeper.lock, flags);
579 clock = clocksource_default_clock(); 594 clock = clocksource_default_clock();
580 if (clock->enable) 595 if (clock->enable)
581 clock->enable(clock); 596 clock->enable(clock);
582 timekeeper_setup_internals(clock); 597 timekeeper_setup_internals(clock);
583 598
584 xtime.tv_sec = now.tv_sec; 599 timekeeper.xtime.tv_sec = now.tv_sec;
585 xtime.tv_nsec = now.tv_nsec; 600 timekeeper.xtime.tv_nsec = now.tv_nsec;
586 raw_time.tv_sec = 0; 601 timekeeper.raw_time.tv_sec = 0;
587 raw_time.tv_nsec = 0; 602 timekeeper.raw_time.tv_nsec = 0;
588 if (boot.tv_sec == 0 && boot.tv_nsec == 0) { 603 if (boot.tv_sec == 0 && boot.tv_nsec == 0) {
589 boot.tv_sec = xtime.tv_sec; 604 boot.tv_sec = timekeeper.xtime.tv_sec;
590 boot.tv_nsec = xtime.tv_nsec; 605 boot.tv_nsec = timekeeper.xtime.tv_nsec;
591 } 606 }
592 set_normalized_timespec(&wall_to_monotonic, 607 set_normalized_timespec(&timekeeper.wall_to_monotonic,
593 -boot.tv_sec, -boot.tv_nsec); 608 -boot.tv_sec, -boot.tv_nsec);
594 total_sleep_time.tv_sec = 0; 609 timekeeper.total_sleep_time.tv_sec = 0;
595 total_sleep_time.tv_nsec = 0; 610 timekeeper.total_sleep_time.tv_nsec = 0;
596 write_sequnlock_irqrestore(&xtime_lock, flags); 611 write_sequnlock_irqrestore(&timekeeper.lock, flags);
597} 612}
598 613
599/* time in seconds when suspend began */ 614/* time in seconds when suspend began */
@@ -614,9 +629,11 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta)
614 return; 629 return;
615 } 630 }
616 631
617 xtime = timespec_add(xtime, *delta); 632 timekeeper.xtime = timespec_add(timekeeper.xtime, *delta);
618 wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta); 633 timekeeper.wall_to_monotonic =
619 total_sleep_time = timespec_add(total_sleep_time, *delta); 634 timespec_sub(timekeeper.wall_to_monotonic, *delta);
635 timekeeper.total_sleep_time = timespec_add(
636 timekeeper.total_sleep_time, *delta);
620} 637}
621 638
622 639
@@ -640,17 +657,15 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
640 if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) 657 if (!(ts.tv_sec == 0 && ts.tv_nsec == 0))
641 return; 658 return;
642 659
643 write_seqlock_irqsave(&xtime_lock, flags); 660 write_seqlock_irqsave(&timekeeper.lock, flags);
661
644 timekeeping_forward_now(); 662 timekeeping_forward_now();
645 663
646 __timekeeping_inject_sleeptime(delta); 664 __timekeeping_inject_sleeptime(delta);
647 665
648 timekeeper.ntp_error = 0; 666 timekeeping_update(true);
649 ntp_clear();
650 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
651 timekeeper.mult);
652 667
653 write_sequnlock_irqrestore(&xtime_lock, flags); 668 write_sequnlock_irqrestore(&timekeeper.lock, flags);
654 669
655 /* signal hrtimers about time change */ 670 /* signal hrtimers about time change */
656 clock_was_set(); 671 clock_was_set();
@@ -673,7 +688,7 @@ static void timekeeping_resume(void)
673 688
674 clocksource_resume(); 689 clocksource_resume();
675 690
676 write_seqlock_irqsave(&xtime_lock, flags); 691 write_seqlock_irqsave(&timekeeper.lock, flags);
677 692
678 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { 693 if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
679 ts = timespec_sub(ts, timekeeping_suspend_time); 694 ts = timespec_sub(ts, timekeeping_suspend_time);
@@ -683,7 +698,7 @@ static void timekeeping_resume(void)
683 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 698 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
684 timekeeper.ntp_error = 0; 699 timekeeper.ntp_error = 0;
685 timekeeping_suspended = 0; 700 timekeeping_suspended = 0;
686 write_sequnlock_irqrestore(&xtime_lock, flags); 701 write_sequnlock_irqrestore(&timekeeper.lock, flags);
687 702
688 touch_softlockup_watchdog(); 703 touch_softlockup_watchdog();
689 704
@@ -701,7 +716,7 @@ static int timekeeping_suspend(void)
701 716
702 read_persistent_clock(&timekeeping_suspend_time); 717 read_persistent_clock(&timekeeping_suspend_time);
703 718
704 write_seqlock_irqsave(&xtime_lock, flags); 719 write_seqlock_irqsave(&timekeeper.lock, flags);
705 timekeeping_forward_now(); 720 timekeeping_forward_now();
706 timekeeping_suspended = 1; 721 timekeeping_suspended = 1;
707 722
@@ -711,7 +726,7 @@ static int timekeeping_suspend(void)
711 * try to compensate so the difference in system time 726 * try to compensate so the difference in system time
712 * and persistent_clock time stays close to constant. 727 * and persistent_clock time stays close to constant.
713 */ 728 */
714 delta = timespec_sub(xtime, timekeeping_suspend_time); 729 delta = timespec_sub(timekeeper.xtime, timekeeping_suspend_time);
715 delta_delta = timespec_sub(delta, old_delta); 730 delta_delta = timespec_sub(delta, old_delta);
716 if (abs(delta_delta.tv_sec) >= 2) { 731 if (abs(delta_delta.tv_sec) >= 2) {
717 /* 732 /*
@@ -724,7 +739,7 @@ static int timekeeping_suspend(void)
724 timekeeping_suspend_time = 739 timekeeping_suspend_time =
725 timespec_add(timekeeping_suspend_time, delta_delta); 740 timespec_add(timekeeping_suspend_time, delta_delta);
726 } 741 }
727 write_sequnlock_irqrestore(&xtime_lock, flags); 742 write_sequnlock_irqrestore(&timekeeper.lock, flags);
728 743
729 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 744 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
730 clocksource_suspend(); 745 clocksource_suspend();
@@ -775,7 +790,7 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
775 * Now calculate the error in (1 << look_ahead) ticks, but first 790 * Now calculate the error in (1 << look_ahead) ticks, but first
776 * remove the single look ahead already included in the error. 791 * remove the single look ahead already included in the error.
777 */ 792 */
778 tick_error = tick_length >> (timekeeper.ntp_error_shift + 1); 793 tick_error = ntp_tick_length() >> (timekeeper.ntp_error_shift + 1);
779 tick_error -= timekeeper.xtime_interval >> 1; 794 tick_error -= timekeeper.xtime_interval >> 1;
780 error = ((error - tick_error) >> look_ahead) + tick_error; 795 error = ((error - tick_error) >> look_ahead) + tick_error;
781 796
@@ -807,7 +822,7 @@ static void timekeeping_adjust(s64 offset)
807 int adj; 822 int adj;
808 823
809 /* 824 /*
810 * The point of this is to check if the error is greater then half 825 * The point of this is to check if the error is greater than half
811 * an interval. 826 * an interval.
812 * 827 *
813 * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. 828 * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs.
@@ -815,7 +830,7 @@ static void timekeeping_adjust(s64 offset)
815 * Note we subtract one in the shift, so that error is really error*2. 830 * Note we subtract one in the shift, so that error is really error*2.
816 * This "saves" dividing(shifting) interval twice, but keeps the 831 * This "saves" dividing(shifting) interval twice, but keeps the
817 * (error > interval) comparison as still measuring if error is 832 * (error > interval) comparison as still measuring if error is
818 * larger then half an interval. 833 * larger than half an interval.
819 * 834 *
820 * Note: It does not "save" on aggravation when reading the code. 835 * Note: It does not "save" on aggravation when reading the code.
821 */ 836 */
@@ -823,7 +838,7 @@ static void timekeeping_adjust(s64 offset)
823 if (error > interval) { 838 if (error > interval) {
824 /* 839 /*
825 * We now divide error by 4(via shift), which checks if 840 * We now divide error by 4(via shift), which checks if
826 * the error is greater then twice the interval. 841 * the error is greater than twice the interval.
827 * If it is greater, we need a bigadjust, if its smaller, 842 * If it is greater, we need a bigadjust, if its smaller,
828 * we can adjust by 1. 843 * we can adjust by 1.
829 */ 844 */
@@ -854,13 +869,15 @@ static void timekeeping_adjust(s64 offset)
854 } else /* No adjustment needed */ 869 } else /* No adjustment needed */
855 return; 870 return;
856 871
857 WARN_ONCE(timekeeper.clock->maxadj && 872 if (unlikely(timekeeper.clock->maxadj &&
858 (timekeeper.mult + adj > timekeeper.clock->mult + 873 (timekeeper.mult + adj >
859 timekeeper.clock->maxadj), 874 timekeeper.clock->mult + timekeeper.clock->maxadj))) {
860 "Adjusting %s more then 11%% (%ld vs %ld)\n", 875 printk_once(KERN_WARNING
876 "Adjusting %s more than 11%% (%ld vs %ld)\n",
861 timekeeper.clock->name, (long)timekeeper.mult + adj, 877 timekeeper.clock->name, (long)timekeeper.mult + adj,
862 (long)timekeeper.clock->mult + 878 (long)timekeeper.clock->mult +
863 timekeeper.clock->maxadj); 879 timekeeper.clock->maxadj);
880 }
864 /* 881 /*
865 * So the following can be confusing. 882 * So the following can be confusing.
866 * 883 *
@@ -932,7 +949,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
932 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; 949 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
933 u64 raw_nsecs; 950 u64 raw_nsecs;
934 951
935 /* If the offset is smaller then a shifted interval, do nothing */ 952 /* If the offset is smaller than a shifted interval, do nothing */
936 if (offset < timekeeper.cycle_interval<<shift) 953 if (offset < timekeeper.cycle_interval<<shift)
937 return offset; 954 return offset;
938 955
@@ -942,23 +959,25 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
942 959
943 timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; 960 timekeeper.xtime_nsec += timekeeper.xtime_interval << shift;
944 while (timekeeper.xtime_nsec >= nsecps) { 961 while (timekeeper.xtime_nsec >= nsecps) {
962 int leap;
945 timekeeper.xtime_nsec -= nsecps; 963 timekeeper.xtime_nsec -= nsecps;
946 xtime.tv_sec++; 964 timekeeper.xtime.tv_sec++;
947 second_overflow(); 965 leap = second_overflow(timekeeper.xtime.tv_sec);
966 timekeeper.xtime.tv_sec += leap;
948 } 967 }
949 968
950 /* Accumulate raw time */ 969 /* Accumulate raw time */
951 raw_nsecs = timekeeper.raw_interval << shift; 970 raw_nsecs = timekeeper.raw_interval << shift;
952 raw_nsecs += raw_time.tv_nsec; 971 raw_nsecs += timekeeper.raw_time.tv_nsec;
953 if (raw_nsecs >= NSEC_PER_SEC) { 972 if (raw_nsecs >= NSEC_PER_SEC) {
954 u64 raw_secs = raw_nsecs; 973 u64 raw_secs = raw_nsecs;
955 raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); 974 raw_nsecs = do_div(raw_secs, NSEC_PER_SEC);
956 raw_time.tv_sec += raw_secs; 975 timekeeper.raw_time.tv_sec += raw_secs;
957 } 976 }
958 raw_time.tv_nsec = raw_nsecs; 977 timekeeper.raw_time.tv_nsec = raw_nsecs;
959 978
960 /* Accumulate error between NTP and clock interval */ 979 /* Accumulate error between NTP and clock interval */
961 timekeeper.ntp_error += tick_length << shift; 980 timekeeper.ntp_error += ntp_tick_length() << shift;
962 timekeeper.ntp_error -= 981 timekeeper.ntp_error -=
963 (timekeeper.xtime_interval + timekeeper.xtime_remainder) << 982 (timekeeper.xtime_interval + timekeeper.xtime_remainder) <<
964 (timekeeper.ntp_error_shift + shift); 983 (timekeeper.ntp_error_shift + shift);
@@ -970,17 +989,19 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
970/** 989/**
971 * update_wall_time - Uses the current clocksource to increment the wall time 990 * update_wall_time - Uses the current clocksource to increment the wall time
972 * 991 *
973 * Called from the timer interrupt, must hold a write on xtime_lock.
974 */ 992 */
975static void update_wall_time(void) 993static void update_wall_time(void)
976{ 994{
977 struct clocksource *clock; 995 struct clocksource *clock;
978 cycle_t offset; 996 cycle_t offset;
979 int shift = 0, maxshift; 997 int shift = 0, maxshift;
998 unsigned long flags;
999
1000 write_seqlock_irqsave(&timekeeper.lock, flags);
980 1001
981 /* Make sure we're fully resumed: */ 1002 /* Make sure we're fully resumed: */
982 if (unlikely(timekeeping_suspended)) 1003 if (unlikely(timekeeping_suspended))
983 return; 1004 goto out;
984 1005
985 clock = timekeeper.clock; 1006 clock = timekeeper.clock;
986 1007
@@ -989,20 +1010,21 @@ static void update_wall_time(void)
989#else 1010#else
990 offset = (clock->read(clock) - clock->cycle_last) & clock->mask; 1011 offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
991#endif 1012#endif
992 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; 1013 timekeeper.xtime_nsec = (s64)timekeeper.xtime.tv_nsec <<
1014 timekeeper.shift;
993 1015
994 /* 1016 /*
995 * With NO_HZ we may have to accumulate many cycle_intervals 1017 * With NO_HZ we may have to accumulate many cycle_intervals
996 * (think "ticks") worth of time at once. To do this efficiently, 1018 * (think "ticks") worth of time at once. To do this efficiently,
997 * we calculate the largest doubling multiple of cycle_intervals 1019 * we calculate the largest doubling multiple of cycle_intervals
998 * that is smaller then the offset. We then accumulate that 1020 * that is smaller than the offset. We then accumulate that
999 * chunk in one go, and then try to consume the next smaller 1021 * chunk in one go, and then try to consume the next smaller
1000 * doubled multiple. 1022 * doubled multiple.
1001 */ 1023 */
1002 shift = ilog2(offset) - ilog2(timekeeper.cycle_interval); 1024 shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);
1003 shift = max(0, shift); 1025 shift = max(0, shift);
1004 /* Bound shift to one less then what overflows tick_length */ 1026 /* Bound shift to one less than what overflows tick_length */
1005 maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1; 1027 maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
1006 shift = min(shift, maxshift); 1028 shift = min(shift, maxshift);
1007 while (offset >= timekeeper.cycle_interval) { 1029 while (offset >= timekeeper.cycle_interval) {
1008 offset = logarithmic_accumulation(offset, shift); 1030 offset = logarithmic_accumulation(offset, shift);
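Editor's note: the comment in this hunk is the heart of the NO_HZ path — instead of accumulating one tick at a time, update_wall_time() consumes the pending offset in power-of-two multiples of cycle_interval, starting from the largest doubling that fits, and the kernel additionally clamps shift by maxshift so the ntp_error arithmetic cannot overflow. The stand-alone sketch below is illustrative only: plain unsigned integers stand in for cycle_t and the timekeeper fields, and a compiler builtin stands in for the kernel's ilog2().

/*
 * Toy model of the doubling accumulation described above.
 * Not kernel code: cycle_interval and offset are ordinary integers,
 * and ilog2() is approximated with __builtin_clzll().
 */
#include <stdio.h>

static int toy_ilog2(unsigned long long x)
{
	return 63 - __builtin_clzll(x);
}

int main(void)
{
	unsigned long long cycle_interval = 1000;	/* cycles per "tick" */
	unsigned long long offset = 37 * cycle_interval + 123;
	int shift = toy_ilog2(offset) - toy_ilog2(cycle_interval);

	if (shift < 0)
		shift = 0;
	while (offset >= cycle_interval) {
		if (offset >= (cycle_interval << shift)) {
			offset -= cycle_interval << shift;	/* one big chunk */
			printf("accumulated %llu cycles, %llu left (shift=%d)\n",
			       cycle_interval << shift, offset, shift);
		}
		if (shift && offset < (cycle_interval << shift))
			shift--;	/* fall back to the next smaller doubling */
	}
	return 0;
}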
@@ -1040,24 +1062,30 @@ static void update_wall_time(void)
1040 * Store full nanoseconds into xtime after rounding it up and 1062 * Store full nanoseconds into xtime after rounding it up and
1041 * add the remainder to the error difference. 1063 * add the remainder to the error difference.
1042 */ 1064 */
1043 xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1; 1065 timekeeper.xtime.tv_nsec = ((s64)timekeeper.xtime_nsec >>
1044 timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift; 1066 timekeeper.shift) + 1;
1067 timekeeper.xtime_nsec -= (s64)timekeeper.xtime.tv_nsec <<
1068 timekeeper.shift;
1045 timekeeper.ntp_error += timekeeper.xtime_nsec << 1069 timekeeper.ntp_error += timekeeper.xtime_nsec <<
1046 timekeeper.ntp_error_shift; 1070 timekeeper.ntp_error_shift;
1047 1071
1048 /* 1072 /*
1049 * Finally, make sure that after the rounding 1073 * Finally, make sure that after the rounding
1050 * xtime.tv_nsec isn't larger then NSEC_PER_SEC 1074 * xtime.tv_nsec isn't larger than NSEC_PER_SEC
1051 */ 1075 */
1052 if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) { 1076 if (unlikely(timekeeper.xtime.tv_nsec >= NSEC_PER_SEC)) {
1053 xtime.tv_nsec -= NSEC_PER_SEC; 1077 int leap;
1054 xtime.tv_sec++; 1078 timekeeper.xtime.tv_nsec -= NSEC_PER_SEC;
1055 second_overflow(); 1079 timekeeper.xtime.tv_sec++;
1080 leap = second_overflow(timekeeper.xtime.tv_sec);
1081 timekeeper.xtime.tv_sec += leap;
1056 } 1082 }
1057 1083
1058 /* check to see if there is a new clocksource to use */ 1084 timekeeping_update(false);
1059 update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, 1085
1060 timekeeper.mult); 1086out:
1087 write_sequnlock_irqrestore(&timekeeper.lock, flags);
1088
1061} 1089}
1062 1090
1063/** 1091/**
@@ -1074,8 +1102,10 @@ static void update_wall_time(void)
1074void getboottime(struct timespec *ts) 1102void getboottime(struct timespec *ts)
1075{ 1103{
1076 struct timespec boottime = { 1104 struct timespec boottime = {
1077 .tv_sec = wall_to_monotonic.tv_sec + total_sleep_time.tv_sec, 1105 .tv_sec = timekeeper.wall_to_monotonic.tv_sec +
1078 .tv_nsec = wall_to_monotonic.tv_nsec + total_sleep_time.tv_nsec 1106 timekeeper.total_sleep_time.tv_sec,
1107 .tv_nsec = timekeeper.wall_to_monotonic.tv_nsec +
1108 timekeeper.total_sleep_time.tv_nsec
1079 }; 1109 };
1080 1110
1081 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); 1111 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
@@ -1101,13 +1131,13 @@ void get_monotonic_boottime(struct timespec *ts)
1101 WARN_ON(timekeeping_suspended); 1131 WARN_ON(timekeeping_suspended);
1102 1132
1103 do { 1133 do {
1104 seq = read_seqbegin(&xtime_lock); 1134 seq = read_seqbegin(&timekeeper.lock);
1105 *ts = xtime; 1135 *ts = timekeeper.xtime;
1106 tomono = wall_to_monotonic; 1136 tomono = timekeeper.wall_to_monotonic;
1107 sleep = total_sleep_time; 1137 sleep = timekeeper.total_sleep_time;
1108 nsecs = timekeeping_get_ns(); 1138 nsecs = timekeeping_get_ns();
1109 1139
1110 } while (read_seqretry(&xtime_lock, seq)); 1140 } while (read_seqretry(&timekeeper.lock, seq));
1111 1141
1112 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, 1142 set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec,
1113 ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs); 1143 ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs);
@@ -1137,19 +1167,19 @@ EXPORT_SYMBOL_GPL(ktime_get_boottime);
1137 */ 1167 */
1138void monotonic_to_bootbased(struct timespec *ts) 1168void monotonic_to_bootbased(struct timespec *ts)
1139{ 1169{
1140 *ts = timespec_add(*ts, total_sleep_time); 1170 *ts = timespec_add(*ts, timekeeper.total_sleep_time);
1141} 1171}
1142EXPORT_SYMBOL_GPL(monotonic_to_bootbased); 1172EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
1143 1173
1144unsigned long get_seconds(void) 1174unsigned long get_seconds(void)
1145{ 1175{
1146 return xtime.tv_sec; 1176 return timekeeper.xtime.tv_sec;
1147} 1177}
1148EXPORT_SYMBOL(get_seconds); 1178EXPORT_SYMBOL(get_seconds);
1149 1179
1150struct timespec __current_kernel_time(void) 1180struct timespec __current_kernel_time(void)
1151{ 1181{
1152 return xtime; 1182 return timekeeper.xtime;
1153} 1183}
1154 1184
1155struct timespec current_kernel_time(void) 1185struct timespec current_kernel_time(void)
@@ -1158,10 +1188,10 @@ struct timespec current_kernel_time(void)
1158 unsigned long seq; 1188 unsigned long seq;
1159 1189
1160 do { 1190 do {
1161 seq = read_seqbegin(&xtime_lock); 1191 seq = read_seqbegin(&timekeeper.lock);
1162 1192
1163 now = xtime; 1193 now = timekeeper.xtime;
1164 } while (read_seqretry(&xtime_lock, seq)); 1194 } while (read_seqretry(&timekeeper.lock, seq));
1165 1195
1166 return now; 1196 return now;
1167} 1197}
@@ -1173,11 +1203,11 @@ struct timespec get_monotonic_coarse(void)
1173 unsigned long seq; 1203 unsigned long seq;
1174 1204
1175 do { 1205 do {
1176 seq = read_seqbegin(&xtime_lock); 1206 seq = read_seqbegin(&timekeeper.lock);
1177 1207
1178 now = xtime; 1208 now = timekeeper.xtime;
1179 mono = wall_to_monotonic; 1209 mono = timekeeper.wall_to_monotonic;
1180 } while (read_seqretry(&xtime_lock, seq)); 1210 } while (read_seqretry(&timekeeper.lock, seq));
1181 1211
1182 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, 1212 set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
1183 now.tv_nsec + mono.tv_nsec); 1213 now.tv_nsec + mono.tv_nsec);
@@ -1209,11 +1239,11 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1209 unsigned long seq; 1239 unsigned long seq;
1210 1240
1211 do { 1241 do {
1212 seq = read_seqbegin(&xtime_lock); 1242 seq = read_seqbegin(&timekeeper.lock);
1213 *xtim = xtime; 1243 *xtim = timekeeper.xtime;
1214 *wtom = wall_to_monotonic; 1244 *wtom = timekeeper.wall_to_monotonic;
1215 *sleep = total_sleep_time; 1245 *sleep = timekeeper.total_sleep_time;
1216 } while (read_seqretry(&xtime_lock, seq)); 1246 } while (read_seqretry(&timekeeper.lock, seq));
1217} 1247}
1218 1248
1219/** 1249/**
@@ -1225,11 +1255,14 @@ ktime_t ktime_get_monotonic_offset(void)
1225 struct timespec wtom; 1255 struct timespec wtom;
1226 1256
1227 do { 1257 do {
1228 seq = read_seqbegin(&xtime_lock); 1258 seq = read_seqbegin(&timekeeper.lock);
1229 wtom = wall_to_monotonic; 1259 wtom = timekeeper.wall_to_monotonic;
1230 } while (read_seqretry(&xtime_lock, seq)); 1260 } while (read_seqretry(&timekeeper.lock, seq));
1261
1231 return timespec_to_ktime(wtom); 1262 return timespec_to_ktime(wtom);
1232} 1263}
1264EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
1265
1233 1266
1234/** 1267/**
1235 * xtime_update() - advances the timekeeping infrastructure 1268 * xtime_update() - advances the timekeeping infrastructure
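Editor's note: every reader converted in this file follows the same discipline — sample the timekeeper fields inside a read_seqbegin()/read_seqretry() loop on timekeeper.lock and retry if a writer interleaved. As a minimal sketch, a hypothetical helper (not part of the patch) that only needs a consistent snapshot of xtime and wall_to_monotonic would be written like this:

/*
 * Hypothetical reader following the seqlock pattern used throughout
 * this patch: copy the fields, then retry if the sequence changed.
 */
static void timekeeping_snapshot(struct timespec *wall, struct timespec *mono)
{
	unsigned long seq;

	do {
		seq = read_seqbegin(&timekeeper.lock);
		*wall = timekeeper.xtime;
		*mono = timekeeper.wall_to_monotonic;
	} while (read_seqretry(&timekeeper.lock, seq));
}

Writers take the other side — write_seqlock_irqsave()/write_sequnlock_irqrestore() — which is exactly what do_settimeofday(), timekeeping_inject_offset() and update_wall_time() now do in the hunks above.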
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 683d559a0eef..867bd1dd2dd0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -62,6 +62,8 @@
62#define FTRACE_HASH_DEFAULT_BITS 10 62#define FTRACE_HASH_DEFAULT_BITS 10
63#define FTRACE_HASH_MAX_BITS 12 63#define FTRACE_HASH_MAX_BITS 12
64 64
65#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL)
66
65/* ftrace_enabled is a method to turn ftrace on or off */ 67/* ftrace_enabled is a method to turn ftrace on or off */
66int ftrace_enabled __read_mostly; 68int ftrace_enabled __read_mostly;
67static int last_ftrace_enabled; 69static int last_ftrace_enabled;
@@ -89,12 +91,14 @@ static struct ftrace_ops ftrace_list_end __read_mostly = {
89}; 91};
90 92
91static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; 93static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;
94static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end;
92static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; 95static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
93ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 96ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
94static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub; 97static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub;
95ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; 98ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
96ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; 99ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
97static struct ftrace_ops global_ops; 100static struct ftrace_ops global_ops;
101static struct ftrace_ops control_ops;
98 102
99static void 103static void
100ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); 104ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip);
@@ -168,6 +172,32 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
168} 172}
169#endif 173#endif
170 174
175static void control_ops_disable_all(struct ftrace_ops *ops)
176{
177 int cpu;
178
179 for_each_possible_cpu(cpu)
180 *per_cpu_ptr(ops->disabled, cpu) = 1;
181}
182
183static int control_ops_alloc(struct ftrace_ops *ops)
184{
185 int __percpu *disabled;
186
187 disabled = alloc_percpu(int);
188 if (!disabled)
189 return -ENOMEM;
190
191 ops->disabled = disabled;
192 control_ops_disable_all(ops);
193 return 0;
194}
195
196static void control_ops_free(struct ftrace_ops *ops)
197{
198 free_percpu(ops->disabled);
199}
200
171static void update_global_ops(void) 201static void update_global_ops(void)
172{ 202{
173 ftrace_func_t func; 203 ftrace_func_t func;
@@ -259,6 +289,26 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
259 return 0; 289 return 0;
260} 290}
261 291
292static void add_ftrace_list_ops(struct ftrace_ops **list,
293 struct ftrace_ops *main_ops,
294 struct ftrace_ops *ops)
295{
296 int first = *list == &ftrace_list_end;
297 add_ftrace_ops(list, ops);
298 if (first)
299 add_ftrace_ops(&ftrace_ops_list, main_ops);
300}
301
302static int remove_ftrace_list_ops(struct ftrace_ops **list,
303 struct ftrace_ops *main_ops,
304 struct ftrace_ops *ops)
305{
306 int ret = remove_ftrace_ops(list, ops);
307 if (!ret && *list == &ftrace_list_end)
308 ret = remove_ftrace_ops(&ftrace_ops_list, main_ops);
309 return ret;
310}
311
262static int __register_ftrace_function(struct ftrace_ops *ops) 312static int __register_ftrace_function(struct ftrace_ops *ops)
263{ 313{
264 if (ftrace_disabled) 314 if (ftrace_disabled)
@@ -270,15 +320,20 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
270 if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) 320 if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED))
271 return -EBUSY; 321 return -EBUSY;
272 322
323 /* We don't support both control and global flags set. */
324 if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK)
325 return -EINVAL;
326
273 if (!core_kernel_data((unsigned long)ops)) 327 if (!core_kernel_data((unsigned long)ops))
274 ops->flags |= FTRACE_OPS_FL_DYNAMIC; 328 ops->flags |= FTRACE_OPS_FL_DYNAMIC;
275 329
276 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 330 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
277 int first = ftrace_global_list == &ftrace_list_end; 331 add_ftrace_list_ops(&ftrace_global_list, &global_ops, ops);
278 add_ftrace_ops(&ftrace_global_list, ops);
279 ops->flags |= FTRACE_OPS_FL_ENABLED; 332 ops->flags |= FTRACE_OPS_FL_ENABLED;
280 if (first) 333 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) {
281 add_ftrace_ops(&ftrace_ops_list, &global_ops); 334 if (control_ops_alloc(ops))
335 return -ENOMEM;
336 add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops);
282 } else 337 } else
283 add_ftrace_ops(&ftrace_ops_list, ops); 338 add_ftrace_ops(&ftrace_ops_list, ops);
284 339
@@ -302,11 +357,23 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
302 return -EINVAL; 357 return -EINVAL;
303 358
304 if (ops->flags & FTRACE_OPS_FL_GLOBAL) { 359 if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
305 ret = remove_ftrace_ops(&ftrace_global_list, ops); 360 ret = remove_ftrace_list_ops(&ftrace_global_list,
306 if (!ret && ftrace_global_list == &ftrace_list_end) 361 &global_ops, ops);
307 ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops);
308 if (!ret) 362 if (!ret)
309 ops->flags &= ~FTRACE_OPS_FL_ENABLED; 363 ops->flags &= ~FTRACE_OPS_FL_ENABLED;
364 } else if (ops->flags & FTRACE_OPS_FL_CONTROL) {
365 ret = remove_ftrace_list_ops(&ftrace_control_list,
366 &control_ops, ops);
367 if (!ret) {
368 /*
369 * The ftrace_ops is now removed from the list,
370 * so there'll be no new users. We must ensure
371 * all current users are done before we free
372 * the control data.
373 */
374 synchronize_sched();
375 control_ops_free(ops);
376 }
310 } else 377 } else
311 ret = remove_ftrace_ops(&ftrace_ops_list, ops); 378 ret = remove_ftrace_ops(&ftrace_ops_list, ops);
312 379
@@ -1119,6 +1186,12 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
1119 call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu); 1186 call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu);
1120} 1187}
1121 1188
1189void ftrace_free_filter(struct ftrace_ops *ops)
1190{
1191 free_ftrace_hash(ops->filter_hash);
1192 free_ftrace_hash(ops->notrace_hash);
1193}
1194
1122static struct ftrace_hash *alloc_ftrace_hash(int size_bits) 1195static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
1123{ 1196{
1124 struct ftrace_hash *hash; 1197 struct ftrace_hash *hash;
@@ -1129,7 +1202,7 @@ static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
1129 return NULL; 1202 return NULL;
1130 1203
1131 size = 1 << size_bits; 1204 size = 1 << size_bits;
1132 hash->buckets = kzalloc(sizeof(*hash->buckets) * size, GFP_KERNEL); 1205 hash->buckets = kcalloc(size, sizeof(*hash->buckets), GFP_KERNEL);
1133 1206
1134 if (!hash->buckets) { 1207 if (!hash->buckets) {
1135 kfree(hash); 1208 kfree(hash);
@@ -3146,8 +3219,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
3146 mutex_lock(&ftrace_regex_lock); 3219 mutex_lock(&ftrace_regex_lock);
3147 if (reset) 3220 if (reset)
3148 ftrace_filter_reset(hash); 3221 ftrace_filter_reset(hash);
3149 if (buf) 3222 if (buf && !ftrace_match_records(hash, buf, len)) {
3150 ftrace_match_records(hash, buf, len); 3223 ret = -EINVAL;
3224 goto out_regex_unlock;
3225 }
3151 3226
3152 mutex_lock(&ftrace_lock); 3227 mutex_lock(&ftrace_lock);
3153 ret = ftrace_hash_move(ops, enable, orig_hash, hash); 3228 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
@@ -3157,6 +3232,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
3157 3232
3158 mutex_unlock(&ftrace_lock); 3233 mutex_unlock(&ftrace_lock);
3159 3234
3235 out_regex_unlock:
3160 mutex_unlock(&ftrace_regex_lock); 3236 mutex_unlock(&ftrace_regex_lock);
3161 3237
3162 free_ftrace_hash(hash); 3238 free_ftrace_hash(hash);
@@ -3173,10 +3249,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
3173 * Filters denote which functions should be enabled when tracing is enabled. 3249 * Filters denote which functions should be enabled when tracing is enabled.
3174 * If @buf is NULL and reset is set, all functions will be enabled for tracing. 3250 * If @buf is NULL and reset is set, all functions will be enabled for tracing.
3175 */ 3251 */
3176void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, 3252int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
3177 int len, int reset) 3253 int len, int reset)
3178{ 3254{
3179 ftrace_set_regex(ops, buf, len, reset, 1); 3255 return ftrace_set_regex(ops, buf, len, reset, 1);
3180} 3256}
3181EXPORT_SYMBOL_GPL(ftrace_set_filter); 3257EXPORT_SYMBOL_GPL(ftrace_set_filter);
3182 3258
@@ -3191,10 +3267,10 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter);
3191 * is enabled. If @buf is NULL and reset is set, all functions will be enabled 3267 * is enabled. If @buf is NULL and reset is set, all functions will be enabled
3192 * for tracing. 3268 * for tracing.
3193 */ 3269 */
3194void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, 3270int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
3195 int len, int reset) 3271 int len, int reset)
3196{ 3272{
3197 ftrace_set_regex(ops, buf, len, reset, 0); 3273 return ftrace_set_regex(ops, buf, len, reset, 0);
3198} 3274}
3199EXPORT_SYMBOL_GPL(ftrace_set_notrace); 3275EXPORT_SYMBOL_GPL(ftrace_set_notrace);
3200/** 3276/**
@@ -3871,6 +3947,36 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
3871#endif /* CONFIG_DYNAMIC_FTRACE */ 3947#endif /* CONFIG_DYNAMIC_FTRACE */
3872 3948
3873static void 3949static void
3950ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip)
3951{
3952 struct ftrace_ops *op;
3953
3954 if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT)))
3955 return;
3956
3957 /*
3958 * Some of the ops may be dynamically allocated,
3959 * they must be freed after a synchronize_sched().
3960 */
3961 preempt_disable_notrace();
3962 trace_recursion_set(TRACE_CONTROL_BIT);
3963 op = rcu_dereference_raw(ftrace_control_list);
3964 while (op != &ftrace_list_end) {
3965 if (!ftrace_function_local_disabled(op) &&
3966 ftrace_ops_test(op, ip))
3967 op->func(ip, parent_ip);
3968
3969 op = rcu_dereference_raw(op->next);
3970 };
3971 trace_recursion_clear(TRACE_CONTROL_BIT);
3972 preempt_enable_notrace();
3973}
3974
3975static struct ftrace_ops control_ops = {
3976 .func = ftrace_ops_control_func,
3977};
3978
3979static void
3874ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) 3980ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
3875{ 3981{
3876 struct ftrace_ops *op; 3982 struct ftrace_ops *op;
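Editor's note: the FTRACE_OPS_FL_CONTROL machinery added above is exercised by the perf glue later in this series (see trace_event_perf.c below). As a hedged sketch of a caller — hypothetical code, using only interfaces this patch set itself adds or references — a control ops would be set up roughly like this; callbacks start out disabled on every CPU (control_ops_alloc() sets the per-cpu counters) and are switched on per CPU by the caller.

/* Hypothetical user of a control ftrace_ops. */
static void my_callback(unsigned long ip, unsigned long parent_ip)
{
	/* per-callsite work goes here */
}

static struct ftrace_ops my_ops = {
	.func	= my_callback,
	.flags	= FTRACE_OPS_FL_CONTROL,
};

static int my_start(void)
{
	int ret = register_ftrace_function(&my_ops);

	if (ret)
		return ret;
	/* enable callbacks on this CPU only; other CPUs stay disabled */
	ftrace_function_local_enable(&my_ops);
	return 0;
}

static void my_stop(void)
{
	ftrace_function_local_disable(&my_ops);
	unregister_ftrace_function(&my_ops);
	ftrace_free_filter(&my_ops);
}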
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a3f1bc5d2a00..10d5503f0d04 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2764,12 +2764,12 @@ static const char readme_msg[] =
2764 "tracing mini-HOWTO:\n\n" 2764 "tracing mini-HOWTO:\n\n"
2765 "# mount -t debugfs nodev /sys/kernel/debug\n\n" 2765 "# mount -t debugfs nodev /sys/kernel/debug\n\n"
2766 "# cat /sys/kernel/debug/tracing/available_tracers\n" 2766 "# cat /sys/kernel/debug/tracing/available_tracers\n"
2767 "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n" 2767 "wakeup wakeup_rt preemptirqsoff preemptoff irqsoff function nop\n\n"
2768 "# cat /sys/kernel/debug/tracing/current_tracer\n" 2768 "# cat /sys/kernel/debug/tracing/current_tracer\n"
2769 "nop\n" 2769 "nop\n"
2770 "# echo sched_switch > /sys/kernel/debug/tracing/current_tracer\n" 2770 "# echo wakeup > /sys/kernel/debug/tracing/current_tracer\n"
2771 "# cat /sys/kernel/debug/tracing/current_tracer\n" 2771 "# cat /sys/kernel/debug/tracing/current_tracer\n"
2772 "sched_switch\n" 2772 "wakeup\n"
2773 "# cat /sys/kernel/debug/tracing/trace_options\n" 2773 "# cat /sys/kernel/debug/tracing/trace_options\n"
2774 "noprint-parent nosym-offset nosym-addr noverbose\n" 2774 "noprint-parent nosym-offset nosym-addr noverbose\n"
2775 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" 2775 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n"
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b93ecbadad6d..54faec790bc1 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -56,17 +56,23 @@ enum trace_type {
56#define F_STRUCT(args...) args 56#define F_STRUCT(args...) args
57 57
58#undef FTRACE_ENTRY 58#undef FTRACE_ENTRY
59#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ 59#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
60 struct struct_name { \ 60 struct struct_name { \
61 struct trace_entry ent; \ 61 struct trace_entry ent; \
62 tstruct \ 62 tstruct \
63 } 63 }
64 64
65#undef TP_ARGS 65#undef TP_ARGS
66#define TP_ARGS(args...) args 66#define TP_ARGS(args...) args
67 67
68#undef FTRACE_ENTRY_DUP 68#undef FTRACE_ENTRY_DUP
69#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk) 69#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk, filter)
70
71#undef FTRACE_ENTRY_REG
72#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \
73 filter, regfn) \
74 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
75 filter)
70 76
71#include "trace_entries.h" 77#include "trace_entries.h"
72 78
@@ -288,6 +294,8 @@ struct tracer {
288/* for function tracing recursion */ 294/* for function tracing recursion */
289#define TRACE_INTERNAL_BIT (1<<11) 295#define TRACE_INTERNAL_BIT (1<<11)
290#define TRACE_GLOBAL_BIT (1<<12) 296#define TRACE_GLOBAL_BIT (1<<12)
297#define TRACE_CONTROL_BIT (1<<13)
298
291/* 299/*
292 * Abuse of the trace_recursion. 300 * Abuse of the trace_recursion.
293 * As we need a way to maintain state if we are tracing the function 301 * As we need a way to maintain state if we are tracing the function
@@ -589,6 +597,8 @@ static inline int ftrace_trace_task(struct task_struct *task)
589static inline int ftrace_is_dead(void) { return 0; } 597static inline int ftrace_is_dead(void) { return 0; }
590#endif 598#endif
591 599
600int ftrace_event_is_function(struct ftrace_event_call *call);
601
592/* 602/*
593 * struct trace_parser - servers for reading the user input separated by spaces 603 * struct trace_parser - servers for reading the user input separated by spaces
594 * @cont: set if the input is not complete - no final space char was found 604 * @cont: set if the input is not complete - no final space char was found
@@ -766,9 +776,7 @@ struct filter_pred {
766 u64 val; 776 u64 val;
767 struct regex regex; 777 struct regex regex;
768 unsigned short *ops; 778 unsigned short *ops;
769#ifdef CONFIG_FTRACE_STARTUP_TEST
770 struct ftrace_event_field *field; 779 struct ftrace_event_field *field;
771#endif
772 int offset; 780 int offset;
773 int not; 781 int not;
774 int op; 782 int op;
@@ -818,12 +826,22 @@ extern const char *__start___trace_bprintk_fmt[];
818extern const char *__stop___trace_bprintk_fmt[]; 826extern const char *__stop___trace_bprintk_fmt[];
819 827
820#undef FTRACE_ENTRY 828#undef FTRACE_ENTRY
821#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ 829#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \
822 extern struct ftrace_event_call \ 830 extern struct ftrace_event_call \
823 __attribute__((__aligned__(4))) event_##call; 831 __attribute__((__aligned__(4))) event_##call;
824#undef FTRACE_ENTRY_DUP 832#undef FTRACE_ENTRY_DUP
825#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ 833#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \
826 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) 834 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \
835 filter)
827#include "trace_entries.h" 836#include "trace_entries.h"
828 837
838#ifdef CONFIG_PERF_EVENTS
839#ifdef CONFIG_FUNCTION_TRACER
840int perf_ftrace_event_register(struct ftrace_event_call *call,
841 enum trace_reg type, void *data);
842#else
843#define perf_ftrace_event_register NULL
844#endif /* CONFIG_FUNCTION_TRACER */
845#endif /* CONFIG_PERF_EVENTS */
846
829#endif /* _LINUX_KERNEL_TRACE_H */ 847#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 93365907f219..d91eb0541b3a 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -55,7 +55,7 @@
55/* 55/*
56 * Function trace entry - function address and parent function address: 56 * Function trace entry - function address and parent function address:
57 */ 57 */
58FTRACE_ENTRY(function, ftrace_entry, 58FTRACE_ENTRY_REG(function, ftrace_entry,
59 59
60 TRACE_FN, 60 TRACE_FN,
61 61
@@ -64,7 +64,11 @@ FTRACE_ENTRY(function, ftrace_entry,
64 __field( unsigned long, parent_ip ) 64 __field( unsigned long, parent_ip )
65 ), 65 ),
66 66
67 F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip) 67 F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip),
68
69 FILTER_TRACE_FN,
70
71 perf_ftrace_event_register
68); 72);
69 73
70/* Function call entry */ 74/* Function call entry */
@@ -78,7 +82,9 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry,
78 __field_desc( int, graph_ent, depth ) 82 __field_desc( int, graph_ent, depth )
79 ), 83 ),
80 84
81 F_printk("--> %lx (%d)", __entry->func, __entry->depth) 85 F_printk("--> %lx (%d)", __entry->func, __entry->depth),
86
87 FILTER_OTHER
82); 88);
83 89
84/* Function return entry */ 90/* Function return entry */
@@ -98,7 +104,9 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
98 F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d", 104 F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d",
99 __entry->func, __entry->depth, 105 __entry->func, __entry->depth,
100 __entry->calltime, __entry->rettime, 106 __entry->calltime, __entry->rettime,
101 __entry->depth) 107 __entry->depth),
108
109 FILTER_OTHER
102); 110);
103 111
104/* 112/*
@@ -127,8 +135,9 @@ FTRACE_ENTRY(context_switch, ctx_switch_entry,
127 F_printk("%u:%u:%u ==> %u:%u:%u [%03u]", 135 F_printk("%u:%u:%u ==> %u:%u:%u [%03u]",
128 __entry->prev_pid, __entry->prev_prio, __entry->prev_state, 136 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
129 __entry->next_pid, __entry->next_prio, __entry->next_state, 137 __entry->next_pid, __entry->next_prio, __entry->next_state,
130 __entry->next_cpu 138 __entry->next_cpu),
131 ) 139
140 FILTER_OTHER
132); 141);
133 142
134/* 143/*
@@ -146,8 +155,9 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
146 F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]", 155 F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]",
147 __entry->prev_pid, __entry->prev_prio, __entry->prev_state, 156 __entry->prev_pid, __entry->prev_prio, __entry->prev_state,
148 __entry->next_pid, __entry->next_prio, __entry->next_state, 157 __entry->next_pid, __entry->next_prio, __entry->next_state,
149 __entry->next_cpu 158 __entry->next_cpu),
150 ) 159
160 FILTER_OTHER
151); 161);
152 162
153/* 163/*
@@ -169,7 +179,9 @@ FTRACE_ENTRY(kernel_stack, stack_entry,
169 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", 179 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
170 __entry->caller[0], __entry->caller[1], __entry->caller[2], 180 __entry->caller[0], __entry->caller[1], __entry->caller[2],
171 __entry->caller[3], __entry->caller[4], __entry->caller[5], 181 __entry->caller[3], __entry->caller[4], __entry->caller[5],
172 __entry->caller[6], __entry->caller[7]) 182 __entry->caller[6], __entry->caller[7]),
183
184 FILTER_OTHER
173); 185);
174 186
175FTRACE_ENTRY(user_stack, userstack_entry, 187FTRACE_ENTRY(user_stack, userstack_entry,
@@ -185,7 +197,9 @@ FTRACE_ENTRY(user_stack, userstack_entry,
185 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", 197 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
186 __entry->caller[0], __entry->caller[1], __entry->caller[2], 198 __entry->caller[0], __entry->caller[1], __entry->caller[2],
187 __entry->caller[3], __entry->caller[4], __entry->caller[5], 199 __entry->caller[3], __entry->caller[4], __entry->caller[5],
188 __entry->caller[6], __entry->caller[7]) 200 __entry->caller[6], __entry->caller[7]),
201
202 FILTER_OTHER
189); 203);
190 204
191/* 205/*
@@ -202,7 +216,9 @@ FTRACE_ENTRY(bprint, bprint_entry,
202 ), 216 ),
203 217
204 F_printk("%08lx fmt:%p", 218 F_printk("%08lx fmt:%p",
205 __entry->ip, __entry->fmt) 219 __entry->ip, __entry->fmt),
220
221 FILTER_OTHER
206); 222);
207 223
208FTRACE_ENTRY(print, print_entry, 224FTRACE_ENTRY(print, print_entry,
@@ -215,7 +231,9 @@ FTRACE_ENTRY(print, print_entry,
215 ), 231 ),
216 232
217 F_printk("%08lx %s", 233 F_printk("%08lx %s",
218 __entry->ip, __entry->buf) 234 __entry->ip, __entry->buf),
235
236 FILTER_OTHER
219); 237);
220 238
221FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, 239FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
@@ -234,7 +252,9 @@ FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw,
234 252
235 F_printk("%lx %lx %lx %d %x %x", 253 F_printk("%lx %lx %lx %d %x %x",
236 (unsigned long)__entry->phys, __entry->value, __entry->pc, 254 (unsigned long)__entry->phys, __entry->value, __entry->pc,
237 __entry->map_id, __entry->opcode, __entry->width) 255 __entry->map_id, __entry->opcode, __entry->width),
256
257 FILTER_OTHER
238); 258);
239 259
240FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, 260FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
@@ -252,7 +272,9 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map,
252 272
253 F_printk("%lx %lx %lx %d %x", 273 F_printk("%lx %lx %lx %d %x",
254 (unsigned long)__entry->phys, __entry->virt, __entry->len, 274 (unsigned long)__entry->phys, __entry->virt, __entry->len,
255 __entry->map_id, __entry->opcode) 275 __entry->map_id, __entry->opcode),
276
277 FILTER_OTHER
256); 278);
257 279
258 280
@@ -272,6 +294,8 @@ FTRACE_ENTRY(branch, trace_branch,
272 294
273 F_printk("%u:%s:%s (%u)", 295 F_printk("%u:%s:%s (%u)",
274 __entry->line, 296 __entry->line,
275 __entry->func, __entry->file, __entry->correct) 297 __entry->func, __entry->file, __entry->correct),
298
299 FILTER_OTHER
276); 300);
277 301
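Editor's note: each entry now carries a filter class as its final argument, and FTRACE_ENTRY_REG additionally takes a registration callback (the function entry above passes perf_ftrace_event_register). A hypothetical new entry written against the extended signature — the event name, trace type and fields here are made up purely for illustration — would look like:

/* Hypothetical entry using the extended macro signature. */
FTRACE_ENTRY(example, example_entry,

	TRACE_EXAMPLE,		/* made-up enum trace_type value */

	F_STRUCT(
		__field(	unsigned long,	ip	)
		__field(	int,		value	)
	),

	F_printk("%lx value=%d", __entry->ip, __entry->value),

	FILTER_OTHER
);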
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 19a359d5e6d5..fee3752ae8f6 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -24,6 +24,11 @@ static int total_ref_count;
24static int perf_trace_event_perm(struct ftrace_event_call *tp_event, 24static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
25 struct perf_event *p_event) 25 struct perf_event *p_event)
26{ 26{
27 /* The ftrace function trace is allowed only for root. */
28 if (ftrace_event_is_function(tp_event) &&
29 perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
30 return -EPERM;
31
27 /* No tracing, just counting, so no obvious leak */ 32 /* No tracing, just counting, so no obvious leak */
28 if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) 33 if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
29 return 0; 34 return 0;
@@ -44,23 +49,17 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
44 return 0; 49 return 0;
45} 50}
46 51
47static int perf_trace_event_init(struct ftrace_event_call *tp_event, 52static int perf_trace_event_reg(struct ftrace_event_call *tp_event,
48 struct perf_event *p_event) 53 struct perf_event *p_event)
49{ 54{
50 struct hlist_head __percpu *list; 55 struct hlist_head __percpu *list;
51 int ret; 56 int ret = -ENOMEM;
52 int cpu; 57 int cpu;
53 58
54 ret = perf_trace_event_perm(tp_event, p_event);
55 if (ret)
56 return ret;
57
58 p_event->tp_event = tp_event; 59 p_event->tp_event = tp_event;
59 if (tp_event->perf_refcount++ > 0) 60 if (tp_event->perf_refcount++ > 0)
60 return 0; 61 return 0;
61 62
62 ret = -ENOMEM;
63
64 list = alloc_percpu(struct hlist_head); 63 list = alloc_percpu(struct hlist_head);
65 if (!list) 64 if (!list)
66 goto fail; 65 goto fail;
@@ -83,7 +82,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event,
83 } 82 }
84 } 83 }
85 84
86 ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); 85 ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);
87 if (ret) 86 if (ret)
88 goto fail; 87 goto fail;
89 88
@@ -108,6 +107,69 @@ fail:
108 return ret; 107 return ret;
109} 108}
110 109
110static void perf_trace_event_unreg(struct perf_event *p_event)
111{
112 struct ftrace_event_call *tp_event = p_event->tp_event;
113 int i;
114
115 if (--tp_event->perf_refcount > 0)
116 goto out;
117
118 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL);
119
120 /*
121 * Ensure our callback won't be called anymore. The buffers
122 * will be freed after that.
123 */
124 tracepoint_synchronize_unregister();
125
126 free_percpu(tp_event->perf_events);
127 tp_event->perf_events = NULL;
128
129 if (!--total_ref_count) {
130 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
131 free_percpu(perf_trace_buf[i]);
132 perf_trace_buf[i] = NULL;
133 }
134 }
135out:
136 module_put(tp_event->mod);
137}
138
139static int perf_trace_event_open(struct perf_event *p_event)
140{
141 struct ftrace_event_call *tp_event = p_event->tp_event;
142 return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event);
143}
144
145static void perf_trace_event_close(struct perf_event *p_event)
146{
147 struct ftrace_event_call *tp_event = p_event->tp_event;
148 tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event);
149}
150
151static int perf_trace_event_init(struct ftrace_event_call *tp_event,
152 struct perf_event *p_event)
153{
154 int ret;
155
156 ret = perf_trace_event_perm(tp_event, p_event);
157 if (ret)
158 return ret;
159
160 ret = perf_trace_event_reg(tp_event, p_event);
161 if (ret)
162 return ret;
163
164 ret = perf_trace_event_open(p_event);
165 if (ret) {
166 perf_trace_event_unreg(p_event);
167 return ret;
168 }
169
170 return 0;
171}
172
111int perf_trace_init(struct perf_event *p_event) 173int perf_trace_init(struct perf_event *p_event)
112{ 174{
113 struct ftrace_event_call *tp_event; 175 struct ftrace_event_call *tp_event;
@@ -130,6 +192,14 @@ int perf_trace_init(struct perf_event *p_event)
130 return ret; 192 return ret;
131} 193}
132 194
195void perf_trace_destroy(struct perf_event *p_event)
196{
197 mutex_lock(&event_mutex);
198 perf_trace_event_close(p_event);
199 perf_trace_event_unreg(p_event);
200 mutex_unlock(&event_mutex);
201}
202
133int perf_trace_add(struct perf_event *p_event, int flags) 203int perf_trace_add(struct perf_event *p_event, int flags)
134{ 204{
135 struct ftrace_event_call *tp_event = p_event->tp_event; 205 struct ftrace_event_call *tp_event = p_event->tp_event;
@@ -146,43 +216,14 @@ int perf_trace_add(struct perf_event *p_event, int flags)
146 list = this_cpu_ptr(pcpu_list); 216 list = this_cpu_ptr(pcpu_list);
147 hlist_add_head_rcu(&p_event->hlist_entry, list); 217 hlist_add_head_rcu(&p_event->hlist_entry, list);
148 218
149 return 0; 219 return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
150} 220}
151 221
152void perf_trace_del(struct perf_event *p_event, int flags) 222void perf_trace_del(struct perf_event *p_event, int flags)
153{ 223{
154 hlist_del_rcu(&p_event->hlist_entry);
155}
156
157void perf_trace_destroy(struct perf_event *p_event)
158{
159 struct ftrace_event_call *tp_event = p_event->tp_event; 224 struct ftrace_event_call *tp_event = p_event->tp_event;
160 int i; 225 hlist_del_rcu(&p_event->hlist_entry);
161 226 tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
162 mutex_lock(&event_mutex);
163 if (--tp_event->perf_refcount > 0)
164 goto out;
165
166 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
167
168 /*
169 * Ensure our callback won't be called anymore. The buffers
170 * will be freed after that.
171 */
172 tracepoint_synchronize_unregister();
173
174 free_percpu(tp_event->perf_events);
175 tp_event->perf_events = NULL;
176
177 if (!--total_ref_count) {
178 for (i = 0; i < PERF_NR_CONTEXTS; i++) {
179 free_percpu(perf_trace_buf[i]);
180 perf_trace_buf[i] = NULL;
181 }
182 }
183out:
184 module_put(tp_event->mod);
185 mutex_unlock(&event_mutex);
186} 227}
187 228
188__kprobes void *perf_trace_buf_prepare(int size, unsigned short type, 229__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
@@ -214,3 +255,86 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
214 return raw_data; 255 return raw_data;
215} 256}
216EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); 257EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
258
259#ifdef CONFIG_FUNCTION_TRACER
260static void
261perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip)
262{
263 struct ftrace_entry *entry;
264 struct hlist_head *head;
265 struct pt_regs regs;
266 int rctx;
267
268#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
269 sizeof(u64)) - sizeof(u32))
270
271 BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);
272
273 perf_fetch_caller_regs(&regs);
274
275 entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx);
276 if (!entry)
277 return;
278
279 entry->ip = ip;
280 entry->parent_ip = parent_ip;
281
282 head = this_cpu_ptr(event_function.perf_events);
283 perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
284 1, &regs, head);
285
286#undef ENTRY_SIZE
287}
288
289static int perf_ftrace_function_register(struct perf_event *event)
290{
291 struct ftrace_ops *ops = &event->ftrace_ops;
292
293 ops->flags |= FTRACE_OPS_FL_CONTROL;
294 ops->func = perf_ftrace_function_call;
295 return register_ftrace_function(ops);
296}
297
298static int perf_ftrace_function_unregister(struct perf_event *event)
299{
300 struct ftrace_ops *ops = &event->ftrace_ops;
301 int ret = unregister_ftrace_function(ops);
302 ftrace_free_filter(ops);
303 return ret;
304}
305
306static void perf_ftrace_function_enable(struct perf_event *event)
307{
308 ftrace_function_local_enable(&event->ftrace_ops);
309}
310
311static void perf_ftrace_function_disable(struct perf_event *event)
312{
313 ftrace_function_local_disable(&event->ftrace_ops);
314}
315
316int perf_ftrace_event_register(struct ftrace_event_call *call,
317 enum trace_reg type, void *data)
318{
319 switch (type) {
320 case TRACE_REG_REGISTER:
321 case TRACE_REG_UNREGISTER:
322 break;
323 case TRACE_REG_PERF_REGISTER:
324 case TRACE_REG_PERF_UNREGISTER:
325 return 0;
326 case TRACE_REG_PERF_OPEN:
327 return perf_ftrace_function_register(data);
328 case TRACE_REG_PERF_CLOSE:
329 return perf_ftrace_function_unregister(data);
330 case TRACE_REG_PERF_ADD:
331 perf_ftrace_function_enable(data);
332 return 0;
333 case TRACE_REG_PERF_DEL:
334 perf_ftrace_function_disable(data);
335 return 0;
336 }
337
338 return -EINVAL;
339}
340#endif /* CONFIG_FUNCTION_TRACER */
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index c212a7f934ec..079a93ae8a9d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -147,7 +147,8 @@ int trace_event_raw_init(struct ftrace_event_call *call)
147} 147}
148EXPORT_SYMBOL_GPL(trace_event_raw_init); 148EXPORT_SYMBOL_GPL(trace_event_raw_init);
149 149
150int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) 150int ftrace_event_reg(struct ftrace_event_call *call,
151 enum trace_reg type, void *data)
151{ 152{
152 switch (type) { 153 switch (type) {
153 case TRACE_REG_REGISTER: 154 case TRACE_REG_REGISTER:
@@ -170,6 +171,11 @@ int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type)
170 call->class->perf_probe, 171 call->class->perf_probe,
171 call); 172 call);
172 return 0; 173 return 0;
174 case TRACE_REG_PERF_OPEN:
175 case TRACE_REG_PERF_CLOSE:
176 case TRACE_REG_PERF_ADD:
177 case TRACE_REG_PERF_DEL:
178 return 0;
173#endif 179#endif
174 } 180 }
175 return 0; 181 return 0;
@@ -209,7 +215,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
209 tracing_stop_cmdline_record(); 215 tracing_stop_cmdline_record();
210 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; 216 call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
211 } 217 }
212 call->class->reg(call, TRACE_REG_UNREGISTER); 218 call->class->reg(call, TRACE_REG_UNREGISTER, NULL);
213 } 219 }
214 break; 220 break;
215 case 1: 221 case 1:
@@ -218,7 +224,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
218 tracing_start_cmdline_record(); 224 tracing_start_cmdline_record();
219 call->flags |= TRACE_EVENT_FL_RECORDED_CMD; 225 call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
220 } 226 }
221 ret = call->class->reg(call, TRACE_REG_REGISTER); 227 ret = call->class->reg(call, TRACE_REG_REGISTER, NULL);
222 if (ret) { 228 if (ret) {
223 tracing_stop_cmdline_record(); 229 tracing_stop_cmdline_record();
224 pr_info("event trace: Could not enable event " 230 pr_info("event trace: Could not enable event "
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 24aee7127451..431dba8b7542 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -81,6 +81,7 @@ enum {
81 FILT_ERR_TOO_MANY_PREDS, 81 FILT_ERR_TOO_MANY_PREDS,
82 FILT_ERR_MISSING_FIELD, 82 FILT_ERR_MISSING_FIELD,
83 FILT_ERR_INVALID_FILTER, 83 FILT_ERR_INVALID_FILTER,
84 FILT_ERR_IP_FIELD_ONLY,
84}; 85};
85 86
86static char *err_text[] = { 87static char *err_text[] = {
@@ -96,6 +97,7 @@ static char *err_text[] = {
96 "Too many terms in predicate expression", 97 "Too many terms in predicate expression",
97 "Missing field name and/or value", 98 "Missing field name and/or value",
98 "Meaningless filter expression", 99 "Meaningless filter expression",
100 "Only 'ip' field is supported for function trace",
99}; 101};
100 102
101struct opstack_op { 103struct opstack_op {
@@ -685,7 +687,7 @@ find_event_field(struct ftrace_event_call *call, char *name)
685 687
686static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) 688static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
687{ 689{
688 stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); 690 stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL);
689 if (!stack->preds) 691 if (!stack->preds)
690 return -ENOMEM; 692 return -ENOMEM;
691 stack->index = n_preds; 693 stack->index = n_preds;
@@ -826,8 +828,7 @@ static int __alloc_preds(struct event_filter *filter, int n_preds)
826 if (filter->preds) 828 if (filter->preds)
827 __free_preds(filter); 829 __free_preds(filter);
828 830
829 filter->preds = 831 filter->preds = kcalloc(n_preds, sizeof(*filter->preds), GFP_KERNEL);
830 kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL);
831 832
832 if (!filter->preds) 833 if (!filter->preds)
833 return -ENOMEM; 834 return -ENOMEM;
@@ -900,6 +901,11 @@ int filter_assign_type(const char *type)
900 return FILTER_OTHER; 901 return FILTER_OTHER;
901} 902}
902 903
904static bool is_function_field(struct ftrace_event_field *field)
905{
906 return field->filter_type == FILTER_TRACE_FN;
907}
908
903static bool is_string_field(struct ftrace_event_field *field) 909static bool is_string_field(struct ftrace_event_field *field)
904{ 910{
905 return field->filter_type == FILTER_DYN_STRING || 911 return field->filter_type == FILTER_DYN_STRING ||
@@ -987,6 +993,11 @@ static int init_pred(struct filter_parse_state *ps,
987 fn = filter_pred_strloc; 993 fn = filter_pred_strloc;
988 else 994 else
989 fn = filter_pred_pchar; 995 fn = filter_pred_pchar;
996 } else if (is_function_field(field)) {
997 if (strcmp(field->name, "ip")) {
998 parse_error(ps, FILT_ERR_IP_FIELD_ONLY, 0);
999 return -EINVAL;
1000 }
990 } else { 1001 } else {
991 if (field->is_signed) 1002 if (field->is_signed)
992 ret = strict_strtoll(pred->regex.pattern, 0, &val); 1003 ret = strict_strtoll(pred->regex.pattern, 0, &val);
@@ -1334,10 +1345,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps,
1334 1345
1335 strcpy(pred.regex.pattern, operand2); 1346 strcpy(pred.regex.pattern, operand2);
1336 pred.regex.len = strlen(pred.regex.pattern); 1347 pred.regex.len = strlen(pred.regex.pattern);
1337
1338#ifdef CONFIG_FTRACE_STARTUP_TEST
1339 pred.field = field; 1348 pred.field = field;
1340#endif
1341 return init_pred(ps, field, &pred) ? NULL : &pred; 1349 return init_pred(ps, field, &pred) ? NULL : &pred;
1342} 1350}
1343 1351
@@ -1486,7 +1494,7 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1486 children = count_leafs(preds, &preds[root->left]); 1494 children = count_leafs(preds, &preds[root->left]);
1487 children += count_leafs(preds, &preds[root->right]); 1495 children += count_leafs(preds, &preds[root->right]);
1488 1496
1489 root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL); 1497 root->ops = kcalloc(children, sizeof(*root->ops), GFP_KERNEL);
1490 if (!root->ops) 1498 if (!root->ops)
1491 return -ENOMEM; 1499 return -ENOMEM;
1492 1500
@@ -1950,6 +1958,148 @@ void ftrace_profile_free_filter(struct perf_event *event)
1950 __free_filter(filter); 1958 __free_filter(filter);
1951} 1959}
1952 1960
1961struct function_filter_data {
1962 struct ftrace_ops *ops;
1963 int first_filter;
1964 int first_notrace;
1965};
1966
1967#ifdef CONFIG_FUNCTION_TRACER
1968static char **
1969ftrace_function_filter_re(char *buf, int len, int *count)
1970{
1971 char *str, *sep, **re;
1972
1973 str = kstrndup(buf, len, GFP_KERNEL);
1974 if (!str)
1975 return NULL;
1976
1977 /*
1978 * The argv_split function takes white space
1979 * as a separator, so convert ',' into spaces.
1980 */
1981 while ((sep = strchr(str, ',')))
1982 *sep = ' ';
1983
1984 re = argv_split(GFP_KERNEL, str, count);
1985 kfree(str);
1986 return re;
1987}
1988
1989static int ftrace_function_set_regexp(struct ftrace_ops *ops, int filter,
1990 int reset, char *re, int len)
1991{
1992 int ret;
1993
1994 if (filter)
1995 ret = ftrace_set_filter(ops, re, len, reset);
1996 else
1997 ret = ftrace_set_notrace(ops, re, len, reset);
1998
1999 return ret;
2000}
2001
2002static int __ftrace_function_set_filter(int filter, char *buf, int len,
2003 struct function_filter_data *data)
2004{
2005	int i, re_cnt, ret = -EINVAL;
2006 int *reset;
2007 char **re;
2008
2009 reset = filter ? &data->first_filter : &data->first_notrace;
2010
2011 /*
2012 * The 'ip' field could have multiple filters set, separated
2013 * either by space or comma. We first cut the filter and apply
2014	 * all pieces separately.
2015 */
2016 re = ftrace_function_filter_re(buf, len, &re_cnt);
2017 if (!re)
2018 return -EINVAL;
2019
2020 for (i = 0; i < re_cnt; i++) {
2021 ret = ftrace_function_set_regexp(data->ops, filter, *reset,
2022 re[i], strlen(re[i]));
2023 if (ret)
2024 break;
2025
2026 if (*reset)
2027 *reset = 0;
2028 }
2029
2030 argv_free(re);
2031 return ret;
2032}
2033
2034static int ftrace_function_check_pred(struct filter_pred *pred, int leaf)
2035{
2036 struct ftrace_event_field *field = pred->field;
2037
2038 if (leaf) {
2039 /*
2040 * Check the leaf predicate for function trace, verify:
2041		 * - only '==' and '!=' are used
2042 * - the 'ip' field is used
2043 */
2044 if ((pred->op != OP_EQ) && (pred->op != OP_NE))
2045 return -EINVAL;
2046
2047 if (strcmp(field->name, "ip"))
2048 return -EINVAL;
2049 } else {
2050 /*
2051		 * Check the non-leaf predicate for function trace, verify:
2052 * - only '||' is used
2053 */
2054 if (pred->op != OP_OR)
2055 return -EINVAL;
2056 }
2057
2058 return 0;
2059}
2060
2061static int ftrace_function_set_filter_cb(enum move_type move,
2062 struct filter_pred *pred,
2063 int *err, void *data)
2064{
2065	/* Check that the node is valid for function trace. */
2066 if ((move != MOVE_DOWN) ||
2067 (pred->left != FILTER_PRED_INVALID)) {
2068 *err = ftrace_function_check_pred(pred, 0);
2069 } else {
2070 *err = ftrace_function_check_pred(pred, 1);
2071 if (*err)
2072 return WALK_PRED_ABORT;
2073
2074 *err = __ftrace_function_set_filter(pred->op == OP_EQ,
2075 pred->regex.pattern,
2076 pred->regex.len,
2077 data);
2078 }
2079
2080 return (*err) ? WALK_PRED_ABORT : WALK_PRED_DEFAULT;
2081}
2082
2083static int ftrace_function_set_filter(struct perf_event *event,
2084 struct event_filter *filter)
2085{
2086 struct function_filter_data data = {
2087 .first_filter = 1,
2088 .first_notrace = 1,
2089 .ops = &event->ftrace_ops,
2090 };
2091
2092 return walk_pred_tree(filter->preds, filter->root,
2093 ftrace_function_set_filter_cb, &data);
2094}
2095#else
2096static int ftrace_function_set_filter(struct perf_event *event,
2097 struct event_filter *filter)
2098{
2099 return -ENODEV;
2100}
2101#endif /* CONFIG_FUNCTION_TRACER */
2102
1953int ftrace_profile_set_filter(struct perf_event *event, int event_id, 2103int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1954 char *filter_str) 2104 char *filter_str)
1955{ 2105{
@@ -1970,9 +2120,16 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1970 goto out_unlock; 2120 goto out_unlock;
1971 2121
1972 err = create_filter(call, filter_str, false, &filter); 2122 err = create_filter(call, filter_str, false, &filter);
1973 if (!err) 2123 if (err)
1974 event->filter = filter; 2124 goto free_filter;
2125
2126 if (ftrace_event_is_function(call))
2127 err = ftrace_function_set_filter(event, filter);
1975 else 2128 else
2129 event->filter = filter;
2130
2131free_filter:
2132 if (err || ftrace_event_is_function(call))
1976 __free_filter(filter); 2133 __free_filter(filter);
1977 2134
1978out_unlock: 2135out_unlock:
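For illustration, here is a userspace sketch of how the new 'ip' filter is meant to be consumed, assuming a kernel with this series applied. The event id lookup (via the FTRACE_FUNCTION_ID environment variable) and the function names in the filter string are illustrative; perf_event_open() and PERF_EVENT_IOC_SET_FILTER are the standard perf interfaces.

/*
 * Hedged usage sketch, not part of the patch: attach a perf event to the
 * ftrace "function" event and restrict it with the new 'ip' filter.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	/* illustrative: the id read from .../tracing/events/ftrace/function/id */
	const char *id = getenv("FTRACE_FUNCTION_ID");
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_TRACEPOINT;
	attr.size = sizeof(attr);
	attr.config = id ? atoll(id) : 0;

	fd = perf_event_open(&attr, 0 /* this task */, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	/* Only the 'ip' field is accepted; '==' picks functions to trace,
	 * '!=' functions to skip, names separated by ',' or ' ', and
	 * sub-expressions joined with '||'. */
	if (ioctl(fd, PERF_EVENT_IOC_SET_FILTER,
		  "ip == sys_read,sys_write || ip != sys_close"))
		perror("PERF_EVENT_IOC_SET_FILTER");

	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	/* ... run the workload and read samples from the mmap ring ... */
	close(fd);
	return 0;
}
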
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index bbeec31e0ae3..7b46c9bd22ae 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -18,6 +18,16 @@
18#undef TRACE_SYSTEM 18#undef TRACE_SYSTEM
19#define TRACE_SYSTEM ftrace 19#define TRACE_SYSTEM ftrace
20 20
21/*
22 * The FTRACE_ENTRY_REG macro allows an ftrace entry to define a register
23 * function and thus become accessible via perf.
24 */
25#undef FTRACE_ENTRY_REG
26#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \
27 filter, regfn) \
28 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
29 filter)
30
21/* not needed for this file */ 31/* not needed for this file */
22#undef __field_struct 32#undef __field_struct
23#define __field_struct(type, item) 33#define __field_struct(type, item)
@@ -44,21 +54,22 @@
44#define F_printk(fmt, args...) fmt, args 54#define F_printk(fmt, args...) fmt, args
45 55
46#undef FTRACE_ENTRY 56#undef FTRACE_ENTRY
47#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ 57#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
48struct ____ftrace_##name { \ 58struct ____ftrace_##name { \
49 tstruct \ 59 tstruct \
50}; \ 60}; \
51static void __always_unused ____ftrace_check_##name(void) \ 61static void __always_unused ____ftrace_check_##name(void) \
52{ \ 62{ \
53 struct ____ftrace_##name *__entry = NULL; \ 63 struct ____ftrace_##name *__entry = NULL; \
54 \ 64 \
55 /* force compile-time check on F_printk() */ \ 65 /* force compile-time check on F_printk() */ \
56 printk(print); \ 66 printk(print); \
57} 67}
58 68
59#undef FTRACE_ENTRY_DUP 69#undef FTRACE_ENTRY_DUP
60#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \ 70#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print, filter) \
61 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) 71 FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \
72 filter)
62 73
63#include "trace_entries.h" 74#include "trace_entries.h"
64 75
@@ -67,7 +78,7 @@ static void __always_unused ____ftrace_check_##name(void) \
67 ret = trace_define_field(event_call, #type, #item, \ 78 ret = trace_define_field(event_call, #type, #item, \
68 offsetof(typeof(field), item), \ 79 offsetof(typeof(field), item), \
69 sizeof(field.item), \ 80 sizeof(field.item), \
70 is_signed_type(type), FILTER_OTHER); \ 81 is_signed_type(type), filter_type); \
71 if (ret) \ 82 if (ret) \
72 return ret; 83 return ret;
73 84
@@ -77,7 +88,7 @@ static void __always_unused ____ftrace_check_##name(void) \
77 offsetof(typeof(field), \ 88 offsetof(typeof(field), \
78 container.item), \ 89 container.item), \
79 sizeof(field.container.item), \ 90 sizeof(field.container.item), \
80 is_signed_type(type), FILTER_OTHER); \ 91 is_signed_type(type), filter_type); \
81 if (ret) \ 92 if (ret) \
82 return ret; 93 return ret;
83 94
@@ -91,7 +102,7 @@ static void __always_unused ____ftrace_check_##name(void) \
91 ret = trace_define_field(event_call, event_storage, #item, \ 102 ret = trace_define_field(event_call, event_storage, #item, \
92 offsetof(typeof(field), item), \ 103 offsetof(typeof(field), item), \
93 sizeof(field.item), \ 104 sizeof(field.item), \
94 is_signed_type(type), FILTER_OTHER); \ 105 is_signed_type(type), filter_type); \
95 mutex_unlock(&event_storage_mutex); \ 106 mutex_unlock(&event_storage_mutex); \
96 if (ret) \ 107 if (ret) \
97 return ret; \ 108 return ret; \
@@ -104,7 +115,7 @@ static void __always_unused ____ftrace_check_##name(void) \
104 offsetof(typeof(field), \ 115 offsetof(typeof(field), \
105 container.item), \ 116 container.item), \
106 sizeof(field.container.item), \ 117 sizeof(field.container.item), \
107 is_signed_type(type), FILTER_OTHER); \ 118 is_signed_type(type), filter_type); \
108 if (ret) \ 119 if (ret) \
109 return ret; 120 return ret;
110 121
@@ -112,17 +123,18 @@ static void __always_unused ____ftrace_check_##name(void) \
112#define __dynamic_array(type, item) \ 123#define __dynamic_array(type, item) \
113 ret = trace_define_field(event_call, #type, #item, \ 124 ret = trace_define_field(event_call, #type, #item, \
114 offsetof(typeof(field), item), \ 125 offsetof(typeof(field), item), \
115 0, is_signed_type(type), FILTER_OTHER);\ 126 0, is_signed_type(type), filter_type);\
116 if (ret) \ 127 if (ret) \
117 return ret; 128 return ret;
118 129
119#undef FTRACE_ENTRY 130#undef FTRACE_ENTRY
120#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ 131#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
121int \ 132int \
122ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ 133ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
123{ \ 134{ \
124 struct struct_name field; \ 135 struct struct_name field; \
125 int ret; \ 136 int ret; \
137 int filter_type = filter; \
126 \ 138 \
127 tstruct; \ 139 tstruct; \
128 \ 140 \
@@ -152,13 +164,15 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
152#undef F_printk 164#undef F_printk
153#define F_printk(fmt, args...) #fmt ", " __stringify(args) 165#define F_printk(fmt, args...) #fmt ", " __stringify(args)
154 166
155#undef FTRACE_ENTRY 167#undef FTRACE_ENTRY_REG
156#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ 168#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\
169 regfn) \
157 \ 170 \
158struct ftrace_event_class event_class_ftrace_##call = { \ 171struct ftrace_event_class event_class_ftrace_##call = { \
159 .system = __stringify(TRACE_SYSTEM), \ 172 .system = __stringify(TRACE_SYSTEM), \
160 .define_fields = ftrace_define_fields_##call, \ 173 .define_fields = ftrace_define_fields_##call, \
161 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ 174 .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
175 .reg = regfn, \
162}; \ 176}; \
163 \ 177 \
164struct ftrace_event_call __used event_##call = { \ 178struct ftrace_event_call __used event_##call = { \
@@ -170,4 +184,14 @@ struct ftrace_event_call __used event_##call = { \
170struct ftrace_event_call __used \ 184struct ftrace_event_call __used \
171__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; 185__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
172 186
187#undef FTRACE_ENTRY
188#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print, filter) \
189 FTRACE_ENTRY_REG(call, struct_name, etype, \
190 PARAMS(tstruct), PARAMS(print), filter, NULL)
191
192int ftrace_event_is_function(struct ftrace_event_call *call)
193{
194 return call == &event_function;
195}
196
173#include "trace_entries.h" 197#include "trace_entries.h"
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 00d527c945a4..580a05ec926b 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1892,7 +1892,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1892#endif /* CONFIG_PERF_EVENTS */ 1892#endif /* CONFIG_PERF_EVENTS */
1893 1893
1894static __kprobes 1894static __kprobes
1895int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) 1895int kprobe_register(struct ftrace_event_call *event,
1896 enum trace_reg type, void *data)
1896{ 1897{
1897 struct trace_probe *tp = (struct trace_probe *)event->data; 1898 struct trace_probe *tp = (struct trace_probe *)event->data;
1898 1899
@@ -1909,6 +1910,11 @@ int kprobe_register(struct ftrace_event_call *event, enum trace_reg type)
1909 case TRACE_REG_PERF_UNREGISTER: 1910 case TRACE_REG_PERF_UNREGISTER:
1910 disable_trace_probe(tp, TP_FLAG_PROFILE); 1911 disable_trace_probe(tp, TP_FLAG_PROFILE);
1911 return 0; 1912 return 0;
1913 case TRACE_REG_PERF_OPEN:
1914 case TRACE_REG_PERF_CLOSE:
1915 case TRACE_REG_PERF_ADD:
1916 case TRACE_REG_PERF_DEL:
1917 return 0;
1912#endif 1918#endif
1913 } 1919 }
1914 return 0; 1920 return 0;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 0d6ff3555942..859fae6b1825 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -264,7 +264,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
264 return ret; 264 return ret;
265} 265}
266 266
267int trace_seq_path(struct trace_seq *s, struct path *path) 267int trace_seq_path(struct trace_seq *s, const struct path *path)
268{ 268{
269 unsigned char *p; 269 unsigned char *p;
270 270
@@ -300,7 +300,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
300 unsigned long mask; 300 unsigned long mask;
301 const char *str; 301 const char *str;
302 const char *ret = p->buffer + p->len; 302 const char *ret = p->buffer + p->len;
303 int i; 303 int i, first = 1;
304 304
305 for (i = 0; flag_array[i].name && flags; i++) { 305 for (i = 0; flag_array[i].name && flags; i++) {
306 306
@@ -310,14 +310,16 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
310 310
311 str = flag_array[i].name; 311 str = flag_array[i].name;
312 flags &= ~mask; 312 flags &= ~mask;
313 if (p->len && delim) 313 if (!first && delim)
314 trace_seq_puts(p, delim); 314 trace_seq_puts(p, delim);
315 else
316 first = 0;
315 trace_seq_puts(p, str); 317 trace_seq_puts(p, str);
316 } 318 }
317 319
318 /* check for left over flags */ 320 /* check for left over flags */
319 if (flags) { 321 if (flags) {
320 if (p->len && delim) 322 if (!first && delim)
321 trace_seq_puts(p, delim); 323 trace_seq_puts(p, delim);
322 trace_seq_printf(p, "0x%lx", flags); 324 trace_seq_printf(p, "0x%lx", flags);
323 } 325 }
@@ -344,7 +346,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
344 break; 346 break;
345 } 347 }
346 348
347 if (!p->len) 349 if (ret == (const char *)(p->buffer + p->len))
348 trace_seq_printf(p, "0x%lx", val); 350 trace_seq_printf(p, "0x%lx", val);
349 351
350 trace_seq_putc(p, 0); 352 trace_seq_putc(p, 0);
@@ -370,7 +372,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
370 break; 372 break;
371 } 373 }
372 374
373 if (!p->len) 375 if (ret == (const char *)(p->buffer + p->len))
374 trace_seq_printf(p, "0x%llx", val); 376 trace_seq_printf(p, "0x%llx", val);
375 377
376 trace_seq_putc(p, 0); 378 trace_seq_putc(p, 0);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index cb654542c1a1..96fc73369099 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -17,9 +17,9 @@ static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
17static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); 17static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
18 18
19static int syscall_enter_register(struct ftrace_event_call *event, 19static int syscall_enter_register(struct ftrace_event_call *event,
20 enum trace_reg type); 20 enum trace_reg type, void *data);
21static int syscall_exit_register(struct ftrace_event_call *event, 21static int syscall_exit_register(struct ftrace_event_call *event,
22 enum trace_reg type); 22 enum trace_reg type, void *data);
23 23
24static int syscall_enter_define_fields(struct ftrace_event_call *call); 24static int syscall_enter_define_fields(struct ftrace_event_call *call);
25static int syscall_exit_define_fields(struct ftrace_event_call *call); 25static int syscall_exit_define_fields(struct ftrace_event_call *call);
@@ -468,8 +468,8 @@ int __init init_ftrace_syscalls(void)
468 unsigned long addr; 468 unsigned long addr;
469 int i; 469 int i;
470 470
471 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * 471 syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata),
472 NR_syscalls, GFP_KERNEL); 472 GFP_KERNEL);
473 if (!syscalls_metadata) { 473 if (!syscalls_metadata) {
474 WARN_ON(1); 474 WARN_ON(1);
475 return -ENOMEM; 475 return -ENOMEM;
@@ -649,7 +649,7 @@ void perf_sysexit_disable(struct ftrace_event_call *call)
649#endif /* CONFIG_PERF_EVENTS */ 649#endif /* CONFIG_PERF_EVENTS */
650 650
651static int syscall_enter_register(struct ftrace_event_call *event, 651static int syscall_enter_register(struct ftrace_event_call *event,
652 enum trace_reg type) 652 enum trace_reg type, void *data)
653{ 653{
654 switch (type) { 654 switch (type) {
655 case TRACE_REG_REGISTER: 655 case TRACE_REG_REGISTER:
@@ -664,13 +664,18 @@ static int syscall_enter_register(struct ftrace_event_call *event,
664 case TRACE_REG_PERF_UNREGISTER: 664 case TRACE_REG_PERF_UNREGISTER:
665 perf_sysenter_disable(event); 665 perf_sysenter_disable(event);
666 return 0; 666 return 0;
667 case TRACE_REG_PERF_OPEN:
668 case TRACE_REG_PERF_CLOSE:
669 case TRACE_REG_PERF_ADD:
670 case TRACE_REG_PERF_DEL:
671 return 0;
667#endif 672#endif
668 } 673 }
669 return 0; 674 return 0;
670} 675}
671 676
672static int syscall_exit_register(struct ftrace_event_call *event, 677static int syscall_exit_register(struct ftrace_event_call *event,
673 enum trace_reg type) 678 enum trace_reg type, void *data)
674{ 679{
675 switch (type) { 680 switch (type) {
676 case TRACE_REG_REGISTER: 681 case TRACE_REG_REGISTER:
@@ -685,6 +690,11 @@ static int syscall_exit_register(struct ftrace_event_call *event,
685 case TRACE_REG_PERF_UNREGISTER: 690 case TRACE_REG_PERF_UNREGISTER:
686 perf_sysexit_disable(event); 691 perf_sysexit_disable(event);
687 return 0; 692 return 0;
693 case TRACE_REG_PERF_OPEN:
694 case TRACE_REG_PERF_CLOSE:
695 case TRACE_REG_PERF_ADD:
696 case TRACE_REG_PERF_DEL:
697 return 0;
688#endif 698#endif
689 } 699 }
690 return 0; 700 return 0;
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index f1539decd99d..d96ba22dabfa 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -25,7 +25,7 @@
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/jump_label.h> 28#include <linux/static_key.h>
29 29
30extern struct tracepoint * const __start___tracepoints_ptrs[]; 30extern struct tracepoint * const __start___tracepoints_ptrs[];
31extern struct tracepoint * const __stop___tracepoints_ptrs[]; 31extern struct tracepoint * const __stop___tracepoints_ptrs[];
@@ -256,9 +256,9 @@ static void set_tracepoint(struct tracepoint_entry **entry,
256{ 256{
257 WARN_ON(strcmp((*entry)->name, elem->name) != 0); 257 WARN_ON(strcmp((*entry)->name, elem->name) != 0);
258 258
259 if (elem->regfunc && !jump_label_enabled(&elem->key) && active) 259 if (elem->regfunc && !static_key_enabled(&elem->key) && active)
260 elem->regfunc(); 260 elem->regfunc();
261 else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active) 261 else if (elem->unregfunc && static_key_enabled(&elem->key) && !active)
262 elem->unregfunc(); 262 elem->unregfunc();
263 263
264 /* 264 /*
@@ -269,10 +269,10 @@ static void set_tracepoint(struct tracepoint_entry **entry,
269 * is used. 269 * is used.
270 */ 270 */
271 rcu_assign_pointer(elem->funcs, (*entry)->funcs); 271 rcu_assign_pointer(elem->funcs, (*entry)->funcs);
272 if (active && !jump_label_enabled(&elem->key)) 272 if (active && !static_key_enabled(&elem->key))
273 jump_label_inc(&elem->key); 273 static_key_slow_inc(&elem->key);
274 else if (!active && jump_label_enabled(&elem->key)) 274 else if (!active && static_key_enabled(&elem->key))
275 jump_label_dec(&elem->key); 275 static_key_slow_dec(&elem->key);
276} 276}
277 277
278/* 278/*
@@ -283,11 +283,11 @@ static void set_tracepoint(struct tracepoint_entry **entry,
283 */ 283 */
284static void disable_tracepoint(struct tracepoint *elem) 284static void disable_tracepoint(struct tracepoint *elem)
285{ 285{
286 if (elem->unregfunc && jump_label_enabled(&elem->key)) 286 if (elem->unregfunc && static_key_enabled(&elem->key))
287 elem->unregfunc(); 287 elem->unregfunc();
288 288
289 if (jump_label_enabled(&elem->key)) 289 if (static_key_enabled(&elem->key))
290 jump_label_dec(&elem->key); 290 static_key_slow_dec(&elem->key);
291 rcu_assign_pointer(elem->funcs, NULL); 291 rcu_assign_pointer(elem->funcs, NULL);
292} 292}
293 293
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 1d7bca7f4f52..df30ee08bdd4 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -3,15 +3,14 @@
3 * 3 *
4 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. 4 * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
5 * 5 *
6 * this code detects hard lockups: incidents in where on a CPU 6 * Note: Most of this code is borrowed heavily from the original softlockup
7 * the kernel does not respond to anything except NMI. 7 * detector, so thanks to Ingo for the initial implementation.
8 * 8 * Some chunks also taken from the old x86-specific nmi watchdog code, thanks
9 * Note: Most of this code is borrowed heavily from softlockup.c,
10 * so thanks to Ingo for the initial implementation.
11 * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks
12 * to those contributors as well. 9 * to those contributors as well.
13 */ 10 */
14 11
12#define pr_fmt(fmt) "NMI watchdog: " fmt
13
15#include <linux/mm.h> 14#include <linux/mm.h>
16#include <linux/cpu.h> 15#include <linux/cpu.h>
17#include <linux/nmi.h> 16#include <linux/nmi.h>
@@ -117,9 +116,10 @@ static unsigned long get_sample_period(void)
117{ 116{
118 /* 117 /*
119 * convert watchdog_thresh from seconds to ns 118 * convert watchdog_thresh from seconds to ns
120 * the divide by 5 is to give hrtimer 5 chances to 119 * the divide by 5 is to give hrtimer several chances (two
121 * increment before the hardlockup detector generates 120 * or three with the current relation between the soft
122 * a warning 121 * and hard thresholds) to increment before the
122 * hardlockup detector generates a warning
123 */ 123 */
124 return get_softlockup_thresh() * (NSEC_PER_SEC / 5); 124 return get_softlockup_thresh() * (NSEC_PER_SEC / 5);
125} 125}
@@ -296,7 +296,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
296 if (__this_cpu_read(soft_watchdog_warn) == true) 296 if (__this_cpu_read(soft_watchdog_warn) == true)
297 return HRTIMER_RESTART; 297 return HRTIMER_RESTART;
298 298
299 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", 299 printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
300 smp_processor_id(), duration, 300 smp_processor_id(), duration,
301 current->comm, task_pid_nr(current)); 301 current->comm, task_pid_nr(current));
302 print_modules(); 302 print_modules();
@@ -321,11 +321,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
321 */ 321 */
322static int watchdog(void *unused) 322static int watchdog(void *unused)
323{ 323{
324 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 324 struct sched_param param = { .sched_priority = 0 };
325 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 325 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
326 326
327 sched_setscheduler(current, SCHED_FIFO, &param);
328
329 /* initialize timestamp */ 327 /* initialize timestamp */
330 __touch_watchdog(); 328 __touch_watchdog();
331 329
@@ -336,9 +334,11 @@ static int watchdog(void *unused)
336 334
337 set_current_state(TASK_INTERRUPTIBLE); 335 set_current_state(TASK_INTERRUPTIBLE);
338 /* 336 /*
339 * Run briefly once per second to reset the softlockup timestamp. 337 * Run briefly (kicked by the hrtimer callback function) once every
340 * If this gets delayed for more than 60 seconds then the 338 * get_sample_period() seconds (4 seconds by default) to reset the
341 * debug-printout triggers in watchdog_timer_fn(). 339 * softlockup timestamp. If this gets delayed for more than
340 * 2*watchdog_thresh seconds then the debug-printout triggers in
341 * watchdog_timer_fn().
342 */ 342 */
343 while (!kthread_should_stop()) { 343 while (!kthread_should_stop()) {
344 __touch_watchdog(); 344 __touch_watchdog();
@@ -349,8 +349,11 @@ static int watchdog(void *unused)
349 349
350 set_current_state(TASK_INTERRUPTIBLE); 350 set_current_state(TASK_INTERRUPTIBLE);
351 } 351 }
352 /*
353 * Drop the policy/priority elevation during thread exit to avoid a
354 * scheduling latency spike.
355 */
352 __set_current_state(TASK_RUNNING); 356 __set_current_state(TASK_RUNNING);
353 param.sched_priority = 0;
354 sched_setscheduler(current, SCHED_NORMAL, &param); 357 sched_setscheduler(current, SCHED_NORMAL, &param);
355 return 0; 358 return 0;
356} 359}
@@ -376,18 +379,20 @@ static int watchdog_nmi_enable(int cpu)
376 /* Try to register using hardware perf events */ 379 /* Try to register using hardware perf events */
377 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); 380 event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
378 if (!IS_ERR(event)) { 381 if (!IS_ERR(event)) {
379 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); 382 pr_info("enabled, takes one hw-pmu counter.\n");
380 goto out_save; 383 goto out_save;
381 } 384 }
382 385
383 386
384 /* vary the KERN level based on the returned errno */ 387 /* vary the KERN level based on the returned errno */
385 if (PTR_ERR(event) == -EOPNOTSUPP) 388 if (PTR_ERR(event) == -EOPNOTSUPP)
386 printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu); 389 pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
387 else if (PTR_ERR(event) == -ENOENT) 390 else if (PTR_ERR(event) == -ENOENT)
388 printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu); 391 pr_warning("disabled (cpu%i): hardware events not enabled\n",
392 cpu);
389 else 393 else
390 printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event)); 394 pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
395 cpu, PTR_ERR(event));
391 return PTR_ERR(event); 396 return PTR_ERR(event);
392 397
393 /* success path */ 398 /* success path */
@@ -439,9 +444,10 @@ static int watchdog_enable(int cpu)
439 444
440 /* create the watchdog thread */ 445 /* create the watchdog thread */
441 if (!p) { 446 if (!p) {
447 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
442 p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu); 448 p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu);
443 if (IS_ERR(p)) { 449 if (IS_ERR(p)) {
444 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); 450 pr_err("softlockup watchdog for %i failed\n", cpu);
445 if (!err) { 451 if (!err) {
446 /* if hardlockup hasn't already set this */ 452 /* if hardlockup hasn't already set this */
447 err = PTR_ERR(p); 453 err = PTR_ERR(p);
@@ -450,6 +456,7 @@ static int watchdog_enable(int cpu)
450 } 456 }
451 goto out; 457 goto out;
452 } 458 }
459 sched_setscheduler(p, SCHED_FIFO, &param);
453 kthread_bind(p, cpu); 460 kthread_bind(p, cpu);
454 per_cpu(watchdog_touch_ts, cpu) = 0; 461 per_cpu(watchdog_touch_ts, cpu) = 0;
455 per_cpu(softlockup_watchdog, cpu) = p; 462 per_cpu(softlockup_watchdog, cpu) = p;
@@ -496,7 +503,7 @@ static void watchdog_enable_all_cpus(void)
496 watchdog_enabled = 1; 503 watchdog_enabled = 1;
497 504
498 if (!watchdog_enabled) 505 if (!watchdog_enabled)
499 printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); 506 pr_err("failed to be enabled on some cpus\n");
500 507
501} 508}
502 509
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index bec7b5b53e03..5abf42f63c08 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -253,11 +253,13 @@ struct workqueue_struct *system_long_wq __read_mostly;
253struct workqueue_struct *system_nrt_wq __read_mostly; 253struct workqueue_struct *system_nrt_wq __read_mostly;
254struct workqueue_struct *system_unbound_wq __read_mostly; 254struct workqueue_struct *system_unbound_wq __read_mostly;
255struct workqueue_struct *system_freezable_wq __read_mostly; 255struct workqueue_struct *system_freezable_wq __read_mostly;
256struct workqueue_struct *system_nrt_freezable_wq __read_mostly;
256EXPORT_SYMBOL_GPL(system_wq); 257EXPORT_SYMBOL_GPL(system_wq);
257EXPORT_SYMBOL_GPL(system_long_wq); 258EXPORT_SYMBOL_GPL(system_long_wq);
258EXPORT_SYMBOL_GPL(system_nrt_wq); 259EXPORT_SYMBOL_GPL(system_nrt_wq);
259EXPORT_SYMBOL_GPL(system_unbound_wq); 260EXPORT_SYMBOL_GPL(system_unbound_wq);
260EXPORT_SYMBOL_GPL(system_freezable_wq); 261EXPORT_SYMBOL_GPL(system_freezable_wq);
262EXPORT_SYMBOL_GPL(system_nrt_freezable_wq);
261 263
262#define CREATE_TRACE_POINTS 264#define CREATE_TRACE_POINTS
263#include <trace/events/workqueue.h> 265#include <trace/events/workqueue.h>
@@ -474,13 +476,8 @@ static struct cpu_workqueue_struct *get_cwq(unsigned int cpu,
474 struct workqueue_struct *wq) 476 struct workqueue_struct *wq)
475{ 477{
476 if (!(wq->flags & WQ_UNBOUND)) { 478 if (!(wq->flags & WQ_UNBOUND)) {
477 if (likely(cpu < nr_cpu_ids)) { 479 if (likely(cpu < nr_cpu_ids))
478#ifdef CONFIG_SMP
479 return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); 480 return per_cpu_ptr(wq->cpu_wq.pcpu, cpu);
480#else
481 return wq->cpu_wq.single;
482#endif
483 }
484 } else if (likely(cpu == WORK_CPU_UNBOUND)) 481 } else if (likely(cpu == WORK_CPU_UNBOUND))
485 return wq->cpu_wq.single; 482 return wq->cpu_wq.single;
486 return NULL; 483 return NULL;
@@ -2897,13 +2894,8 @@ static int alloc_cwqs(struct workqueue_struct *wq)
2897 const size_t size = sizeof(struct cpu_workqueue_struct); 2894 const size_t size = sizeof(struct cpu_workqueue_struct);
2898 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, 2895 const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
2899 __alignof__(unsigned long long)); 2896 __alignof__(unsigned long long));
2900#ifdef CONFIG_SMP
2901 bool percpu = !(wq->flags & WQ_UNBOUND);
2902#else
2903 bool percpu = false;
2904#endif
2905 2897
2906 if (percpu) 2898 if (!(wq->flags & WQ_UNBOUND))
2907 wq->cpu_wq.pcpu = __alloc_percpu(size, align); 2899 wq->cpu_wq.pcpu = __alloc_percpu(size, align);
2908 else { 2900 else {
2909 void *ptr; 2901 void *ptr;
@@ -2927,13 +2919,7 @@ static int alloc_cwqs(struct workqueue_struct *wq)
2927 2919
2928static void free_cwqs(struct workqueue_struct *wq) 2920static void free_cwqs(struct workqueue_struct *wq)
2929{ 2921{
2930#ifdef CONFIG_SMP 2922 if (!(wq->flags & WQ_UNBOUND))
2931 bool percpu = !(wq->flags & WQ_UNBOUND);
2932#else
2933 bool percpu = false;
2934#endif
2935
2936 if (percpu)
2937 free_percpu(wq->cpu_wq.pcpu); 2923 free_percpu(wq->cpu_wq.pcpu);
2938 else if (wq->cpu_wq.single) { 2924 else if (wq->cpu_wq.single) {
2939 /* the pointer to free is stored right after the cwq */ 2925 /* the pointer to free is stored right after the cwq */
@@ -3833,8 +3819,11 @@ static int __init init_workqueues(void)
3833 WQ_UNBOUND_MAX_ACTIVE); 3819 WQ_UNBOUND_MAX_ACTIVE);
3834 system_freezable_wq = alloc_workqueue("events_freezable", 3820 system_freezable_wq = alloc_workqueue("events_freezable",
3835 WQ_FREEZABLE, 0); 3821 WQ_FREEZABLE, 0);
3822 system_nrt_freezable_wq = alloc_workqueue("events_nrt_freezable",
3823 WQ_NON_REENTRANT | WQ_FREEZABLE, 0);
3836 BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || 3824 BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq ||
3837 !system_unbound_wq || !system_freezable_wq); 3825 !system_unbound_wq || !system_freezable_wq ||
3826 !system_nrt_freezable_wq);
3838 return 0; 3827 return 0;
3839} 3828}
3840early_initcall(init_workqueues); 3829early_initcall(init_workqueues);
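The new system_nrt_freezable_wq is used like any other system workqueue; a minimal module sketch under that assumption follows, with the demo_* names purely illustrative.

/*
 * Hedged sketch, not part of the patch: queueing work on the new
 * non-reentrant, freezable system workqueue.
 */
#include <linux/module.h>
#include <linux/workqueue.h>

static void demo_fn(struct work_struct *work)
{
	pr_info("ran on system_nrt_freezable_wq\n");
}

static DECLARE_WORK(demo_work, demo_fn);

static int __init demo_init(void)
{
	/* non-reentrant: the same work item never runs concurrently on two
	 * CPUs; freezable: the queue is frozen across suspend/hibernation,
	 * so pending items wait until resume */
	queue_work(system_nrt_freezable_wq, &demo_work);
	return 0;
}

static void __exit demo_exit(void)
{
	cancel_work_sync(&demo_work);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");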